diff --git a/sys/arm/ti/ti_pruss.c b/sys/arm/ti/ti_pruss.c index b6895a8a9cef..9f8bcedbdb05 100644 --- a/sys/arm/ti/ti_pruss.c +++ b/sys/arm/ti/ti_pruss.c @@ -1,843 +1,843 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Rui Paulo * Copyright (c) 2017 Manuel Stuehn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG #define DPRINTF(fmt, ...) do { \ printf("%s: ", __func__); \ printf(fmt, __VA_ARGS__); \ } while (0) #else #define DPRINTF(fmt, ...) 
#endif static d_open_t ti_pruss_irq_open; static d_read_t ti_pruss_irq_read; static d_poll_t ti_pruss_irq_poll; static device_probe_t ti_pruss_probe; static device_attach_t ti_pruss_attach; static device_detach_t ti_pruss_detach; static void ti_pruss_intr(void *); static d_open_t ti_pruss_open; static d_mmap_t ti_pruss_mmap; static void ti_pruss_irq_kqread_detach(struct knote *); static int ti_pruss_irq_kqevent(struct knote *, long); static d_kqfilter_t ti_pruss_irq_kqfilter; static void ti_pruss_privdtor(void *data); #define TI_PRUSS_PRU_IRQS 2 #define TI_PRUSS_HOST_IRQS 8 #define TI_PRUSS_IRQS (TI_PRUSS_HOST_IRQS+TI_PRUSS_PRU_IRQS) #define TI_PRUSS_EVENTS 64 #define NOT_SET_STR "NONE" #define TI_TS_ARRAY 16 struct ctl { size_t cnt; size_t idx; }; struct ts_ring_buf { struct ctl ctl; uint64_t ts[TI_TS_ARRAY]; }; struct ti_pruss_irqsc { struct mtx sc_mtx; struct cdev *sc_pdev; struct selinfo sc_selinfo; int8_t channel; int8_t last; int8_t event; bool enable; struct ts_ring_buf tstamps; }; static struct cdevsw ti_pruss_cdevirq = { .d_version = D_VERSION, .d_name = "ti_pruss_irq", .d_open = ti_pruss_irq_open, .d_read = ti_pruss_irq_read, .d_poll = ti_pruss_irq_poll, .d_kqfilter = ti_pruss_irq_kqfilter, }; struct ti_pruss_softc { struct mtx sc_mtx; struct resource *sc_mem_res; struct resource *sc_irq_res[TI_PRUSS_HOST_IRQS]; void *sc_intr[TI_PRUSS_HOST_IRQS]; struct ti_pruss_irqsc sc_irq_devs[TI_PRUSS_IRQS]; bus_space_tag_t sc_bt; bus_space_handle_t sc_bh; struct cdev *sc_pdev; struct selinfo sc_selinfo; bool sc_glob_irqen; }; static struct cdevsw ti_pruss_cdevsw = { .d_version = D_VERSION, .d_name = "ti_pruss", .d_open = ti_pruss_open, .d_mmap = ti_pruss_mmap, }; static device_method_t ti_pruss_methods[] = { DEVMETHOD(device_probe, ti_pruss_probe), DEVMETHOD(device_attach, ti_pruss_attach), DEVMETHOD(device_detach, ti_pruss_detach), DEVMETHOD_END }; static driver_t ti_pruss_driver = { "ti_pruss", ti_pruss_methods, sizeof(struct ti_pruss_softc) }; DRIVER_MODULE(ti_pruss, simplebus, ti_pruss_driver, 0, 0); MODULE_DEPEND(ti_pruss, ti_sysc, 1, 1, 1); MODULE_DEPEND(ti_pruss, ti_prm, 1, 1, 1); static struct resource_spec ti_pruss_irq_spec[] = { { SYS_RES_IRQ, 0, RF_ACTIVE }, { SYS_RES_IRQ, 1, RF_ACTIVE }, { SYS_RES_IRQ, 2, RF_ACTIVE }, { SYS_RES_IRQ, 3, RF_ACTIVE }, { SYS_RES_IRQ, 4, RF_ACTIVE }, { SYS_RES_IRQ, 5, RF_ACTIVE }, { SYS_RES_IRQ, 6, RF_ACTIVE }, { SYS_RES_IRQ, 7, RF_ACTIVE }, { -1, 0, 0 } }; CTASSERT(TI_PRUSS_HOST_IRQS == nitems(ti_pruss_irq_spec) - 1); static int ti_pruss_irq_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct ctl* irqs; struct ti_pruss_irqsc *sc; sc = dev->si_drv1; irqs = malloc(sizeof(struct ctl), M_DEVBUF, M_WAITOK); irqs->cnt = sc->tstamps.ctl.cnt; irqs->idx = sc->tstamps.ctl.idx; return devfs_set_cdevpriv(irqs, ti_pruss_privdtor); } static void ti_pruss_privdtor(void *data) { free(data, M_DEVBUF); } static int ti_pruss_irq_poll(struct cdev *dev, int events, struct thread *td) { struct ctl* irqs; struct ti_pruss_irqsc *sc; sc = dev->si_drv1; devfs_get_cdevpriv((void**)&irqs); if (events & (POLLIN | POLLRDNORM)) { if (sc->tstamps.ctl.cnt != irqs->cnt) return events & (POLLIN | POLLRDNORM); else selrecord(td, &sc->sc_selinfo); } return 0; } static int ti_pruss_irq_read(struct cdev *cdev, struct uio *uio, int ioflag) { const size_t ts_len = sizeof(uint64_t); struct ti_pruss_irqsc* irq; struct ctl* priv; int error = 0; size_t idx; ssize_t level; irq = cdev->si_drv1; if (uio->uio_resid < ts_len) return (EINVAL); error = 
devfs_get_cdevpriv((void**)&priv); if (error) return (error); mtx_lock(&irq->sc_mtx); if (irq->tstamps.ctl.cnt - priv->cnt > TI_TS_ARRAY) { priv->cnt = irq->tstamps.ctl.cnt; priv->idx = irq->tstamps.ctl.idx; mtx_unlock(&irq->sc_mtx); return (ENXIO); } do { idx = priv->idx; level = irq->tstamps.ctl.idx - idx; if (level < 0) level += TI_TS_ARRAY; if (level == 0) { if (ioflag & O_NONBLOCK) { mtx_unlock(&irq->sc_mtx); return (EWOULDBLOCK); } error = msleep(irq, &irq->sc_mtx, PCATCH | PDROP, "pruirq", 0); if (error) return error; mtx_lock(&irq->sc_mtx); } }while(level == 0); mtx_unlock(&irq->sc_mtx); error = uiomove(&irq->tstamps.ts[idx], ts_len, uio); if (++idx == TI_TS_ARRAY) idx = 0; priv->idx = idx; atomic_add_32(&priv->cnt, 1); return (error); } static struct ti_pruss_irq_arg { int irq; struct ti_pruss_softc *sc; } ti_pruss_irq_args[TI_PRUSS_IRQS]; static __inline uint32_t ti_pruss_reg_read(struct ti_pruss_softc *sc, uint32_t reg) { return (bus_space_read_4(sc->sc_bt, sc->sc_bh, reg)); } static __inline void ti_pruss_reg_write(struct ti_pruss_softc *sc, uint32_t reg, uint32_t val) { bus_space_write_4(sc->sc_bt, sc->sc_bh, reg, val); } static __inline void ti_pruss_interrupts_clear(struct ti_pruss_softc *sc) { /* disable global interrupt */ ti_pruss_reg_write(sc, PRUSS_INTC_GER, 0 ); /* clear all events */ ti_pruss_reg_write(sc, PRUSS_INTC_SECR0, 0xFFFFFFFF); ti_pruss_reg_write(sc, PRUSS_INTC_SECR1, 0xFFFFFFFF); /* disable all host interrupts */ ti_pruss_reg_write(sc, PRUSS_INTC_HIER, 0); } static __inline int ti_pruss_interrupts_enable(struct ti_pruss_softc *sc, int8_t irq, bool enable) { if (enable && ((sc->sc_irq_devs[irq].channel == -1) || (sc->sc_irq_devs[irq].event== -1))) { device_printf( sc->sc_pdev->si_drv1, "Interrupt chain not fully configured, not possible to enable\n" ); return (EINVAL); } sc->sc_irq_devs[irq].enable = enable; if (sc->sc_irq_devs[irq].sc_pdev) { destroy_dev(sc->sc_irq_devs[irq].sc_pdev); sc->sc_irq_devs[irq].sc_pdev = NULL; } if (enable) { sc->sc_irq_devs[irq].sc_pdev = make_dev(&ti_pruss_cdevirq, 0, UID_ROOT, GID_WHEEL, 0600, "pruss%d.irq%d", device_get_unit(sc->sc_pdev->si_drv1), irq); sc->sc_irq_devs[irq].sc_pdev->si_drv1 = &sc->sc_irq_devs[irq]; sc->sc_irq_devs[irq].tstamps.ctl.idx = 0; } uint32_t reg = enable ? PRUSS_INTC_HIEISR : PRUSS_INTC_HIDISR; ti_pruss_reg_write(sc, reg, sc->sc_irq_devs[irq].channel); reg = enable ? 
PRUSS_INTC_EISR : PRUSS_INTC_EICR; ti_pruss_reg_write(sc, reg, sc->sc_irq_devs[irq].event ); return (0); } static __inline void ti_pruss_map_write(struct ti_pruss_softc *sc, uint32_t basereg, uint8_t index, uint8_t content) { const size_t regadr = basereg + index & ~0x03; const size_t bitpos = (index & 0x03) * 8; uint32_t rmw = ti_pruss_reg_read(sc, regadr); rmw = (rmw & ~( 0xF << bitpos)) | ( (content & 0xF) << bitpos); ti_pruss_reg_write(sc, regadr, rmw); } static int ti_pruss_event_map( SYSCTL_HANDLER_ARGS ) { struct ti_pruss_softc *sc; const int8_t irq = arg2; int err; char event[sizeof(NOT_SET_STR)]; sc = arg1; if(sc->sc_irq_devs[irq].event == -1) bcopy(NOT_SET_STR, event, sizeof(event)); else snprintf(event, sizeof(event), "%d", sc->sc_irq_devs[irq].event); err = sysctl_handle_string(oidp, event, sizeof(event), req); if(err != 0) return (err); if (req->newptr) { // write event if (strcmp(NOT_SET_STR, event) == 0) { ti_pruss_interrupts_enable(sc, irq, false); sc->sc_irq_devs[irq].event = -1; } else { if (sc->sc_irq_devs[irq].channel == -1) { device_printf( sc->sc_pdev->si_drv1, "corresponding channel not configured\n"); return (ENXIO); } const int8_t channelnr = sc->sc_irq_devs[irq].channel; const int8_t eventnr = strtol( event, NULL, 10 ); // TODO: check if strol is valid if (eventnr > TI_PRUSS_EVENTS || eventnr < 0) { device_printf( sc->sc_pdev->si_drv1, "Event number %d not valid (0 - %d)", channelnr, TI_PRUSS_EVENTS -1); return (EINVAL); } sc->sc_irq_devs[irq].channel = channelnr; sc->sc_irq_devs[irq].event = eventnr; // event[nr] <= channel ti_pruss_map_write(sc, PRUSS_INTC_CMR_BASE, eventnr, channelnr); } } return (err); } static int ti_pruss_channel_map(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; char channel[sizeof(NOT_SET_STR)]; const int8_t irq = arg2; sc = arg1; if (sc->sc_irq_devs[irq].channel == -1) bcopy(NOT_SET_STR, channel, sizeof(channel)); else snprintf(channel, sizeof(channel), "%d", sc->sc_irq_devs[irq].channel); err = sysctl_handle_string(oidp, channel, sizeof(channel), req); if (err != 0) return (err); if (req->newptr) { // write event if (strcmp(NOT_SET_STR, channel) == 0) { ti_pruss_interrupts_enable(sc, irq, false); ti_pruss_reg_write(sc, PRUSS_INTC_HIDISR, sc->sc_irq_devs[irq].channel); sc->sc_irq_devs[irq].channel = -1; } else { const int8_t channelnr = strtol(channel, NULL, 10); // TODO: check if strol is valid if (channelnr > TI_PRUSS_IRQS || channelnr < 0) { device_printf(sc->sc_pdev->si_drv1, "Channel number %d not valid (0 - %d)", channelnr, TI_PRUSS_IRQS-1); return (EINVAL); } sc->sc_irq_devs[irq].channel = channelnr; sc->sc_irq_devs[irq].last = -1; // channel[nr] <= irqnr ti_pruss_map_write(sc, PRUSS_INTC_HMR_BASE, irq, channelnr); } } return (err); } static int ti_pruss_interrupt_enable(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; bool irqenable; const int8_t irq = arg2; sc = arg1; irqenable = sc->sc_irq_devs[arg2].enable; err = sysctl_handle_bool(oidp, &irqenable, arg2, req); if (err != 0) return (err); if (req->newptr) // write enable return ti_pruss_interrupts_enable(sc, irq, irqenable); return (err); } static int ti_pruss_global_interrupt_enable(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; bool glob_irqen; sc = arg1; glob_irqen = sc->sc_glob_irqen; err = sysctl_handle_bool(oidp, &glob_irqen, arg2, req); if (err != 0) return (err); if (req->newptr) { sc->sc_glob_irqen = glob_irqen; ti_pruss_reg_write(sc, PRUSS_INTC_GER, glob_irqen); } return (err); } static int ti_pruss_probe(device_t dev) { if 
(!ofw_bus_status_okay(dev)) return (ENXIO); if (ofw_bus_is_compatible(dev, "ti,pruss-v1") || ofw_bus_is_compatible(dev, "ti,pruss-v2")) { device_set_desc(dev, "TI Programmable Realtime Unit Subsystem"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } static int ti_pruss_attach(device_t dev) { struct ti_pruss_softc *sc; int rid, i, err, ncells; phandle_t node; clk_t l3_gclk, pruss_ocp_gclk; phandle_t ti_prm_ref, *cells; device_t ti_prm_dev; rid = 0; sc = device_get_softc(dev); node = ofw_bus_get_node(device_get_parent(dev)); if (node <= 0) { device_printf(dev, "Cant get ofw node\n"); return (ENXIO); } /* * Follow activate pattern from sys/arm/ti/am335x/am335x_prcm.c * by Damjan Marion */ /* Set MODULEMODE to ENABLE(2) */ /* Wait for MODULEMODE to become ENABLE(2) */ if (ti_sysc_clock_enable(device_get_parent(dev)) != 0) { device_printf(dev, "Could not enable PRUSS clock\n"); return (ENXIO); } /* Set CLKTRCTRL to SW_WKUP(2) */ /* Wait for the 200 MHz OCP clock to become active */ /* Wait for the 200 MHz IEP clock to become active */ /* Wait for the 192 MHz UART clock to become active */ /* * At the moment there is no reference to CM_PER_PRU_ICSS_CLKSTCTRL@140 * in the devicetree. The register reset state are SW_WKUP(2) as default * so at the moment ignore setting this register. */ /* Select L3F as OCP clock */ /* Get the clock and set the parent */ err = clk_get_by_name(dev, "l3_gclk", &l3_gclk); if (err) { device_printf(dev, "Cant get l3_gclk err %d\n", err); return (ENXIO); } err = clk_get_by_name(dev, "pruss_ocp_gclk@530", &pruss_ocp_gclk); if (err) { device_printf(dev, "Cant get pruss_ocp_gclk@530 err %d\n", err); return (ENXIO); } err = clk_set_parent_by_clk(pruss_ocp_gclk, l3_gclk); if (err) { device_printf(dev, "Cant set pruss_ocp_gclk parent to l3_gclk err %d\n", err); return (ENXIO); } /* Clear the RESET bit */ /* Find the ti_prm */ /* #reset-cells should not been used in this way but... 
*/ err = ofw_bus_parse_xref_list_alloc(node, "resets", "#reset-cells", 0, &ti_prm_ref, &ncells, &cells); OF_prop_free(cells); if (err) { device_printf(dev, "Cant fetch \"resets\" reference %x\n", err); return (ENXIO); } ti_prm_dev = OF_device_from_xref(ti_prm_ref); if (ti_prm_dev == NULL) { device_printf(dev, "Cant get device from \"resets\"\n"); return (ENXIO); } err = ti_prm_reset(ti_prm_dev); if (err) { device_printf(dev, "ti_prm_reset failed %d\n", err); return (ENXIO); } /* End of clock activation */ mtx_init(&sc->sc_mtx, "TI PRUSS", NULL, MTX_DEF); sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->sc_mem_res == NULL) { device_printf(dev, "could not allocate memory resource\n"); return (ENXIO); } struct sysctl_ctx_list *clist = device_get_sysctl_ctx(dev); if (!clist) return (EINVAL); struct sysctl_oid *poid; poid = device_get_sysctl_tree( dev ); if (!poid) return (EINVAL); sc->sc_glob_irqen = false; struct sysctl_oid *irq_root = SYSCTL_ADD_NODE(clist, SYSCTL_CHILDREN(poid), OID_AUTO, "irq", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "PRUSS Host Interrupts"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(poid), OID_AUTO, "global_interrupt_enable", CTLFLAG_RW | CTLTYPE_U8 | CTLFLAG_NEEDGIANT, sc, 0, ti_pruss_global_interrupt_enable, "CU", "Global interrupt enable"); sc->sc_bt = rman_get_bustag(sc->sc_mem_res); sc->sc_bh = rman_get_bushandle(sc->sc_mem_res); if (bus_alloc_resources(dev, ti_pruss_irq_spec, sc->sc_irq_res) != 0) { device_printf(dev, "could not allocate interrupt resource\n"); ti_pruss_detach(dev); return (ENXIO); } ti_pruss_interrupts_clear(sc); for (i = 0; i < TI_PRUSS_IRQS; i++) { char name[8]; snprintf(name, sizeof(name), "%d", i); struct sysctl_oid *irq_nodes = SYSCTL_ADD_NODE(clist, SYSCTL_CHILDREN(irq_root), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "PRUSS Interrupts"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "channel", CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_channel_map, "A", "Channel attached to this irq"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "event", CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_event_map, "A", "Event attached to this irq"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "enable", CTLFLAG_RW | CTLTYPE_U8 | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_interrupt_enable, "CU", "Enable/Disable interrupt"); sc->sc_irq_devs[i].event = -1; sc->sc_irq_devs[i].channel = -1; sc->sc_irq_devs[i].tstamps.ctl.idx = 0; if (i < TI_PRUSS_HOST_IRQS) { ti_pruss_irq_args[i].irq = i; ti_pruss_irq_args[i].sc = sc; if (bus_setup_intr(dev, sc->sc_irq_res[i], INTR_MPSAFE | INTR_TYPE_MISC, NULL, ti_pruss_intr, &ti_pruss_irq_args[i], &sc->sc_intr[i]) != 0) { device_printf(dev, "unable to setup the interrupt handler\n"); ti_pruss_detach(dev); return (ENXIO); } mtx_init(&sc->sc_irq_devs[i].sc_mtx, "TI PRUSS IRQ", NULL, MTX_DEF); knlist_init_mtx(&sc->sc_irq_devs[i].sc_selinfo.si_note, &sc->sc_irq_devs[i].sc_mtx); } } if (ti_pruss_reg_read(sc, PRUSS_AM33XX_INTC) == PRUSS_AM33XX_REV) device_printf(dev, "AM33xx PRU-ICSS\n"); sc->sc_pdev = make_dev(&ti_pruss_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "pruss%d", device_get_unit(dev)); sc->sc_pdev->si_drv1 = dev; /* Acc. to datasheet always write 1 to polarity registers */ ti_pruss_reg_write(sc, PRUSS_INTC_SIPR0, 0xFFFFFFFF); ti_pruss_reg_write(sc, PRUSS_INTC_SIPR1, 0xFFFFFFFF); /* Acc. 
to datasheet always write 0 to event type registers */ ti_pruss_reg_write(sc, PRUSS_INTC_SITR0, 0); ti_pruss_reg_write(sc, PRUSS_INTC_SITR1, 0); return (0); } static int ti_pruss_detach(device_t dev) { struct ti_pruss_softc *sc = device_get_softc(dev); ti_pruss_interrupts_clear(sc); for (int i = 0; i < TI_PRUSS_HOST_IRQS; i++) { ti_pruss_interrupts_enable( sc, i, false ); if (sc->sc_intr[i]) bus_teardown_intr(dev, sc->sc_irq_res[i], sc->sc_intr[i]); if (sc->sc_irq_res[i]) bus_release_resource(dev, SYS_RES_IRQ, rman_get_rid(sc->sc_irq_res[i]), sc->sc_irq_res[i]); knlist_clear(&sc->sc_irq_devs[i].sc_selinfo.si_note, 0); mtx_lock(&sc->sc_irq_devs[i].sc_mtx); if (!knlist_empty(&sc->sc_irq_devs[i].sc_selinfo.si_note)) printf("IRQ %d KQueue not empty!\n", i ); mtx_unlock(&sc->sc_irq_devs[i].sc_mtx); knlist_destroy(&sc->sc_irq_devs[i].sc_selinfo.si_note); mtx_destroy(&sc->sc_irq_devs[i].sc_mtx); } mtx_destroy(&sc->sc_mtx); if (sc->sc_mem_res) bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(sc->sc_mem_res), sc->sc_mem_res); if (sc->sc_pdev) destroy_dev(sc->sc_pdev); return (0); } static void ti_pruss_intr(void *arg) { int val; struct ti_pruss_irq_arg *iap = arg; struct ti_pruss_softc *sc = iap->sc; /* * Interrupts pr1_host_intr[0:7] are mapped to * Host-2 to Host-9 of PRU-ICSS IRQ-controller. */ const int pru_int = iap->irq + TI_PRUSS_PRU_IRQS; const int pru_int_mask = (1 << pru_int); const int pru_channel = sc->sc_irq_devs[pru_int].channel; const int pru_event = sc->sc_irq_devs[pru_channel].event; val = ti_pruss_reg_read(sc, PRUSS_INTC_HIER); if (!(val & pru_int_mask)) return; ti_pruss_reg_write(sc, PRUSS_INTC_HIDISR, pru_int); ti_pruss_reg_write(sc, PRUSS_INTC_SICR, pru_event); ti_pruss_reg_write(sc, PRUSS_INTC_HIEISR, pru_int); struct ti_pruss_irqsc* irq = &sc->sc_irq_devs[pru_channel]; size_t wr = irq->tstamps.ctl.idx; struct timespec ts; nanouptime(&ts); irq->tstamps.ts[wr] = ts.tv_sec * 1000000000 + ts.tv_nsec; if (++wr == TI_TS_ARRAY) wr = 0; atomic_add_32(&irq->tstamps.ctl.cnt, 1); irq->tstamps.ctl.idx = wr; KNOTE_UNLOCKED(&irq->sc_selinfo.si_note, pru_int); wakeup(irq); selwakeup(&irq->sc_selinfo); } static int ti_pruss_open(struct cdev *cdev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused) { return (0); } static int ti_pruss_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { device_t dev = cdev->si_drv1; struct ti_pruss_softc *sc = device_get_softc(dev); if (offset >= rman_get_size(sc->sc_mem_res)) return (ENOSPC); *paddr = rman_get_start(sc->sc_mem_res) + offset; *memattr = VM_MEMATTR_UNCACHEABLE; return (0); } -static struct filterops ti_pruss_kq_read = { +static const struct filterops ti_pruss_kq_read = { .f_isfd = 1, .f_detach = ti_pruss_irq_kqread_detach, .f_event = ti_pruss_irq_kqevent, }; static void ti_pruss_irq_kqread_detach(struct knote *kn) { struct ti_pruss_irqsc *sc = kn->kn_hook; knlist_remove(&sc->sc_selinfo.si_note, kn, 0); } static int ti_pruss_irq_kqevent(struct knote *kn, long hint) { struct ti_pruss_irqsc* irq_sc; int notify; irq_sc = kn->kn_hook; if (hint > 0) kn->kn_data = hint - 2; if (hint > 0 || irq_sc->last > 0) notify = 1; else notify = 0; irq_sc->last = hint; return (notify); } static int ti_pruss_irq_kqfilter(struct cdev *cdev, struct knote *kn) { struct ti_pruss_irqsc *sc = cdev->si_drv1; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = sc; kn->kn_fop = &ti_pruss_kq_read; knlist_add(&sc->sc_selinfo.si_note, kn, 0); break; default: return (EINVAL); } return (0); } 
diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index ff48bed30e68..72035e1e0d0e 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -1,2274 +1,2274 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997, 1998, 2000 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef enum { PASS_FLAG_OPEN = 0x01, PASS_FLAG_LOCKED = 0x02, PASS_FLAG_INVALID = 0x04, PASS_FLAG_INITIAL_PHYSPATH = 0x08, PASS_FLAG_ZONE_INPROG = 0x10, PASS_FLAG_ZONE_VALID = 0x20, PASS_FLAG_UNMAPPED_CAPABLE = 0x40, PASS_FLAG_ABANDONED_REF_SET = 0x80 } pass_flags; typedef enum { PASS_STATE_NORMAL } pass_state; typedef enum { PASS_CCB_BUFFER_IO, PASS_CCB_QUEUED_IO } pass_ccb_types; #define ccb_type ppriv_field0 #define ccb_ioreq ppriv_ptr1 /* * The maximum number of memory segments we preallocate. 
*/ #define PASS_MAX_SEGS 16 typedef enum { PASS_IO_NONE = 0x00, PASS_IO_USER_SEG_MALLOC = 0x01, PASS_IO_KERN_SEG_MALLOC = 0x02, PASS_IO_ABANDONED = 0x04 } pass_io_flags; struct pass_io_req { union ccb ccb; union ccb *alloced_ccb; union ccb *user_ccb_ptr; camq_entry user_periph_links; ccb_ppriv_area user_periph_priv; struct cam_periph_map_info mapinfo; pass_io_flags flags; ccb_flags data_flags; int num_user_segs; bus_dma_segment_t user_segs[PASS_MAX_SEGS]; int num_kern_segs; bus_dma_segment_t kern_segs[PASS_MAX_SEGS]; bus_dma_segment_t *user_segptr; bus_dma_segment_t *kern_segptr; int num_bufs; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint8_t *user_bufs[CAM_PERIPH_MAXMAPS]; uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS]; struct bintime start_time; TAILQ_ENTRY(pass_io_req) links; }; struct pass_softc { pass_state state; pass_flags flags; uint8_t pd_type; int open_count; u_int maxio; struct devstat *device_stats; struct cdev *dev; struct cdev *alias_dev; struct task add_physpath_task; struct task shutdown_kqueue_task; struct selinfo read_select; TAILQ_HEAD(, pass_io_req) incoming_queue; TAILQ_HEAD(, pass_io_req) active_queue; TAILQ_HEAD(, pass_io_req) abandoned_queue; TAILQ_HEAD(, pass_io_req) done_queue; struct cam_periph *periph; char zone_name[12]; char io_zone_name[12]; uma_zone_t pass_zone; uma_zone_t pass_io_zone; size_t io_zone_size; }; static d_open_t passopen; static d_close_t passclose; static d_ioctl_t passioctl; static d_ioctl_t passdoioctl; static d_poll_t passpoll; static d_kqfilter_t passkqfilter; static void passreadfiltdetach(struct knote *kn); static int passreadfilt(struct knote *kn, long hint); static periph_init_t passinit; static periph_ctor_t passregister; static periph_oninv_t passoninvalidate; static periph_dtor_t passcleanup; static periph_start_t passstart; static void pass_shutdown_kqueue(void *context, int pending); static void pass_add_physpath(void *context, int pending); static void passasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); static void passdone(struct cam_periph *periph, union ccb *done_ccb); static int passcreatezone(struct cam_periph *periph); static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req); static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction); static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req); static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req); static int passerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags); static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb); static void passflags(union ccb *ccb, uint32_t *cam_flags, uint32_t *sense_flags); static struct periph_driver passdriver = { passinit, "pass", TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(pass, passdriver); static struct cdevsw pass_cdevsw = { .d_version = D_VERSION, .d_flags = D_TRACKCLOSE, .d_open = passopen, .d_close = passclose, .d_ioctl = passioctl, .d_poll = passpoll, .d_kqfilter = passkqfilter, .d_name = "pass", }; -static struct filterops passread_filtops = { +static const struct filterops passread_filtops = { .f_isfd = 1, .f_detach = passreadfiltdetach, .f_event = passreadfilt }; static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); static void passinit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". 
*/ status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("pass: Failed to attach master async callback " "due to status 0x%x!\n", status); } } static void passrejectios(struct cam_periph *periph) { struct pass_io_req *io_req, *io_req2; struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * The user can no longer get status for I/O on the done queue, so * clean up all outstanding I/O on the done queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * The underlying device is gone, so we can't issue these I/Os. * The devfs node has been shut down, so we can't return status to * the user. Free any I/O left on the incoming queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * Normally we would put I/Os on the abandoned queue and acquire a * reference when we saw the final close. But, the device went * away and devfs may have moved everything off to deadfs by the * time the I/O done callback is called; as a result, we won't see * any more closes. So, if we have any active I/Os, we need to put * them on the abandoned queue. When the abandoned queue is empty, * we'll release the remaining reference (see below) to the peripheral. */ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } /* * If we put any I/O on the abandoned queue, acquire a reference. */ if ((!TAILQ_EMPTY(&softc->abandoned_queue)) && ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } } static void passdevgonecb(void *arg) { struct cam_periph *periph; struct mtx *mtx; struct pass_softc *softc; int i; periph = (struct cam_periph *)arg; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = (struct pass_softc *)periph->softc; KASSERT(softc->open_count >= 0, ("Negative open count %d", softc->open_count)); /* * When we get this callback, we will get no more close calls from * devfs. So if we have any dangling opens, we need to release the * reference held for that particular context. */ for (i = 0; i < softc->open_count; i++) cam_periph_release_locked(periph); softc->open_count = 0; /* * Release the reference held for the device node, it is gone now. * Accordingly, inform all queued I/Os of their fate. */ cam_periph_release_locked(periph); passrejectios(periph); /* * We reference the SIM lock directly here, instead of using * cam_periph_unlock(). The reason is that the final call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. */ mtx_unlock(mtx); /* * We have to remove our kqueue context from a thread because it * may sleep. It would be nice if we could get a callback from * kqueue when it is done cleaning up resources. */ taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task); } static void passoninvalidate(struct cam_periph *periph) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * De-register any async callbacks. 
*/ xpt_register_async(0, passasync, periph, periph->path); softc->flags |= PASS_FLAG_INVALID; /* * Tell devfs this device has gone away, and ask for a callback * when it has cleaned up its state. */ destroy_dev_sched_cb(softc->dev, passdevgonecb, periph); } static void passcleanup(struct cam_periph *periph) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); KASSERT(TAILQ_EMPTY(&softc->active_queue), ("%s called when there are commands on the active queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->abandoned_queue), ("%s called when there are commands on the abandoned queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->incoming_queue), ("%s called when there are commands on the incoming queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->done_queue), ("%s called when there are commands on the done queue!\n", __func__)); devstat_remove_entry(softc->device_stats); cam_periph_unlock(periph); /* * We call taskqueue_drain() for the physpath task to make sure it * is complete. We drop the lock because this can potentially * sleep. XXX KDM that is bad. Need a way to get a callback when * a taskqueue is drained. * * Note that we don't drain the kqueue shutdown task queue. This * is because we hold a reference on the periph for kqueue, and * release that reference from the kqueue shutdown task queue. So * we cannot come into this routine unless we've released that * reference. Also, because that could be the last reference, we * could be called from the cam_periph_release() call in * pass_shutdown_kqueue(). In that case, the taskqueue_drain() * would deadlock. It would be preferable if we had a way to * get a callback when a taskqueue is done. */ taskqueue_drain(taskqueue_thread, &softc->add_physpath_task); /* * It should be safe to destroy the zones from here, because all * of the references to this peripheral have been freed, and all * I/O has been terminated and freed. We check the zones for NULL * because they may not have been allocated yet if the device went * away before any asynchronous I/O has been issued. */ if (softc->pass_zone != NULL) uma_zdestroy(softc->pass_zone); if (softc->pass_io_zone != NULL) uma_zdestroy(softc->pass_io_zone); cam_periph_lock(periph); free(softc, M_DEVBUF); } static void pass_shutdown_kqueue(void *context, int pending) { struct cam_periph *periph; struct pass_softc *softc; periph = context; softc = periph->softc; knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0); knlist_destroy(&softc->read_select.si_note); /* * Release the reference we held for kqueue. */ cam_periph_release(periph); } static void pass_add_physpath(void *context, int pending) { struct cam_periph *periph; struct pass_softc *softc; struct mtx *mtx; char *physpath; /* * If we have one, create a devfs alias for our * physical path. */ periph = context; softc = periph->softc; physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK); mtx = cam_periph_mtx(periph); mtx_lock(mtx); if (periph->flags & CAM_PERIPH_INVALID) goto out; if (xpt_getattr(physpath, MAXPATHLEN, "GEOM::physpath", periph->path) == 0 && strlen(physpath) != 0) { mtx_unlock(mtx); make_dev_physpath_alias(MAKEDEV_WAITOK | MAKEDEV_CHECKNAME, &softc->alias_dev, softc->dev, softc->alias_dev, physpath); mtx_lock(mtx); } out: /* * Now that we've made our alias, we no longer have to have a * reference to the device. 
*/ if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0) softc->flags |= PASS_FLAG_INITIAL_PHYSPATH; /* * We always acquire a reference to the periph before queueing this * task queue function, so it won't go away before we run. */ while (pending-- > 0) cam_periph_release_locked(periph); mtx_unlock(mtx); free(physpath, M_DEVBUF); } static void passasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(passregister, passoninvalidate, passcleanup, passstart, "pass", CAM_PERIPH_BIO, path, passasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) { const struct cam_status_entry *entry; entry = cam_fetch_status_entry(status); printf("passasync: Unable to attach new device " "due to status %#x: %s\n", status, entry ? entry->status_text : "Unknown"); } break; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * Acquire a reference to the periph before we * start the taskqueue, so that we don't run into * a situation where the periph goes away before * the task queue has a chance to run. */ if (cam_periph_acquire(periph) != 0) break; taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); } break; } default: cam_periph_async(periph, code, path, arg); break; } } static cam_status passregister(struct cam_periph *periph, void *arg) { struct pass_softc *softc; struct ccb_getdev *cgd; struct ccb_pathinq cpi; struct make_dev_args args; int error, no_tags; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("%s: no getdev CCB, can't register device\n", __func__); return(CAM_REQ_CMP_ERR); } softc = (struct pass_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT); if (softc == NULL) { printf("%s: Unable to probe new device. " "Unable to allocate softc\n", __func__); return(CAM_REQ_CMP_ERR); } bzero(softc, sizeof(*softc)); softc->state = PASS_STATE_NORMAL; if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI) softc->pd_type = SID_TYPE(&cgd->inq_data); else if (cgd->protocol == PROTO_SATAPM) softc->pd_type = T_ENCLOSURE; else softc->pd_type = T_DIRECT; periph->softc = softc; softc->periph = periph; TAILQ_INIT(&softc->incoming_queue); TAILQ_INIT(&softc->active_queue); TAILQ_INIT(&softc->abandoned_queue); TAILQ_INIT(&softc->done_queue); snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d", periph->periph_name, periph->unit_number); snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO", periph->periph_name, periph->unit_number); softc->io_zone_size = maxphys; knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph)); xpt_path_inq(&cpi, periph->path); if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > maxphys) softc->maxio = maxphys; /* for safety */ else softc->maxio = cpi.maxio; /* real value */ if (cpi.hba_misc & PIM_UNMAPPED) softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE; /* * We pass in 0 for a blocksize, since we don't * know what the blocksize of this device is, if * it even has a blocksize. 
*/ cam_periph_unlock(periph); no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0; softc->device_stats = devstat_new_entry("pass", periph->unit_number, 0, DEVSTAT_NO_BLOCKSIZE | (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0), softc->pd_type | XPORT_DEVSTAT_TYPE(cpi.transport) | DEVSTAT_TYPE_PASS, DEVSTAT_PRIORITY_PASS); /* * Initialize the taskqueue handler for shutting down kqueue. */ TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0, pass_shutdown_kqueue, periph); /* * Acquire a reference to the periph that we can release once we've * cleaned up the kqueue. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } /* * Acquire a reference to the periph before we create the devfs * instance for it. We'll release this reference once the devfs * instance has been freed. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } /* Register the device */ make_dev_args_init(&args); args.mda_devsw = &pass_cdevsw; args.mda_unit = periph->unit_number; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0600; args.mda_si_drv1 = periph; args.mda_flags = MAKEDEV_NOWAIT; error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name, periph->unit_number); if (error != 0) { cam_periph_lock(periph); cam_periph_release_locked(periph); return (CAM_REQ_CMP_ERR); } /* * Hold a reference to the periph before we create the physical * path alias so it can't go away. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } cam_periph_lock(periph); TASK_INIT(&softc->add_physpath_task, /*priority*/0, pass_add_physpath, periph); /* * See if physical path information is already available. */ taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); /* * Add an async callback so that we get notified if * this device goes away or its physical path * (stored in the advanced info data of the EDT) has * changed. */ xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED, passasync, periph, periph->path); if (bootverbose) xpt_announce_periph(periph, NULL); return(CAM_REQ_CMP); } static int passopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int error; periph = (struct cam_periph *)dev->si_drv1; if (cam_periph_acquire(periph) != 0) return (ENXIO); cam_periph_lock(periph); softc = (struct pass_softc *)periph->softc; if (softc->flags & PASS_FLAG_INVALID) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(ENXIO); } /* * Don't allow access when we're running at a high securelevel. */ error = securelevel_gt(td->td_ucred, 1); if (error) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(error); } /* * Only allow read-write access. */ if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(EPERM); } /* * We don't allow nonblocking access. 
*/ if ((flags & O_NONBLOCK) != 0) { xpt_print(periph->path, "can't do nonblocking access\n"); cam_periph_release_locked(periph); cam_periph_unlock(periph); return(EINVAL); } softc->open_count++; cam_periph_unlock(periph); return (error); } static int passclose(struct cdev *dev, int flag, int fmt, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; struct mtx *mtx; periph = (struct cam_periph *)dev->si_drv1; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = periph->softc; softc->open_count--; if (softc->open_count == 0) { struct pass_io_req *io_req, *io_req2; TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * If there are any active I/Os, we need to forcibly acquire a * reference to the peripheral so that we don't go away * before they complete. We'll release the reference when * the abandoned queue is empty. */ io_req = TAILQ_FIRST(&softc->active_queue); if ((io_req != NULL) && (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } /* * Since the I/O in the active queue is not under our * control, just set a flag so that we can clean it up when * it completes and put it on the abandoned queue. This * will prevent our sending spurious completions in the * event that the device is opened again before these I/Os * complete. */ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } } cam_periph_release_locked(periph); /* * We reference the lock directly here, instead of using * cam_periph_unlock(). The reason is that the call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. * * cam_periph_release() avoids this problem using the same method, * but we're manually acquiring and dropping the lock here to * protect the open count and avoid another lock acquisition and * release. */ mtx_unlock(mtx); return (0); } static void passstart(struct cam_periph *periph, union ccb *start_ccb) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; switch (softc->state) { case PASS_STATE_NORMAL: { struct pass_io_req *io_req; /* * Check for any queued I/O requests that require an * allocated slot. */ io_req = TAILQ_FIRST(&softc->incoming_queue); if (io_req == NULL) { xpt_release_ccb(start_ccb); break; } TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); /* * Merge the user's CCB into the allocated CCB. */ xpt_merge_ccb(start_ccb, &io_req->ccb); start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO; start_ccb->ccb_h.ccb_ioreq = io_req; start_ccb->ccb_h.cbfcnp = passdone; io_req->alloced_ccb = start_ccb; binuptime(&io_req->start_time); devstat_start_transaction(softc->device_stats, &io_req->start_time); xpt_action(start_ccb); /* * If we have any more I/O waiting, schedule ourselves again. 
*/ if (!TAILQ_EMPTY(&softc->incoming_queue)) xpt_schedule(periph, CAM_PRIORITY_NORMAL); break; } default: break; } } static void passdone(struct cam_periph *periph, union ccb *done_ccb) { struct pass_softc *softc; struct ccb_scsiio *csio; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); csio = &done_ccb->csio; switch (csio->ccb_h.ccb_type) { case PASS_CCB_QUEUED_IO: { struct pass_io_req *io_req; io_req = done_ccb->ccb_h.ccb_ioreq; #if 0 xpt_print(periph->path, "%s: called for user CCB %p\n", __func__, io_req->user_ccb_ptr); #endif if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) && ((io_req->flags & PASS_IO_ABANDONED) == 0)) { int error; uint32_t cam_flags, sense_flags; passflags(done_ccb, &cam_flags, &sense_flags); error = passerror(done_ccb, cam_flags, sense_flags); if (error == ERESTART) { KASSERT(((sense_flags & SF_NO_RETRY) == 0), ("passerror returned ERESTART with no retry requested\n")); return; } } /* * Copy the allocated CCB contents back to the malloced CCB * so we can give status back to the user when he requests it. */ bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb)); /* * Log data/transaction completion with devstat(9). */ switch (done_ccb->ccb_h.func_code) { case XPT_SCSI_IO: devstat_end_transaction(softc->device_stats, done_ccb->csio.dxfer_len - done_ccb->csio.resid, done_ccb->csio.tag_action & 0x3, ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, &io_req->start_time); break; case XPT_ATA_IO: devstat_end_transaction(softc->device_stats, done_ccb->ataio.dxfer_len - done_ccb->ataio.resid, 0, /* Not used in ATA */ ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, &io_req->start_time); break; case XPT_SMP_IO: /* * XXX KDM this isn't quite right, but there isn't * currently an easy way to represent a bidirectional * transfer in devstat. The only way to do it * and have the byte counts come out right would * mean that we would have to record two * transactions, one for the request and one for the * response. For now, so that we report something, * just treat the entire thing as a read. */ devstat_end_transaction(softc->device_stats, done_ccb->smpio.smp_request_len + done_ccb->smpio.smp_response_len, DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL, &io_req->start_time); break; default: devstat_end_transaction(softc->device_stats, 0, DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL, &io_req->start_time); break; } /* * In the normal case, take the completed I/O off of the * active queue and put it on the done queue. Notitfy the * user that we have a completed I/O. */ if ((io_req->flags & PASS_IO_ABANDONED) == 0) { TAILQ_REMOVE(&softc->active_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); selwakeuppri(&softc->read_select, PRIBIO); KNOTE_LOCKED(&softc->read_select.si_note, 0); } else { /* * In the case of an abandoned I/O (final close * without fetching the I/O), take it off of the * abandoned queue and free it. */ TAILQ_REMOVE(&softc->abandoned_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); /* * Release the done_ccb here, since we may wind up * freeing the peripheral when we decrement the * reference count below. */ xpt_release_ccb(done_ccb); /* * If the abandoned queue is empty, we can release * our reference to the periph since we won't have * any more completions coming. 
*/ if ((TAILQ_EMPTY(&softc->abandoned_queue)) && (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) { softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET; cam_periph_release_locked(periph); } /* * We have already released the CCB, so we can * return. */ return; } break; } } xpt_release_ccb(done_ccb); } static int passcreatezone(struct cam_periph *periph) { struct pass_softc *softc; int error; error = 0; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0), ("%s called when the pass(4) zone is valid!\n", __func__)); KASSERT((softc->pass_zone == NULL), ("%s called when the pass(4) zone is allocated!\n", __func__)); if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) { /* * We're the first context through, so we need to create * the pass(4) UMA zone for I/O requests. */ softc->flags |= PASS_FLAG_ZONE_INPROG; /* * uma_zcreate() does a blocking (M_WAITOK) allocation, * so we cannot hold a mutex while we call it. */ cam_periph_unlock(periph); softc->pass_zone = uma_zcreate(softc->zone_name, sizeof(struct pass_io_req), NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); softc->pass_io_zone = uma_zcreate(softc->io_zone_name, softc->io_zone_size, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); cam_periph_lock(periph); if ((softc->pass_zone == NULL) || (softc->pass_io_zone == NULL)) { if (softc->pass_zone == NULL) xpt_print(periph->path, "unable to allocate " "IO Req UMA zone\n"); else xpt_print(periph->path, "unable to allocate " "IO UMA zone\n"); softc->flags &= ~PASS_FLAG_ZONE_INPROG; goto bailout; } /* * Set the flags appropriately and notify any other waiters. */ softc->flags &= ~PASS_FLAG_ZONE_INPROG; softc->flags |= PASS_FLAG_ZONE_VALID; wakeup(&softc->pass_zone); } else { /* * In this case, the UMA zone has not yet been created, but * another context is in the process of creating it. We * need to sleep until the creation is either done or has * failed. */ while ((softc->flags & PASS_FLAG_ZONE_INPROG) && ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) { error = msleep(&softc->pass_zone, cam_periph_mtx(periph), PRIBIO, "paszon", 0); if (error != 0) goto bailout; } /* * If the zone creation failed, no luck for the user. 
*/ if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0){ error = ENOMEM; goto bailout; } } bailout: return (error); } static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req) { union ccb *ccb; uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; int i, numbufs; ccb = &io_req->ccb; switch (ccb->ccb_h.func_code) { case XPT_DEV_MATCH: numbufs = min(io_req->num_bufs, 2); if (numbufs == 1) { data_ptrs[0] = (uint8_t **)&ccb->cdm.matches; } else { data_ptrs[0] = (uint8_t **)&ccb->cdm.patterns; data_ptrs[1] = (uint8_t **)&ccb->cdm.matches; } break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: data_ptrs[0] = &ccb->csio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_ATA_IO: data_ptrs[0] = &ccb->ataio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_SMP_IO: numbufs = min(io_req->num_bufs, 2); data_ptrs[0] = &ccb->smpio.smp_request; data_ptrs[1] = &ccb->smpio.smp_response; break; case XPT_DEV_ADVINFO: numbufs = min(io_req->num_bufs, 1); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; break; case XPT_NVME_IO: case XPT_NVME_ADMIN: data_ptrs[0] = &ccb->nvmeio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; default: /* allow ourselves to be swapped once again */ return; break; /* NOTREACHED */ } if (io_req->flags & PASS_IO_USER_SEG_MALLOC) { free(io_req->user_segptr, M_SCSIPASS); io_req->user_segptr = NULL; } /* * We only want to free memory we malloced. */ if (io_req->data_flags == CAM_DATA_VADDR) { for (i = 0; i < io_req->num_bufs; i++) { if (io_req->kern_bufs[i] == NULL) continue; free(io_req->kern_bufs[i], M_SCSIPASS); io_req->kern_bufs[i] = NULL; } } else if (io_req->data_flags == CAM_DATA_SG) { for (i = 0; i < io_req->num_kern_segs; i++) { if ((uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr == NULL) continue; uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr); io_req->kern_segptr[i].ds_addr = 0; } } if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) { free(io_req->kern_segptr, M_SCSIPASS); io_req->kern_segptr = NULL; } if (io_req->data_flags != CAM_DATA_PADDR) { for (i = 0; i < numbufs; i++) { /* * Restore the user's buffer pointers to their * previous values. 
*/ if (io_req->user_bufs[i] != NULL) *data_ptrs[i] = io_req->user_bufs[i]; } } } static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction) { bus_size_t kern_watermark, user_watermark, len_to_copy; bus_dma_segment_t *user_sglist, *kern_sglist; int i, j, error; error = 0; kern_watermark = 0; user_watermark = 0; len_to_copy = 0; user_sglist = io_req->user_segptr; kern_sglist = io_req->kern_segptr; for (i = 0, j = 0; i < io_req->num_user_segs && j < io_req->num_kern_segs;) { uint8_t *user_ptr, *kern_ptr; len_to_copy = min(user_sglist[i].ds_len -user_watermark, kern_sglist[j].ds_len - kern_watermark); user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr; user_ptr = user_ptr + user_watermark; kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr; kern_ptr = kern_ptr + kern_watermark; user_watermark += len_to_copy; kern_watermark += len_to_copy; if (direction == CAM_DIR_IN) { error = copyout(kern_ptr, user_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyout of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, kern_ptr, user_ptr, error); goto bailout; } } else { error = copyin(user_ptr, kern_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyin of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, user_ptr, kern_ptr, error); goto bailout; } } if (user_sglist[i].ds_len == user_watermark) { i++; user_watermark = 0; } if (kern_sglist[j].ds_len == kern_watermark) { j++; kern_watermark = 0; } } bailout: return (error); } static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req) { union ccb *ccb; struct pass_softc *softc; int numbufs, i; uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t num_segs; uint16_t *seg_cnt_ptr; size_t maxmap; int error; cam_periph_assert(periph, MA_NOTOWNED); softc = periph->softc; error = 0; ccb = &io_req->ccb; maxmap = 0; num_segs = 0; seg_cnt_ptr = NULL; switch(ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.match_buf_len == 0) { printf("%s: invalid match buffer length 0\n", __func__); return(EINVAL); } if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (uint8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (uint8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (uint8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } io_req->data_flags = CAM_DATA_VADDR; break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * The user shouldn't be able to supply a bio. */ if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) return (EINVAL); io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; num_segs = ccb->csio.sglist_cnt; seg_cnt_ptr = &ccb->csio.sglist_cnt; numbufs = 1; maxmap = softc->maxio; break; case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * We only support a single virtual address for ATA I/O. 
*/ if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; maxmap = softc->maxio; break; case XPT_SMP_IO: io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; maxmap = softc->maxio; break; case XPT_DEV_ADVINFO: if (ccb->cdai.bufsiz == 0) return (0); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; break; case XPT_NVME_ADMIN: case XPT_NVME_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return (0); io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; data_ptrs[0] = &ccb->nvmeio.data_ptr; lengths[0] = ccb->nvmeio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; num_segs = ccb->nvmeio.sglist_cnt; seg_cnt_ptr = &ccb->nvmeio.sglist_cnt; numbufs = 1; maxmap = softc->maxio; break; default: return(EINVAL); break; /* NOTREACHED */ } io_req->num_bufs = numbufs; /* * If there is a maximum, check to make sure that the user's * request fits within the limit. In general, we should only have * a maximum length for requests that go to hardware. Otherwise it * is whatever we're able to malloc. */ for (i = 0; i < numbufs; i++) { io_req->user_bufs[i] = *data_ptrs[i]; io_req->dirs[i] = dirs[i]; io_req->lengths[i] = lengths[i]; if (maxmap == 0) continue; if (lengths[i] <= maxmap) continue; xpt_print(periph->path, "%s: data length %u > max allowed %u " "bytes\n", __func__, lengths[i], maxmap); error = EINVAL; goto bailout; } switch (io_req->data_flags) { case CAM_DATA_VADDR: /* Map or copy the buffer into kernel address space */ for (i = 0; i < numbufs; i++) { uint8_t *tmp_buf; /* * If for some reason no length is specified, we * don't need to allocate anything. */ if (io_req->lengths[i] == 0) continue; tmp_buf = malloc(lengths[i], M_SCSIPASS, M_WAITOK | M_ZERO); io_req->kern_bufs[i] = tmp_buf; *data_ptrs[i] = tmp_buf; #if 0 xpt_print(periph->path, "%s: malloced %p len %u, user " "buffer %p, operation: %s\n", __func__, tmp_buf, lengths[i], io_req->user_bufs[i], (dirs[i] == CAM_DIR_IN) ? "read" : "write"); #endif /* * We only need to copy in if the user is writing. */ if (dirs[i] != CAM_DIR_OUT) continue; error = copyin(io_req->user_bufs[i], io_req->kern_bufs[i], lengths[i]); if (error != 0) { xpt_print(periph->path, "%s: copy of user " "buffer from %p to %p failed with " "error %d\n", __func__, io_req->user_bufs[i], io_req->kern_bufs[i], error); goto bailout; } } break; case CAM_DATA_PADDR: /* Pass down the pointer as-is */ break; case CAM_DATA_SG: { size_t sg_length, size_to_go, alloc_size; uint32_t num_segs_needed; /* * Copy the user S/G list in, and then copy in the * individual segments. */ /* * We shouldn't see this, but check just in case. */ if (numbufs != 1) { xpt_print(periph->path, "%s: cannot currently handle " "more than one S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG flag set, " "but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. 
*/ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* * We allocate buffers in io_zone_size increments for an * S/G list. This will generally be maxphys. */ if (lengths[0] <= softc->io_zone_size) num_segs_needed = 1; else { num_segs_needed = lengths[0] / softc->io_zone_size; if ((lengths[0] % softc->io_zone_size) != 0) num_segs_needed++; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = num_segs_needed; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; /* * If we have enough segments allocated by default to handle * the length of the user's S/G list, */ if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } if (num_segs_needed > PASS_MAX_SEGS) { io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_KERN_SEG_MALLOC; } else { io_req->kern_segptr = io_req->kern_segs; } /* * Allocate the kernel S/G list. */ for (size_to_go = lengths[0], i = 0; size_to_go > 0 && i < num_segs_needed; i++, size_to_go -= alloc_size) { uint8_t *kern_ptr; alloc_size = min(size_to_go, softc->io_zone_size); kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK); io_req->kern_segptr[i].ds_addr = (bus_addr_t)(uintptr_t)kern_ptr; io_req->kern_segptr[i].ds_len = alloc_size; } if (size_to_go > 0) { printf("%s: size_to_go = %zu, software error!\n", __func__, size_to_go); error = EINVAL; goto bailout; } *data_ptrs[0] = (uint8_t *)io_req->kern_segptr; *seg_cnt_ptr = io_req->num_kern_segs; /* * We only need to copy data here if the user is writing. */ if (dirs[0] == CAM_DIR_OUT) error = passcopysglist(periph, io_req, dirs[0]); break; } case CAM_DATA_SG_PADDR: { size_t sg_length; /* * We shouldn't see this, but check just in case. */ if (numbufs != 1) { printf("%s: cannot currently handle more than one " "S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag " "set, but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. 
*/ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = io_req->num_user_segs; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; io_req->kern_segptr = io_req->user_segptr; error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } break; } default: case CAM_DATA_BIO: /* * A user shouldn't be attaching a bio to the CCB. It * isn't a user-accessible structure. */ error = EINVAL; break; } bailout: if (error != 0) passiocleanup(softc, io_req); return (error); } static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req) { struct pass_softc *softc; int error; int i; error = 0; softc = (struct pass_softc *)periph->softc; switch (io_req->data_flags) { case CAM_DATA_VADDR: /* * Copy back to the user buffer if this was a read. */ for (i = 0; i < io_req->num_bufs; i++) { if (io_req->dirs[i] != CAM_DIR_IN) continue; error = copyout(io_req->kern_bufs[i], io_req->user_bufs[i], io_req->lengths[i]); if (error != 0) { xpt_print(periph->path, "Unable to copy %u " "bytes from %p to user address %p\n", io_req->lengths[i], io_req->kern_bufs[i], io_req->user_bufs[i]); goto bailout; } } break; case CAM_DATA_PADDR: /* Do nothing. The pointer is a physical address already */ break; case CAM_DATA_SG: /* * Copy back to the user buffer if this was a read. * Restore the user's S/G list buffer pointer. */ if (io_req->dirs[0] == CAM_DIR_IN) error = passcopysglist(periph, io_req, io_req->dirs[0]); break; case CAM_DATA_SG_PADDR: /* * Restore the user's S/G list buffer pointer. No need to * copy. */ break; default: case CAM_DATA_BIO: error = EINVAL; break; } bailout: /* * Reset the user's pointers to their original values and free * allocated memory. */ passiocleanup(softc, io_req); return (error); } static int passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) { error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl); } return (error); } static int passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int error; uint32_t priority; periph = (struct cam_periph *)dev->si_drv1; cam_periph_lock(periph); softc = (struct pass_softc *)periph->softc; error = 0; switch (cmd) { case CAMIOCOMMAND: { union ccb *inccb; union ccb *ccb; int ccb_malloced; inccb = (union ccb *)addr; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (inccb->ccb_h.func_code == XPT_SCSI_IO) inccb->csio.bio = NULL; #endif if (inccb->ccb_h.flags & CAM_UNLOCKED) { error = EINVAL; break; } /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. 
*/ if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", inccb->ccb_h.func_code); error = ENODEV; break; } /* Compatibility for RL/priority-unaware code. */ priority = inccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Non-immediate CCBs need a CCB from the per-device pool * of CCBs, which is scheduled by the transport layer. * Immediate CCBs and user-supplied CCBs should just be * malloced. */ if ((inccb->ccb_h.func_code & XPT_FC_QUEUED) && ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) { ccb = cam_periph_getccb(periph, priority); ccb_malloced = 0; } else { ccb = xpt_alloc_ccb_nowait(); if (ccb != NULL) xpt_setup_ccb(&ccb->ccb_h, periph->path, priority); ccb_malloced = 1; } if (ccb == NULL) { xpt_print(periph->path, "unable to allocate CCB\n"); error = ENOMEM; break; } error = passsendccb(periph, ccb, inccb); if (ccb_malloced) xpt_free_ccb(ccb); else xpt_release_ccb(ccb); break; } case CAMIOQUEUE: { struct pass_io_req *io_req; union ccb **user_ccb, *ccb; xpt_opcode fc; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = ENOTTY; goto bailout; } #endif if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) { error = passcreatezone(periph); if (error != 0) goto bailout; } /* * We're going to do a blocking allocation for this I/O * request, so we have to drop the lock. */ cam_periph_unlock(periph); io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO); ccb = &io_req->ccb; user_ccb = (union ccb **)addr; /* * Unlike the CAMIOCOMMAND ioctl above, we only have a * pointer to the user's CCB, so we have to copy the whole * thing in to a buffer we have allocated (above) instead * of allowing the ioctl code to malloc a buffer and copy * it in. * * This is an advantage for this asynchronous interface, * since we don't want the memory to get freed while the * CCB is outstanding. */ #if 0 xpt_print(periph->path, "Copying user CCB %p to " "kernel address %p\n", *user_ccb, ccb); #endif error = copyin(*user_ccb, ccb, sizeof(*ccb)); if (error != 0) { xpt_print(periph->path, "Copy of user CCB %p to " "kernel address %p failed with error %d\n", *user_ccb, ccb, error); goto camioqueue_error; } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (ccb->ccb_h.func_code == XPT_SCSI_IO) ccb->csio.bio = NULL; #endif if (ccb->ccb_h.flags & CAM_UNLOCKED) { error = EINVAL; goto camioqueue_error; } if (ccb->ccb_h.flags & CAM_CDB_POINTER) { if (ccb->csio.cdb_len > IOCDBLEN) { error = EINVAL; goto camioqueue_error; } error = copyin(ccb->csio.cdb_io.cdb_ptr, ccb->csio.cdb_io.cdb_bytes, ccb->csio.cdb_len); if (error != 0) goto camioqueue_error; ccb->ccb_h.flags &= ~CAM_CDB_POINTER; } /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. */ if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", ccb->ccb_h.func_code); error = ENODEV; goto camioqueue_error; } /* * Save the user's CCB pointer as well as his linked list * pointers and peripheral private area so that we can * restore these later. */ io_req->user_ccb_ptr = *user_ccb; io_req->user_periph_links = ccb->ccb_h.periph_links; io_req->user_periph_priv = ccb->ccb_h.periph_priv; /* * Now that we've saved the user's values, we can set our * own peripheral private entry. */ ccb->ccb_h.ccb_ioreq = io_req; /* Compatibility for RL/priority-unaware code. 
*/ priority = ccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Setup fields in the CCB like the path and the priority. * The path in particular cannot be done in userland, since * it is a pointer to a kernel data structure. */ xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority, ccb->ccb_h.flags); /* * Setup our done routine. There is no way for the user to * have a valid pointer here. */ ccb->ccb_h.cbfcnp = passdone; fc = ccb->ccb_h.func_code; /* * If this function code has memory that can be mapped in * or out, we need to call passmemsetup(). */ if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) { error = passmemsetup(periph, io_req); if (error != 0) goto camioqueue_error; } else io_req->mapinfo.num_bufs_used = 0; cam_periph_lock(periph); /* * Everything goes on the incoming queue initially. */ TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links); /* * If the CCB is queued, and is not a user CCB, then * we need to allocate a slot for it. Call xpt_schedule() * so that our start routine will get called when a CCB is * available. */ if ((fc & XPT_FC_QUEUED) && ((fc & XPT_FC_USER_CCB) == 0)) { xpt_schedule(periph, priority); break; } /* * At this point, the CCB in question is either an * immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB * and therefore should be malloced, not allocated via a slot. * Remove the CCB from the incoming queue and add it to the * active queue. */ TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); xpt_action(ccb); /* * If this is not a queued CCB (i.e. it is an immediate CCB), * then it is already done. We need to put it on the done * queue for the user to fetch. */ if ((fc & XPT_FC_QUEUED) == 0) { TAILQ_REMOVE(&softc->active_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); } break; camioqueue_error: uma_zfree(softc->pass_zone, io_req); cam_periph_lock(periph); break; } case CAMIOGET: { union ccb **user_ccb; struct pass_io_req *io_req; int old_error; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = ENOTTY; goto bailout; } #endif user_ccb = (union ccb **)addr; old_error = 0; io_req = TAILQ_FIRST(&softc->done_queue); if (io_req == NULL) { error = ENOENT; break; } /* * Remove the I/O from the done queue. */ TAILQ_REMOVE(&softc->done_queue, io_req, links); /* * We have to drop the lock during the copyout because the * copyout can result in VM faults that require sleeping. */ cam_periph_unlock(periph); /* * Do any needed copies (e.g. for reads) and revert the * pointers in the CCB back to the user's pointers. */ error = passmemdone(periph, io_req); old_error = error; io_req->ccb.ccb_h.periph_links = io_req->user_periph_links; io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv; #if 0 xpt_print(periph->path, "Copying to user CCB %p from " "kernel address %p\n", *user_ccb, &io_req->ccb); #endif error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb)); if (error != 0) { xpt_print(periph->path, "Copy to user CCB %p from " "kernel address %p failed with error %d\n", *user_ccb, &io_req->ccb, error); } /* * Prefer the first error we got back, and make sure we * don't overwrite bad status with good. */ if (old_error != 0) error = old_error; cam_periph_lock(periph); /* * At this point, if there was an error, we could potentially * re-queue the I/O and try again. But why? 
The error * would almost certainly happen again. We might as well * not leak memory. */ uma_zfree(softc->pass_zone, io_req); break; } default: error = cam_periph_ioctl(periph, cmd, addr, passerror); break; } bailout: cam_periph_unlock(periph); return(error); } static int passpoll(struct cdev *dev, int poll_events, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int revents; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; revents = poll_events & (POLLOUT | POLLWRNORM); if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { cam_periph_lock(periph); if (!TAILQ_EMPTY(&softc->done_queue)) { revents |= poll_events & (POLLIN | POLLRDNORM); } cam_periph_unlock(periph); if (revents == 0) selrecord(td, &softc->read_select); } return (revents); } static int passkqfilter(struct cdev *dev, struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; kn->kn_hook = (caddr_t)periph; kn->kn_fop = &passread_filtops; knlist_add(&softc->read_select.si_note, kn, 0); return (0); } static void passreadfiltdetach(struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; knlist_remove(&softc->read_select.si_note, kn, 0); } static int passreadfilt(struct knote *kn, long hint) { struct cam_periph *periph; struct pass_softc *softc; int retval; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); if (TAILQ_EMPTY(&softc->done_queue)) retval = 0; else retval = 1; return (retval); } /* * Generally, "ccb" should be the CCB supplied by the kernel. "inccb" * should be the CCB that is copied in from the user. */ static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb) { struct pass_softc *softc; struct cam_periph_map_info mapinfo; uint8_t *cmd; xpt_opcode fc; int error; softc = (struct pass_softc *)periph->softc; /* * There are some fields in the CCB header that need to be * preserved, the rest we get from the user. */ xpt_merge_ccb(ccb, inccb); if (ccb->ccb_h.flags & CAM_CDB_POINTER) { cmd = __builtin_alloca(ccb->csio.cdb_len); error = copyin(ccb->csio.cdb_io.cdb_ptr, cmd, ccb->csio.cdb_len); if (error) return (error); ccb->csio.cdb_io.cdb_ptr = cmd; } /* * Let cam_periph_mapmem do a sanity check on the data pointer format. * Even if no data transfer is needed, it's a cheap check and it * simplifies the code. */ fc = ccb->ccb_h.func_code; if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_MMC_IO) || (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) { bzero(&mapinfo, sizeof(mapinfo)); /* * cam_periph_mapmem calls into proc and vm functions that can * sleep as well as trigger I/O, so we can't hold the lock. * Dropping it here is reasonably safe. */ cam_periph_unlock(periph); error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio); cam_periph_lock(periph); /* * cam_periph_mapmem returned an error, we can't continue. * Return the error to the user. */ if (error) return(error); } else /* Ensure that the unmap call later on is a no-op. */ mapinfo.num_bufs_used = 0; /* * If the user wants us to perform any error recovery, then honor * that request. Otherwise, it's up to the user to perform any * error recovery. 
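	 * passflags() below selects CAM_RETRY_SELTO and SF_RETRY_UA when
	 * CAM_PASS_ERR_RECOVER is set, and otherwise disables retries and
	 * recovery entirely.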
*/ { uint32_t cam_flags, sense_flags; passflags(ccb, &cam_flags, &sense_flags); cam_periph_runccb(ccb, passerror, cam_flags, sense_flags, softc->device_stats); } cam_periph_unlock(periph); error = cam_periph_unmapmem(ccb, &mapinfo); cam_periph_lock(periph); ccb->ccb_h.cbfcnp = NULL; ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv; bcopy(ccb, inccb, sizeof(union ccb)); return (error); } /* * Set the cam_flags and sense_flags based on whether or not the request wants * error recovery. In order to log errors via devctl, we need to do at least * minimal recovery. We do this by not retrying unit attention (we let the * requester do it, or not, if appropriate) and specifically asking for no * recovery, like we do during device probing. */ static void passflags(union ccb *ccb, uint32_t *cam_flags, uint32_t *sense_flags) { if ((ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) != 0) { *cam_flags = CAM_RETRY_SELTO; *sense_flags = SF_RETRY_UA | SF_NO_PRINT; } else { *cam_flags = 0; *sense_flags = SF_NO_RETRY | SF_NO_RECOVERY | SF_NO_PRINT; } } static int passerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags) { return(cam_periph_error(ccb, cam_flags, sense_flags)); } diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c index 6872e3a2a93b..278fcd908d7b 100644 --- a/sys/cam/scsi/scsi_target.c +++ b/sys/cam/scsi/scsi_target.c @@ -1,1169 +1,1169 @@ /*- * Generic SCSI Target Kernel Mode Driver * * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Nate Lawson. * Copyright (c) 1998, 1999, 2001, 2002 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include /* Includes to support callout */ #include #include #include #include #include #include #include #include /* Transaction information attached to each CCB sent by the user */ struct targ_cmd_descr { struct cam_periph_map_info mapinfo; TAILQ_ENTRY(targ_cmd_descr) tqe; union ccb *user_ccb; int priority; int func_code; }; /* Offset into the private CCB area for storing our descriptor */ #define targ_descr periph_priv.entries[1].ptr TAILQ_HEAD(descr_queue, targ_cmd_descr); typedef enum { TARG_STATE_RESV = 0x00, /* Invalid state */ TARG_STATE_OPENED = 0x01, /* Device opened, softc initialized */ TARG_STATE_LUN_ENABLED = 0x02 /* Device enabled for a path */ } targ_state; /* Per-instance device software context */ struct targ_softc { /* CCBs (CTIOs, ATIOs, INOTs) pending on the controller */ struct ccb_queue pending_ccb_queue; /* Command descriptors awaiting CTIO resources from the XPT */ struct descr_queue work_queue; /* Command descriptors that have been aborted back to the user. */ struct descr_queue abort_queue; /* * Queue of CCBs that have been copied out to userland, but our * userland daemon has not yet seen. */ struct ccb_queue user_ccb_queue; struct cam_periph *periph; struct cam_path *path; targ_state state; u_int maxio; struct selinfo read_select; struct devstat device_stats; }; static d_open_t targopen; static d_read_t targread; static d_write_t targwrite; static d_ioctl_t targioctl; static d_poll_t targpoll; static d_kqfilter_t targkqfilter; static void targreadfiltdetach(struct knote *kn); static int targreadfilt(struct knote *kn, long hint); -static struct filterops targread_filtops = { +static const struct filterops targread_filtops = { .f_isfd = 1, .f_detach = targreadfiltdetach, .f_event = targreadfilt, }; static struct cdevsw targ_cdevsw = { .d_version = D_VERSION, .d_open = targopen, .d_read = targread, .d_write = targwrite, .d_ioctl = targioctl, .d_poll = targpoll, .d_name = "targ", .d_kqfilter = targkqfilter }; static cam_status targendislun(struct cam_path *path, int enable, int grp6_len, int grp7_len); static cam_status targenable(struct targ_softc *softc, struct cam_path *path, int grp6_len, int grp7_len); static cam_status targdisable(struct targ_softc *softc); static periph_ctor_t targctor; static periph_dtor_t targdtor; static periph_start_t targstart; static int targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr, union ccb *ccb); static int targsendccb(struct targ_softc *softc, union ccb *ccb, struct targ_cmd_descr *descr); static void targdone(struct cam_periph *periph, union ccb *done_ccb); static int targreturnccb(struct targ_softc *softc, union ccb *ccb); static union ccb * targgetccb(struct targ_softc *softc, xpt_opcode type, int priority); static void targfreeccb(struct targ_softc *softc, union ccb *ccb); static struct targ_cmd_descr * targgetdescr(struct targ_softc *softc); static periph_init_t targinit; static void targasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); static void abort_all_pending(struct targ_softc *softc); static void notify_user(struct targ_softc *softc); static int targcamstatus(cam_status status); static size_t targccblen(xpt_opcode func_code); static struct periph_driver targdriver = { targinit, "targ", TAILQ_HEAD_INITIALIZER(targdriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(targ, targdriver); static MALLOC_DEFINE(M_TARG, "TARG", "TARG data"); /* Disable LUN if enabled and 
teardown softc */ static void targcdevdtor(void *data) { struct targ_softc *softc; struct cam_periph *periph; softc = data; if (softc->periph == NULL) { printf("%s: destroying non-enabled target\n", __func__); free(softc, M_TARG); return; } /* * Acquire a hold on the periph so that it doesn't go away before * we are ready at the end of the function. */ periph = softc->periph; cam_periph_acquire(periph); cam_periph_lock(periph); (void)targdisable(softc); if (softc->periph != NULL) { cam_periph_invalidate(softc->periph); softc->periph = NULL; } cam_periph_unlock(periph); cam_periph_release(periph); free(softc, M_TARG); } /* * Create softc and initialize it. There is no locking here because a * periph doesn't get created until an ioctl is issued to do so, and * that can't happen until this method returns. */ static int targopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct targ_softc *softc; /* Allocate its softc, initialize it */ softc = malloc(sizeof(*softc), M_TARG, M_WAITOK | M_ZERO); softc->state = TARG_STATE_OPENED; softc->periph = NULL; softc->path = NULL; TAILQ_INIT(&softc->pending_ccb_queue); TAILQ_INIT(&softc->work_queue); TAILQ_INIT(&softc->abort_queue); TAILQ_INIT(&softc->user_ccb_queue); knlist_init_mtx(&softc->read_select.si_note, NULL); devfs_set_cdevpriv(softc, targcdevdtor); return (0); } /* Enable/disable LUNs, set debugging level */ static int targioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct targ_softc *softc; cam_status status; devfs_get_cdevpriv((void **)&softc); switch (cmd) { case TARGIOCENABLE: { struct ioc_enable_lun *new_lun; struct cam_path *path; new_lun = (struct ioc_enable_lun *)addr; status = xpt_create_path(&path, /*periph*/NULL, new_lun->path_id, new_lun->target_id, new_lun->lun_id); if (status != CAM_REQ_CMP) { printf("Couldn't create path, status %#x\n", status); break; } xpt_path_lock(path); status = targenable(softc, path, new_lun->grp6_len, new_lun->grp7_len); xpt_path_unlock(path); xpt_free_path(path); break; } case TARGIOCDISABLE: if (softc->periph == NULL) { status = CAM_DEV_NOT_THERE; break; } cam_periph_lock(softc->periph); status = targdisable(softc); cam_periph_unlock(softc->periph); break; case TARGIOCDEBUG: { struct ccb_debug cdbg; /* If no periph available, disallow debugging changes */ if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) { status = CAM_DEV_NOT_THERE; break; } bzero(&cdbg, sizeof cdbg); if (*((int *)addr) != 0) cdbg.flags = CAM_DEBUG_PERIPH; else cdbg.flags = CAM_DEBUG_NONE; xpt_setup_ccb(&cdbg.ccb_h, softc->path, CAM_PRIORITY_NORMAL); cdbg.ccb_h.func_code = XPT_DEBUG; cdbg.ccb_h.cbfcnp = targdone; xpt_action((union ccb *)&cdbg); status = cdbg.ccb_h.status & CAM_STATUS_MASK; break; } default: status = CAM_PROVIDE_FAIL; break; } return (targcamstatus(status)); } /* Writes are always ready, reads wait for user_ccb_queue or abort_queue */ static int targpoll(struct cdev *dev, int poll_events, struct thread *td) { struct targ_softc *softc; int revents; devfs_get_cdevpriv((void **)&softc); /* Poll for write() is always ok. */ revents = poll_events & (POLLOUT | POLLWRNORM); if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { /* Poll for read() depends on user and abort queues. */ cam_periph_lock(softc->periph); if (!TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue)) { revents |= poll_events & (POLLIN | POLLRDNORM); } cam_periph_unlock(softc->periph); /* Only sleep if the user didn't poll for write. 
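		 * selrecord() only registers this thread against
		 * read_select; notify_user() wakes it later via
		 * selwakeuppri() once a CCB or aborted descriptor is ready.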
*/ if (revents == 0) selrecord(td, &softc->read_select); } return (revents); } static int targkqfilter(struct cdev *dev, struct knote *kn) { struct targ_softc *softc; devfs_get_cdevpriv((void **)&softc); kn->kn_hook = (caddr_t)softc; kn->kn_fop = &targread_filtops; knlist_add(&softc->read_select.si_note, kn, 0); return (0); } static void targreadfiltdetach(struct knote *kn) { struct targ_softc *softc; softc = (struct targ_softc *)kn->kn_hook; knlist_remove(&softc->read_select.si_note, kn, 0); } /* Notify the user's kqueue when the user queue or abort queue gets a CCB */ static int targreadfilt(struct knote *kn, long hint) { struct targ_softc *softc; int retval; softc = (struct targ_softc *)kn->kn_hook; cam_periph_lock(softc->periph); retval = !TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue); cam_periph_unlock(softc->periph); return (retval); } /* Send the HBA the enable/disable message */ static cam_status targendislun(struct cam_path *path, int enable, int grp6_len, int grp7_len) { struct ccb_en_lun en_ccb; cam_status status; /* Tell the lun to begin answering selects */ memset(&en_ccb, 0, sizeof(en_ccb)); xpt_setup_ccb(&en_ccb.ccb_h, path, CAM_PRIORITY_NORMAL); en_ccb.ccb_h.func_code = XPT_EN_LUN; /* Don't need support for any vendor specific commands */ en_ccb.grp6_len = grp6_len; en_ccb.grp7_len = grp7_len; en_ccb.enable = enable ? 1 : 0; xpt_action((union ccb *)&en_ccb); status = en_ccb.ccb_h.status & CAM_STATUS_MASK; if (status != CAM_REQ_CMP) { xpt_print(path, "%sable lun CCB rejected, status %#x\n", enable ? "en" : "dis", status); } return (status); } /* Enable target mode on a LUN, given its path */ static cam_status targenable(struct targ_softc *softc, struct cam_path *path, int grp6_len, int grp7_len) { struct cam_periph *periph; struct ccb_pathinq cpi; cam_status status; if ((softc->state & TARG_STATE_LUN_ENABLED) != 0) return (CAM_LUN_ALRDY_ENA); /* Make sure SIM supports target mode */ xpt_path_inq(&cpi, path); status = cpi.ccb_h.status & CAM_STATUS_MASK; if (status != CAM_REQ_CMP) { printf("pathinq failed, status %#x\n", status); goto enable_fail; } if ((cpi.target_sprt & PIT_PROCESSOR) == 0) { printf("controller does not support target mode\n"); status = CAM_FUNC_NOTAVAIL; goto enable_fail; } if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > maxphys) softc->maxio = maxphys; /* for safety */ else softc->maxio = cpi.maxio; /* real value */ /* Destroy any periph on our path if it is disabled */ periph = cam_periph_find(path, "targ"); if (periph != NULL) { struct targ_softc *del_softc; del_softc = (struct targ_softc *)periph->softc; if ((del_softc->state & TARG_STATE_LUN_ENABLED) == 0) { cam_periph_invalidate(del_softc->periph); del_softc->periph = NULL; } else { printf("Requested path still in use by targ%d\n", periph->unit_number); status = CAM_LUN_ALRDY_ENA; goto enable_fail; } } /* Create a periph instance attached to this path */ status = cam_periph_alloc(targctor, NULL, targdtor, targstart, "targ", CAM_PERIPH_BIO, path, targasync, 0, softc); if (status != CAM_REQ_CMP) { printf("cam_periph_alloc failed, status %#x\n", status); goto enable_fail; } /* Ensure that the periph now exists. 
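	 * cam_periph_alloc() just returned CAM_REQ_CMP, so a failed lookup
	 * here can only mean an internal inconsistency and is treated as
	 * fatal.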
*/ if (cam_periph_find(path, "targ") == NULL) { panic("targenable: succeeded but no periph?"); /* NOTREACHED */ } /* Send the enable lun message */ status = targendislun(path, /*enable*/1, grp6_len, grp7_len); if (status != CAM_REQ_CMP) { printf("enable lun failed, status %#x\n", status); goto enable_fail; } softc->state |= TARG_STATE_LUN_ENABLED; enable_fail: return (status); } /* Disable this softc's target instance if enabled */ static cam_status targdisable(struct targ_softc *softc) { cam_status status; if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) return (CAM_REQ_CMP); CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targdisable\n")); /* Abort any ccbs pending on the controller */ abort_all_pending(softc); /* Disable this lun */ status = targendislun(softc->path, /*enable*/0, /*grp6_len*/0, /*grp7_len*/0); if (status == CAM_REQ_CMP) softc->state &= ~TARG_STATE_LUN_ENABLED; else printf("Disable lun failed, status %#x\n", status); return (status); } /* Initialize a periph (called from cam_periph_alloc) */ static cam_status targctor(struct cam_periph *periph, void *arg) { struct targ_softc *softc; /* Store pointer to softc for periph-driven routines */ softc = (struct targ_softc *)arg; periph->softc = softc; softc->periph = periph; softc->path = periph->path; return (CAM_REQ_CMP); } static void targdtor(struct cam_periph *periph) { struct targ_softc *softc; struct ccb_hdr *ccb_h; struct targ_cmd_descr *descr; softc = (struct targ_softc *)periph->softc; /* * targdisable() aborts CCBs back to the user and leaves them * on user_ccb_queue and abort_queue in case the user is still * interested in them. We free them now. */ while ((ccb_h = TAILQ_FIRST(&softc->user_ccb_queue)) != NULL) { TAILQ_REMOVE(&softc->user_ccb_queue, ccb_h, periph_links.tqe); targfreeccb(softc, (union ccb *)ccb_h); } while ((descr = TAILQ_FIRST(&softc->abort_queue)) != NULL) { TAILQ_REMOVE(&softc->abort_queue, descr, tqe); free(descr, M_TARG); } softc->periph = NULL; softc->path = NULL; periph->softc = NULL; } /* Receive CCBs from user mode proc and send them to the HBA */ static int targwrite(struct cdev *dev, struct uio *uio, int ioflag) { union ccb *user_ccb; struct targ_softc *softc; struct targ_cmd_descr *descr; int write_len, error; int func_code, priority; devfs_get_cdevpriv((void **)&softc); write_len = error = 0; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("write - uio_resid %zd\n", uio->uio_resid)); while (uio->uio_resid >= sizeof(user_ccb) && error == 0) { union ccb *ccb; error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); if (error != 0) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("write - uiomove failed (%d)\n", error)); break; } priority = fuword32(&user_ccb->ccb_h.pinfo.priority); if (priority == CAM_PRIORITY_NONE) { error = EINVAL; break; } func_code = fuword32(&user_ccb->ccb_h.func_code); switch (func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: cam_periph_lock(softc->periph); ccb = targgetccb(softc, func_code, priority); descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr; descr->user_ccb = user_ccb; descr->func_code = func_code; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sent ATIO/INOT (%p)\n", user_ccb)); xpt_action(ccb); TAILQ_INSERT_TAIL(&softc->pending_ccb_queue, &ccb->ccb_h, periph_links.tqe); cam_periph_unlock(softc->periph); break; default: cam_periph_lock(softc->periph); if ((func_code & XPT_FC_QUEUED) != 0) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sending queued ccb %#x (%p)\n", func_code, user_ccb)); descr = targgetdescr(softc); 
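			/*
			 * Illustrative only (hypothetical userland sketch,
			 * not part of this driver): a target-mode daemon
			 * submits CCBs by writing an array of pointers to
			 * its own CCB structures, e.g.
			 *
			 *	union ccb *list[1] = { &my_ctio };
			 *	(void)write(targ_fd, list, sizeof(list));
			 *
			 * where "targ_fd" and "my_ctio" are made-up names.
			 * Each pointer is fetched by the uiomove() above;
			 * queued function codes are recorded in the
			 * descriptor and scheduled below so targstart() can
			 * pair them with a periph CCB.
			 */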
descr->user_ccb = user_ccb; descr->priority = priority; descr->func_code = func_code; TAILQ_INSERT_TAIL(&softc->work_queue, descr, tqe); xpt_schedule(softc->periph, priority); } else { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sending inline ccb %#x (%p)\n", func_code, user_ccb)); ccb = targgetccb(softc, func_code, priority); descr = (struct targ_cmd_descr *) ccb->ccb_h.targ_descr; descr->user_ccb = user_ccb; descr->priority = priority; descr->func_code = func_code; if (targusermerge(softc, descr, ccb) != EFAULT) targsendccb(softc, ccb, descr); targreturnccb(softc, ccb); } cam_periph_unlock(softc->periph); break; } write_len += sizeof(user_ccb); } /* * If we've successfully taken in some amount of * data, return success for that data first. If * an error is persistent, it will be reported * on the next write. */ if (error != 0 && write_len == 0) return (error); if (write_len == 0 && uio->uio_resid != 0) return (ENOSPC); return (0); } /* Process requests (descrs) via the periph-supplied CCBs */ static void targstart(struct cam_periph *periph, union ccb *start_ccb) { struct targ_softc *softc; struct targ_cmd_descr *descr, *next_descr; int error; softc = (struct targ_softc *)periph->softc; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targstart %p\n", start_ccb)); descr = TAILQ_FIRST(&softc->work_queue); if (descr == NULL) { xpt_release_ccb(start_ccb); } else { TAILQ_REMOVE(&softc->work_queue, descr, tqe); next_descr = TAILQ_FIRST(&softc->work_queue); /* Initiate a transaction using the descr and supplied CCB */ error = targusermerge(softc, descr, start_ccb); if (error == 0) error = targsendccb(softc, start_ccb, descr); if (error != 0) { xpt_print(periph->path, "targsendccb failed, err %d\n", error); xpt_release_ccb(start_ccb); (void)suword(&descr->user_ccb->ccb_h.status, CAM_REQ_CMP_ERR); TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe); notify_user(softc); } /* If we have more work to do, stay scheduled */ if (next_descr != NULL) xpt_schedule(periph, next_descr->priority); } } static int targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr, union ccb *ccb) { struct ccb_hdr *u_ccbh, *k_ccbh; size_t ccb_len; int error; u_ccbh = &descr->user_ccb->ccb_h; k_ccbh = &ccb->ccb_h; /* * There are some fields in the CCB header that need to be * preserved, the rest we get from the user ccb. 
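	 * The header is rebuilt with xpt_setup_ccb() and retry_count, flags
	 * and timeout are fetched from userland with fuword32(); only the
	 * portion after the header is bulk copied in.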
(See xpt_merge_ccb) */ xpt_setup_ccb(k_ccbh, softc->path, descr->priority); k_ccbh->retry_count = fuword32(&u_ccbh->retry_count); k_ccbh->func_code = descr->func_code; k_ccbh->flags = fuword32(&u_ccbh->flags); k_ccbh->timeout = fuword32(&u_ccbh->timeout); ccb_len = targccblen(k_ccbh->func_code) - sizeof(struct ccb_hdr); error = copyin(u_ccbh + 1, k_ccbh + 1, ccb_len); if (error != 0) { k_ccbh->status = CAM_REQ_CMP_ERR; return (error); } /* Translate usermode abort_ccb pointer to its kernel counterpart */ if (k_ccbh->func_code == XPT_ABORT) { struct ccb_abort *cab; struct ccb_hdr *ccb_h; cab = (struct ccb_abort *)ccb; TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue, periph_links.tqe) { struct targ_cmd_descr *ab_descr; ab_descr = (struct targ_cmd_descr *)ccb_h->targ_descr; if (ab_descr->user_ccb == cab->abort_ccb) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Changing abort for %p to %p\n", cab->abort_ccb, ccb_h)); cab->abort_ccb = (union ccb *)ccb_h; break; } } /* CCB not found, set appropriate status */ if (ccb_h == NULL) { k_ccbh->status = CAM_PATH_INVALID; error = ESRCH; } } return (error); } /* Build and send a kernel CCB formed from descr->user_ccb */ static int targsendccb(struct targ_softc *softc, union ccb *ccb, struct targ_cmd_descr *descr) { struct cam_periph_map_info *mapinfo; struct ccb_hdr *ccb_h; int error; ccb_h = &ccb->ccb_h; mapinfo = &descr->mapinfo; mapinfo->num_bufs_used = 0; /* * There's no way for the user to have a completion * function, so we put our own completion function in here. * We also stash in a reference to our descriptor so targreturnccb() * can find our mapping info. */ ccb_h->cbfcnp = targdone; ccb_h->targ_descr = descr; if ((ccb_h->func_code == XPT_CONT_TARGET_IO) || (ccb_h->func_code == XPT_DEV_MATCH)) { error = cam_periph_mapmem(ccb, mapinfo, softc->maxio); /* * cam_periph_mapmem returned an error, we can't continue. * Return the error to the user. */ if (error) { ccb_h->status = CAM_REQ_CMP_ERR; mapinfo->num_bufs_used = 0; return (error); } } /* * Once queued on the pending CCB list, this CCB will be protected * by our error recovery handler. 
*/ CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("sendccb %p\n", ccb)); if (XPT_FC_IS_QUEUED(ccb)) { TAILQ_INSERT_TAIL(&softc->pending_ccb_queue, ccb_h, periph_links.tqe); } xpt_action(ccb); return (0); } /* Completion routine for CCBs (called at splsoftcam) */ static void targdone(struct cam_periph *periph, union ccb *done_ccb) { struct targ_softc *softc; CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("targdone %p\n", done_ccb)); softc = (struct targ_softc *)periph->softc; TAILQ_REMOVE(&softc->pending_ccb_queue, &done_ccb->ccb_h, periph_links.tqe); /* If we're no longer enabled, throw away CCB */ if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) { targfreeccb(softc, done_ccb); return; } /* abort_all_pending() waits for pending queue to be empty */ if (TAILQ_EMPTY(&softc->pending_ccb_queue)) wakeup(&softc->pending_ccb_queue); switch (done_ccb->ccb_h.func_code) { /* All FC_*_QUEUED CCBs go back to userland */ case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: case XPT_ACCEPT_TARGET_IO: case XPT_CONT_TARGET_IO: TAILQ_INSERT_TAIL(&softc->user_ccb_queue, &done_ccb->ccb_h, periph_links.tqe); cam_periph_unlock(softc->periph); notify_user(softc); cam_periph_lock(softc->periph); break; default: panic("targdone: impossible xpt opcode %#x", done_ccb->ccb_h.func_code); /* NOTREACHED */ } } /* Return CCBs to the user from the user queue and abort queue */ static int targread(struct cdev *dev, struct uio *uio, int ioflag) { struct descr_queue *abort_queue; struct targ_cmd_descr *user_descr; struct targ_softc *softc; struct ccb_queue *user_queue; struct ccb_hdr *ccb_h; union ccb *user_ccb; int read_len, error; error = 0; read_len = 0; devfs_get_cdevpriv((void **)&softc); user_queue = &softc->user_ccb_queue; abort_queue = &softc->abort_queue; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread\n")); /* If no data is available, wait or return immediately */ cam_periph_lock(softc->periph); ccb_h = TAILQ_FIRST(user_queue); user_descr = TAILQ_FIRST(abort_queue); while (ccb_h == NULL && user_descr == NULL) { if ((ioflag & IO_NDELAY) == 0) { error = cam_periph_sleep(softc->periph, user_queue, PRIBIO | PCATCH, "targrd", 0); ccb_h = TAILQ_FIRST(user_queue); user_descr = TAILQ_FIRST(abort_queue); if (error != 0) { if (error == ERESTART) { continue; } else { goto read_fail; } } } else { cam_periph_unlock(softc->periph); return (EAGAIN); } } /* Data is available so fill the user's buffer */ while (ccb_h != NULL) { struct targ_cmd_descr *descr; if (uio->uio_resid < sizeof(user_ccb)) break; TAILQ_REMOVE(user_queue, ccb_h, periph_links.tqe); descr = (struct targ_cmd_descr *)ccb_h->targ_descr; user_ccb = descr->user_ccb; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread ccb %p (%p)\n", ccb_h, user_ccb)); error = targreturnccb(softc, (union ccb *)ccb_h); if (error != 0) goto read_fail; cam_periph_unlock(softc->periph); error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); cam_periph_lock(softc->periph); if (error != 0) goto read_fail; read_len += sizeof(user_ccb); ccb_h = TAILQ_FIRST(user_queue); } /* Flush out any aborted descriptors */ while (user_descr != NULL) { if (uio->uio_resid < sizeof(user_ccb)) break; TAILQ_REMOVE(abort_queue, user_descr, tqe); user_ccb = user_descr->user_ccb; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread aborted descr %p (%p)\n", user_descr, user_ccb)); if (suword(&user_ccb->ccb_h.status, CAM_REQ_ABORTED) != 0) { error = EFAULT; goto read_fail; } cam_periph_unlock(softc->periph); error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); cam_periph_lock(softc->periph); if (error != 0) 
goto read_fail; read_len += sizeof(user_ccb); user_descr = TAILQ_FIRST(abort_queue); } /* * If we've successfully read some amount of data, don't report an * error. If the error is persistent, it will be reported on the * next read(). */ if (read_len == 0 && uio->uio_resid != 0) error = ENOSPC; read_fail: cam_periph_unlock(softc->periph); return (error); } /* Copy completed ccb back to the user */ static int targreturnccb(struct targ_softc *softc, union ccb *ccb) { struct targ_cmd_descr *descr; struct ccb_hdr *u_ccbh; size_t ccb_len; int error; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targreturnccb %p\n", ccb)); descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr; u_ccbh = &descr->user_ccb->ccb_h; /* Copy out the central portion of the ccb_hdr */ error = copyout(&ccb->ccb_h.retry_count, &u_ccbh->retry_count, offsetof(struct ccb_hdr, periph_priv) - offsetof(struct ccb_hdr, retry_count)); if (error != 0) { xpt_print(softc->path, "targreturnccb - CCB header copyout failed (%d)\n", error); } /* Copy out the rest of the ccb (after the ccb_hdr) */ ccb_len = targccblen(ccb->ccb_h.func_code) - sizeof(struct ccb_hdr); if (descr->mapinfo.num_bufs_used != 0) { int error1; error1 = cam_periph_unmapmem(ccb, &descr->mapinfo); if (error == 0) error = error1; } if (error == 0) { error = copyout(&ccb->ccb_h + 1, u_ccbh + 1, ccb_len); if (error != 0) { xpt_print(softc->path, "targreturnccb - CCB copyout failed (%d)\n", error); } } /* Free CCB or send back to devq. */ targfreeccb(softc, ccb); return (error); } static union ccb * targgetccb(struct targ_softc *softc, xpt_opcode type, int priority) { union ccb *ccb; int ccb_len; ccb_len = targccblen(type); ccb = malloc(ccb_len, M_TARG, M_NOWAIT | M_ZERO); CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("getccb %p\n", ccb)); if (ccb == NULL) { return (ccb); } xpt_setup_ccb(&ccb->ccb_h, softc->path, priority); ccb->ccb_h.func_code = type; ccb->ccb_h.cbfcnp = targdone; ccb->ccb_h.targ_descr = targgetdescr(softc); if (ccb->ccb_h.targ_descr == NULL) { free (ccb, M_TARG); ccb = NULL; } return (ccb); } static void targfreeccb(struct targ_softc *softc, union ccb *ccb) { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("targfreeccb descr %p and\n", ccb->ccb_h.targ_descr)); free(ccb->ccb_h.targ_descr, M_TARG); switch (ccb->ccb_h.func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb)); free(ccb, M_TARG); break; default: /* Send back CCB if we got it from the periph */ if (XPT_FC_IS_QUEUED(ccb)) { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("returning queued ccb %p\n", ccb)); xpt_release_ccb(ccb); } else { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb)); free(ccb, M_TARG); } break; } } static struct targ_cmd_descr * targgetdescr(struct targ_softc *softc) { struct targ_cmd_descr *descr; descr = malloc(sizeof(*descr), M_TARG, M_NOWAIT); if (descr) { descr->mapinfo.num_bufs_used = 0; } return (descr); } static void targinit(void) { struct cdev *dev; /* Add symbolic link to targ0 for compatibility. */ dev = make_dev(&targ_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "targ"); make_dev_alias(dev, "targ0"); } static void targasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { /* All events are handled in usermode by INOTs */ panic("targasync() called, should be an INOT instead"); } /* Cancel all pending requests and CCBs awaiting work. 
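 * Descriptors still waiting for CCB resources are moved to the abort queue
 * for the user to collect; CCBs already submitted are aborted with XPT_ABORT
 * and come back through targdone().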
*/ static void abort_all_pending(struct targ_softc *softc) { struct targ_cmd_descr *descr; struct ccb_abort cab; struct ccb_hdr *ccb_h; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("abort_all_pending\n")); /* First abort the descriptors awaiting resources */ while ((descr = TAILQ_FIRST(&softc->work_queue)) != NULL) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Aborting descr from workq %p\n", descr)); TAILQ_REMOVE(&softc->work_queue, descr, tqe); TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe); } /* * Then abort all pending CCBs. * targdone() will return the aborted CCB via user_ccb_queue */ memset(&cab, 0, sizeof(cab)); xpt_setup_ccb(&cab.ccb_h, softc->path, CAM_PRIORITY_NORMAL); cab.ccb_h.func_code = XPT_ABORT; cab.ccb_h.status = CAM_REQ_CMP_ERR; TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue, periph_links.tqe) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Aborting pending CCB %p\n", ccb_h)); cab.abort_ccb = (union ccb *)ccb_h; xpt_action((union ccb *)&cab); if (cab.ccb_h.status != CAM_REQ_CMP) { xpt_print(cab.ccb_h.path, "Unable to abort CCB, status %#x\n", cab.ccb_h.status); } } /* If we aborted at least one pending CCB ok, wait for it. */ if (cab.ccb_h.status == CAM_REQ_CMP) { cam_periph_sleep(softc->periph, &softc->pending_ccb_queue, PRIBIO | PCATCH, "tgabrt", 0); } /* If we aborted anything from the work queue, wakeup user. */ if (!TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue)) { cam_periph_unlock(softc->periph); notify_user(softc); cam_periph_lock(softc->periph); } } /* Notify the user that data is ready */ static void notify_user(struct targ_softc *softc) { /* * Notify users sleeping via poll(), kqueue(), and * blocking read(). */ selwakeuppri(&softc->read_select, PRIBIO); KNOTE_UNLOCKED(&softc->read_select.si_note, 0); wakeup(&softc->user_ccb_queue); } /* Convert CAM status to errno values */ static int targcamstatus(cam_status status) { switch (status & CAM_STATUS_MASK) { case CAM_REQ_CMP: /* CCB request completed without error */ return (0); case CAM_REQ_INPROG: /* CCB request is in progress */ return (EINPROGRESS); case CAM_REQ_CMP_ERR: /* CCB request completed with an error */ return (EIO); case CAM_PROVIDE_FAIL: /* Unable to provide requested capability */ return (ENOTTY); case CAM_FUNC_NOTAVAIL: /* The requested function is not available */ return (ENOTSUP); case CAM_LUN_ALRDY_ENA: /* LUN is already enabled for target mode */ return (EADDRINUSE); case CAM_PATH_INVALID: /* Supplied Path ID is invalid */ case CAM_DEV_NOT_THERE: /* SCSI Device Not Installed/there */ return (ENOENT); case CAM_REQ_ABORTED: /* CCB request aborted by the host */ return (ECANCELED); case CAM_CMD_TIMEOUT: /* Command timeout */ return (ETIMEDOUT); case CAM_REQUEUE_REQ: /* Requeue to preserve transaction ordering */ return (EAGAIN); case CAM_REQ_INVALID: /* CCB request was invalid */ return (EINVAL); case CAM_RESRC_UNAVAIL: /* Resource Unavailable */ return (ENOMEM); case CAM_BUSY: /* CAM subsystem is busy */ case CAM_UA_ABORT: /* Unable to abort CCB request */ return (EBUSY); default: return (ENXIO); } } static size_t targccblen(xpt_opcode func_code) { int len; /* Codes we expect to see as a target */ switch (func_code) { case XPT_CONT_TARGET_IO: case XPT_SCSI_IO: len = sizeof(struct ccb_scsiio); break; case XPT_ACCEPT_TARGET_IO: len = sizeof(struct ccb_accept_tio); break; case XPT_IMMED_NOTIFY: len = sizeof(struct ccb_immed_notify); break; case XPT_IMMEDIATE_NOTIFY: len = sizeof(struct ccb_immediate_notify); break; case XPT_REL_SIMQ: len = sizeof(struct ccb_relsim); break; 
case XPT_PATH_INQ: len = sizeof(struct ccb_pathinq); break; case XPT_DEBUG: len = sizeof(struct ccb_debug); break; case XPT_ABORT: len = sizeof(struct ccb_abort); break; case XPT_EN_LUN: len = sizeof(struct ccb_en_lun); break; default: len = sizeof(union ccb); break; } return (len); } diff --git a/sys/compat/linuxkpi/common/include/linux/file.h b/sys/compat/linuxkpi/common/include/linux/file.h index f94e3d89ced1..f6e988c2d88e 100644 --- a/sys/compat/linuxkpi/common/include/linux/file.h +++ b/sys/compat/linuxkpi/common/include/linux/file.h @@ -1,185 +1,185 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUXKPI_LINUX_FILE_H_ #define _LINUXKPI_LINUX_FILE_H_ #include #include #include #include #include #include #include #include struct linux_file; #undef file -extern struct fileops linuxfileops; +extern const struct fileops linuxfileops; static inline struct linux_file * linux_fget(unsigned int fd) { struct file *file; /* lookup file pointer by file descriptor index */ if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) return (NULL); /* check if file handle really belongs to us */ if (file->f_data == NULL || file->f_ops != &linuxfileops) { fdrop(file, curthread); return (NULL); } return ((struct linux_file *)file->f_data); } extern void linux_file_free(struct linux_file *filp); static inline void fput(struct linux_file *filp) { if (refcount_release(filp->_file == NULL ? &filp->f_count : &filp->_file->f_count)) { linux_file_free(filp); } } static inline unsigned int file_count(struct linux_file *filp) { return (filp->_file == NULL ? filp->f_count : filp->_file->f_count); } static inline void put_unused_fd(unsigned int fd) { struct file *file; if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) { return; } /* * NOTE: We should only get here when the "fd" has not been * installed, so no need to free the associated Linux file * structure. 
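	 * The reference obtained by fget_unlocked() above is dropped right
	 * after the descriptor slot is released.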
*/ fdclose(curthread, file, fd); /* drop extra reference */ fdrop(file, curthread); } static inline void fd_install(unsigned int fd, struct linux_file *filp) { struct file *file; if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) { filp->_file = NULL; } else { filp->_file = file; finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); /* transfer reference count from "filp" to "file" */ while (refcount_release(&filp->f_count) == 0) refcount_acquire(&file->f_count); } /* drop the extra reference */ fput(filp); } static inline int get_unused_fd(void) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, 0); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } static inline int get_unused_fd_flags(int flags) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, flags); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } extern struct linux_file *linux_file_alloc(void); static inline struct linux_file * alloc_file(int mode, const struct file_operations *fops) { struct linux_file *filp; filp = linux_file_alloc(); filp->f_op = fops; filp->f_mode = mode; return (filp); } struct fd { struct linux_file *linux_file; }; static inline void fdput(struct fd fd) { fput(fd.linux_file); } static inline struct fd fdget(unsigned int fd) { struct linux_file *f = linux_fget(fd); return (struct fd){f}; } #define file linux_file #define fget(...) linux_fget(__VA_ARGS__) #endif /* _LINUXKPI_LINUX_FILE_H_ */ diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index fe1a545c6a3a..1fc71c55469a 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1,2876 +1,2876 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2021 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include "opt_global.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include #ifdef XENHVM #undef xen_pv_domain #undef xen_initial_domain /* xen/xen-os.h redefines __must_check */ #undef __must_check #include #endif SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "LinuxKPI parameters"); int linuxkpi_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); int linuxkpi_rcu_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN, &linuxkpi_rcu_debug, 0, "Set to enable RCU warning. Clear to disable."); int linuxkpi_warn_dump_stack = 0; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN, &linuxkpi_warn_dump_stack, 0, "Set to enable stack traces from WARN_ON(). Clear to disable."); static struct timeval lkpi_net_lastlog; static int lkpi_net_curpps; static int lkpi_net_maxpps = 99; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN, &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second."); MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat"); #include /* Undo Linux compat changes. 
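 * The LinuxKPI headers #define identifiers such as file and cdev (see
 * linux/file.h); undefine them here so this file can use the native
 * FreeBSD names.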
*/ #undef RB_ROOT #undef file #undef cdev #define RB_ROOT(head) (head)->rbh_root static void linux_destroy_dev(struct linux_cdev *); static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); cpumask_t cpu_online_mask; static cpumask_t **static_single_cpu_mask; static cpumask_t *static_single_cpu_mask_lcs; struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; struct list_head pci_drivers; struct list_head pci_devices; spinlock_t pci_lock; struct uts_namespace init_uts_ns; unsigned long linux_timer_hz_mask; wait_queue_head_t linux_bit_waitq; wait_queue_head_t linux_var_waitq; int panic_cmp(struct rb_node *one, struct rb_node *two) { panic("no cmp"); } RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); #define START(node) ((node)->start) #define LAST(node) ((node)->last) INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START, LAST,, lkpi_interval_tree) static void linux_device_release(struct device *dev) { pr_debug("linux_device_release: %s\n", dev_name(dev)); kfree(dev); } static ssize_t linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct class, kobj), dattr, buf); return (error); } static ssize_t linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct class, kobj), dattr, buf, count); return (error); } static void linux_class_release(struct kobject *kobj) { struct class *class; class = container_of(kobj, struct class, kobj); if (class->class_release) class->class_release(class); } static const struct sysfs_ops linux_class_sysfs = { .show = linux_class_show, .store = linux_class_store, }; const struct kobj_type linux_class_ktype = { .release = linux_class_release, .sysfs_ops = &linux_class_sysfs }; static void linux_dev_release(struct kobject *kobj) { struct device *dev; dev = container_of(kobj, struct device, kobj); /* This is the precedence defined by linux. */ if (dev->release) dev->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); } static ssize_t linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct device, kobj), dattr, buf); return (error); } static ssize_t linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct device, kobj), dattr, buf, count); return (error); } static const struct sysfs_ops linux_dev_sysfs = { .show = linux_dev_show, .store = linux_dev_store, }; const struct kobj_type linux_dev_ktype = { .release = linux_dev_release, .sysfs_ops = &linux_dev_sysfs }; struct device * device_create(struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) 
{ struct device *dev; va_list args; dev = kzalloc(sizeof(*dev), M_WAITOK); dev->parent = parent; dev->class = class; dev->devt = devt; dev->driver_data = drvdata; dev->release = linux_device_release; va_start(args, fmt); kobject_set_name_vargs(&dev->kobj, fmt, args); va_end(args); device_register(dev); return (dev); } struct device * device_create_groups_vargs(struct class *class, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, va_list args) { struct device *dev = NULL; int retval = -ENODEV; if (class == NULL || IS_ERR(class)) goto error; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } dev->devt = devt; dev->class = class; dev->parent = parent; dev->groups = groups; dev->release = device_create_release; /* device_initialize() needs the class and parent to be set */ device_initialize(dev); dev_set_drvdata(dev, drvdata); retval = kobject_set_name_vargs(&dev->kobj, fmt, args); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } struct class * class_create(struct module *owner, const char *name) { struct class *class; int error; class = kzalloc(sizeof(*class), M_WAITOK); class->owner = owner; class->name = name; class->class_release = linux_class_kfree; error = class_register(class); if (error) { kfree(class); return (NULL); } return (class); } static void linux_kq_lock(void *arg) { spinlock_t *s = arg; spin_lock(s); } static void linux_kq_unlock(void *arg) { spinlock_t *s = arg; spin_unlock(s); } static void linux_kq_assert_lock(void *arg, int what) { #ifdef INVARIANTS spinlock_t *s = arg; if (what == LA_LOCKED) mtx_assert(s, MA_OWNED); else mtx_assert(s, MA_NOTOWNED); #endif } static void linux_file_kqfilter_poll(struct linux_file *, int); struct linux_file * linux_file_alloc(void) { struct linux_file *filp; filp = kzalloc(sizeof(*filp), GFP_KERNEL); /* set initial refcount */ filp->f_count = 1; /* setup fields needed by kqueue support */ spin_lock_init(&filp->f_kqlock); knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock); return (filp); } void linux_file_free(struct linux_file *filp) { if (filp->_file == NULL) { if (filp->f_op != NULL && filp->f_op->release != NULL) filp->f_op->release(filp->f_vnode, filp); if (filp->f_shmem != NULL) vm_object_deallocate(filp->f_shmem); kfree_rcu(filp, rcu); } else { /* * The close method of the character device or file * will free the linux_file structure: */ _fdrop(filp->_file, curthread); } } struct linux_cdev * cdev_alloc(void) { struct linux_cdev *cdev; cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); kobject_init(&cdev->kobj, &linux_cdev_ktype); cdev->refs = 1; return (cdev); } static int linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; vm_page_t page; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake * page, update it with the new physical * address. */ page = *mres; vm_page_updatefake(page, paddr, vm_obj->memattr); } else { /* * Replace the passed in "mres" page with our * own fake page and free up the all of the * original pages. 
*/ VM_OBJECT_WUNLOCK(vm_obj); page = vm_page_getfake(paddr, vm_obj->memattr); VM_OBJECT_WLOCK(vm_obj); vm_page_replace(page, vm_obj, (*mres)->pindex, *mres); *mres = page; } vm_page_valid(page); return (VM_PAGER_OK); } return (VM_PAGER_FAIL); } static int linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { struct vm_area_struct *vmap; int err; /* get VM area structure */ vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); VM_OBJECT_WUNLOCK(vm_obj); linux_set_current(curthread); down_write(&vmap->vm_mm->mmap_sem); if (unlikely(vmap->vm_ops == NULL)) { err = VM_FAULT_SIGBUS; } else { struct vm_fault vmf; /* fill out VM fault structure */ vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; vmf.pgoff = 0; vmf.page = NULL; vmf.vma = vmap; vmap->vm_pfn_count = 0; vmap->vm_pfn_pcount = &vmap->vm_pfn_count; vmap->vm_obj = vm_obj; err = vmap->vm_ops->fault(&vmf); while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { kern_yield(PRI_USER); err = vmap->vm_ops->fault(&vmf); } } /* translate return code */ switch (err) { case VM_FAULT_OOM: err = VM_PAGER_AGAIN; break; case VM_FAULT_SIGBUS: err = VM_PAGER_BAD; break; case VM_FAULT_NOPAGE: /* * By contract the fault handler will return having * busied all the pages itself. If pidx is already * found in the object, it will simply xbusy the first * page and return with vm_pfn_count set to 1. */ *first = vmap->vm_pfn_first; *last = *first + vmap->vm_pfn_count - 1; err = VM_PAGER_OK; break; default: err = VM_PAGER_ERROR; break; } up_write(&vmap->vm_mm->mmap_sem); VM_OBJECT_WLOCK(vm_obj); return (err); } static struct rwlock linux_vma_lock; static TAILQ_HEAD(, vm_area_struct) linux_vma_head = TAILQ_HEAD_INITIALIZER(linux_vma_head); static void linux_cdev_handle_free(struct vm_area_struct *vmap) { /* Drop reference on vm_file */ if (vmap->vm_file != NULL) fput(vmap->vm_file); /* Drop reference on mm_struct */ mmput(vmap->vm_mm); kfree(vmap); } static void linux_cdev_handle_remove(struct vm_area_struct *vmap) { rw_wlock(&linux_vma_lock); TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); rw_wunlock(&linux_vma_lock); } static struct vm_area_struct * linux_cdev_handle_find(void *handle) { struct vm_area_struct *vmap; rw_rlock(&linux_vma_lock); TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { if (vmap->vm_private_data == handle) break; } rw_runlock(&linux_vma_lock); return (vmap); } static int linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { MPASS(linux_cdev_handle_find(handle) != NULL); *color = 0; return (0); } static void linux_cdev_pager_dtor(void *handle) { const struct vm_operations_struct *vm_ops; struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(handle); MPASS(vmap != NULL); /* * Remove handle before calling close operation to prevent * other threads from reusing the handle pointer. 
*/ linux_cdev_handle_remove(vmap); down_write(&vmap->vm_mm->mmap_sem); vm_ops = vmap->vm_ops; if (likely(vm_ops != NULL)) vm_ops->close(vmap); up_write(&vmap->vm_mm->mmap_sem); linux_cdev_handle_free(vmap); } static struct cdev_pager_ops linux_cdev_pager_ops[2] = { { /* OBJT_MGTDEVICE */ .cdev_pg_populate = linux_cdev_pager_populate, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, { /* OBJT_DEVICE */ .cdev_pg_fault = linux_cdev_pager_fault, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, }; int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { vm_object_t obj; vm_page_t m; obj = vma->vm_obj; if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) return (-ENOTSUP); VM_OBJECT_RLOCK(obj); for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); m != NULL && m->pindex < OFF_TO_IDX(address + size); m = TAILQ_NEXT(m, listq)) pmap_remove_all(m); VM_OBJECT_RUNLOCK(obj); return (0); } void vma_set_file(struct vm_area_struct *vma, struct linux_file *file) { struct linux_file *tmp; /* Changing an anonymous vma with this is illegal */ get_file(file); tmp = vma->vm_file; vma->vm_file = file; fput(tmp); } static struct file_operations dummy_ldev_ops = { /* XXXKIB */ }; static struct linux_cdev dummy_ldev = { .ops = &dummy_ldev_ops, }; #define LDEV_SI_DTR 0x0001 #define LDEV_SI_REF 0x0002 static void linux_get_fop(struct linux_file *filp, const struct file_operations **fop, struct linux_cdev **dev) { struct linux_cdev *ldev; u_int siref; ldev = filp->f_cdev; *fop = filp->f_op; if (ldev != NULL) { if (ldev->kobj.ktype == &linux_cdev_static_ktype) { refcount_acquire(&ldev->refs); } else { for (siref = ldev->siref;;) { if ((siref & LDEV_SI_DTR) != 0) { ldev = &dummy_ldev; *fop = ldev->ops; siref = ldev->siref; MPASS((ldev->siref & LDEV_SI_DTR) == 0); } else if (atomic_fcmpset_int(&ldev->siref, &siref, siref + LDEV_SI_REF)) { break; } } } } *dev = ldev; } static void linux_drop_fop(struct linux_cdev *ldev) { if (ldev == NULL) return; if (ldev->kobj.ktype == &linux_cdev_static_ktype) { linux_cdev_deref(ldev); } else { MPASS(ldev->kobj.ktype == &linux_cdev_ktype); MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); atomic_subtract_int(&ldev->siref, LDEV_SI_REF); } } #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ \ __fpop = (td)->td_fpop; \ (td)->td_fpop = (fp); \ __retval = (code); \ (td)->td_fpop = __fpop; \ __retval; \ }) static int linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; const struct file_operations *fop; int error; ldev = dev->si_drv1; filp = linux_file_alloc(); filp->f_dentry = &filp->f_dentry_store; filp->f_op = ldev->ops; filp->f_mode = file->f_flag; filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->open != NULL) { error = -fop->open(file->f_vnode, filp); if (error != 0) { linux_drop_fop(ldev); linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } } /* hold on to the vnode - used for fstat() */ vhold(filp->f_vnode); /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); linux_drop_fop(ldev); return (ENXIO); } #define LINUX_IOCTL_MIN_PTR 0x10000UL #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) static inline int linux_remap_address(void **uaddr, size_t len) { uintptr_t uaddr_val = 
(uintptr_t)(*uaddr); if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && uaddr_val < LINUX_IOCTL_MAX_PTR)) { struct task_struct *pts = current; if (pts == NULL) { *uaddr = NULL; return (1); } /* compute data offset */ uaddr_val -= LINUX_IOCTL_MIN_PTR; /* check that length is within bounds */ if ((len > IOCPARM_MAX) || (uaddr_val + len) > pts->bsd_ioctl_len) { *uaddr = NULL; return (1); } /* re-add kernel buffer address */ uaddr_val += (uintptr_t)pts->bsd_ioctl_data; /* update address location */ *uaddr = (void *)uaddr_val; return (1); } return (0); } int linux_copyin(const void *uaddr, void *kaddr, size_t len) { if (linux_remap_address(__DECONST(void **, &uaddr), len)) { if (uaddr == NULL) return (-EFAULT); memcpy(kaddr, uaddr, len); return (0); } return (-copyin(uaddr, kaddr, len)); } int linux_copyout(const void *kaddr, void *uaddr, size_t len) { if (linux_remap_address(&uaddr, len)) { if (uaddr == NULL) return (-EFAULT); memcpy(uaddr, kaddr, len); return (0); } return (-copyout(kaddr, uaddr, len)); } size_t linux_clear_user(void *_uaddr, size_t _len) { uint8_t *uaddr = _uaddr; size_t len = _len; /* make sure uaddr is aligned before going into the fast loop */ while (((uintptr_t)uaddr & 7) != 0 && len > 7) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } /* zero 8 bytes at a time */ while (len > 7) { #ifdef __LP64__ if (suword64(uaddr, 0)) return (_len); #else if (suword32(uaddr, 0)) return (_len); if (suword32(uaddr + 4, 0)) return (_len); #endif uaddr += 8; len -= 8; } /* zero fill end, if any */ while (len > 0) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } return (0); } int linux_access_ok(const void *uaddr, size_t len) { uintptr_t saddr; uintptr_t eaddr; /* get start and end address */ saddr = (uintptr_t)uaddr; eaddr = (uintptr_t)uaddr + len; /* verify addresses are valid for userspace */ return ((saddr == eaddr) || (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); } /* * This function should return either EINTR or ERESTART depending on * the signal type sent to this thread: */ static int linux_get_error(struct task_struct *task, int error) { /* check for signal type interrupt code */ if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { error = -linux_schedule_get_interrupt_value(task); if (error == 0) error = EINTR; } return (error); } static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, const struct file_operations *fop, u_long cmd, caddr_t data, struct thread *td) { struct task_struct *task = current; unsigned size; int error; size = IOCPARM_LEN(cmd); /* refer to logic in sys_ioctl() */ if (size > 0) { /* * Setup hint for linux_copyin() and linux_copyout(). * * Background: Linux code expects a user-space address * while FreeBSD supplies a kernel-space address. 
*/ task->bsd_ioctl_data = data; task->bsd_ioctl_len = size; data = (void *)LINUX_IOCTL_MIN_PTR; } else { /* fetch user-space pointer */ data = *(void **)data; } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* try the compat IOCTL handler first */ if (fop->compat_ioctl != NULL) { error = -OPW(fp, td, fop->compat_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } /* fallback to the regular IOCTL handler, if any */ if (error == ENOTTY && fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } } else #endif { if (fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; } if (error == EWOULDBLOCK) { /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } else { error = linux_get_error(task, error); } return (error); } #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) /* * This function atomically updates the poll wakeup state and returns * the previous state at the time of update. */ static uint8_t linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) { int c, old; c = v->counter; while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) c = old; return (c); } static int linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ }; struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_QUEUED: linux_poll_wakeup(filp); return (1); default: return (0); } } void linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, }; /* check if we are called inside the select system call */ if (p == LINUX_POLL_TABLE_NORMAL) selrecord(curthread, &filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_INIT: /* NOTE: file handles can only belong to one wait-queue */ filp->f_wait_queue.wqh = wqh; filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; add_wait_queue(wqh, &filp->f_wait_queue.wq); atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); break; default: break; } } static void linux_poll_wait_dequeue(struct linux_file *filp) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT, }; seldrain(&filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_NOT_READY: case LINUX_FWQ_STATE_QUEUED: case LINUX_FWQ_STATE_READY: remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq); break; default: break; } } void linux_poll_wakeup(struct linux_file *filp) { /* this function should be 
NULL-safe */ if (filp == NULL) return; selwakeup(&filp->f_selinfo); spin_lock(&filp->f_kqlock); filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 1); spin_unlock(&filp->f_kqlock); } static void linux_file_kqfilter_detach(struct knote *kn) { struct linux_file *filp = kn->kn_hook; spin_lock(&filp->f_kqlock); knlist_remove(&filp->f_selinfo.si_note, kn, 1); spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter_read_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0); } static int linux_file_kqfilter_write_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0); } -static struct filterops linux_dev_kqfiltops_read = { +static const struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, }; -static struct filterops linux_dev_kqfiltops_write = { +static const struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, }; static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { struct thread *td; const struct file_operations *fop; struct linux_cdev *ldev; int temp; if ((filp->f_kqflags & kqflags) == 0) return; td = curthread; linux_get_fop(filp, &fop, &ldev); /* get the latest polling state */ temp = OPW(filp->_file, td, fop->poll(filp, NULL)); linux_drop_fop(ldev); spin_lock(&filp->f_kqlock); /* clear kqflags */ filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE); /* update kqflags */ if ((temp & (POLLIN | POLLOUT)) != 0) { if ((temp & POLLIN) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; if ((temp & POLLOUT) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter(struct file *file, struct knote *kn) { struct linux_file *filp; struct thread *td; int error; td = curthread; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; if (filp->f_op->poll == NULL) return (EINVAL); spin_lock(&filp->f_kqlock); switch (kn->kn_filter) { case EVFILT_READ: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ; kn->kn_fop = &linux_dev_kqfiltops_read; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; case EVFILT_WRITE: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE; kn->kn_fop = &linux_dev_kqfiltops_write; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; default: error = EINVAL; break; } spin_unlock(&filp->f_kqlock); if (error == 0) { linux_set_current(td); /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } return (error); } static int linux_file_mmap_single(struct file *fp, const struct file_operations *fop, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot, bool is_shared, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; struct mm_struct *mm; struct linux_file *filp; vm_memattr_t attr; int error; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; if (fop->mmap == NULL) return (EOPNOTSUPP); 
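	/*
	 * Editorial sketch, not part of this change: the fop->mmap hook
	 * invoked further down is expected to describe the mapping in the
	 * vm_area_struct it is handed. A hypothetical LinuxKPI consumer
	 * (the names foo_mmap and foo_vm_ops are illustrative only) would do
	 * something like:
	 *
	 *	static int
	 *	foo_mmap(struct linux_file *file, struct vm_area_struct *vma)
	 *	{
	 *		vma->vm_ops = &foo_vm_ops;	(open/close/fault)
	 *		vma->vm_private_data = file->private_data;
	 *		return (0);
	 *	}
	 *
	 * Depending on whether vm_ops->fault is set, the code below then
	 * backs the mapping with an OBJT_MGTDEVICE or OBJT_DEVICE pager
	 * object via cdev_pager_allocate().
	 */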
linux_set_current(td); /* * The same VM object might be shared by multiple processes * and the mm_struct is usually freed when a process exits. * * The atomic reference below makes sure the mm_struct is * available as long as the vmap is in the linux_vma_head. */ task = current; mm = task->mm; if (atomic_inc_not_zero(&mm->mm_users) == 0) return (EINVAL); vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); vmap->vm_start = 0; vmap->vm_end = size; vmap->vm_pgoff = *offset / PAGE_SIZE; vmap->vm_pfn = 0; vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL); if (is_shared) vmap->vm_flags |= VM_SHARED; vmap->vm_ops = NULL; vmap->vm_file = get_file(filp); vmap->vm_mm = mm; if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } if (error != 0) { linux_cdev_handle_free(vmap); return (error); } attr = pgprot2cachemode(vmap->vm_page_prot); if (vmap->vm_ops != NULL) { struct vm_area_struct *ptr; void *vm_private_data; bool vm_no_fault; if (vmap->vm_ops->open == NULL || vmap->vm_ops->close == NULL || vmap->vm_private_data == NULL) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); return (EINVAL); } vm_private_data = vmap->vm_private_data; rw_wlock(&linux_vma_lock); TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { if (ptr->vm_private_data == vm_private_data) break; } /* check if there is an existing VM area struct */ if (ptr != NULL) { /* check if the VM area structure is invalid */ if (ptr->vm_ops == NULL || ptr->vm_ops->open == NULL || ptr->vm_ops->close == NULL) { error = ESTALE; vm_no_fault = 1; } else { error = EEXIST; vm_no_fault = (ptr->vm_ops->fault == NULL); } } else { /* insert VM area structure into list */ TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); error = 0; vm_no_fault = (vmap->vm_ops->fault == NULL); } rw_wunlock(&linux_vma_lock); if (error != 0) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); /* check for stale VM area struct */ if (error != EEXIST) return (error); } /* check if there is no fault handler */ if (vm_no_fault) { *object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE, &linux_cdev_pager_ops[1], size, nprot, *offset, td->td_ucred); } else { *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, &linux_cdev_pager_ops[0], size, nprot, *offset, td->td_ucred); } /* check if allocating the VM object failed */ if (*object == NULL) { if (error == 0) { /* remove VM area struct from list */ linux_cdev_handle_remove(vmap); /* free allocated VM area struct */ linux_cdev_handle_free(vmap); } return (EINVAL); } } else { struct sglist *sg; sg = sglist_alloc(1, M_WAITOK); sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, nprot, 0, td->td_ucred); linux_cdev_handle_free(vmap); if (*object == NULL) { sglist_free(sg); return (EINVAL); } } if (attr != VM_MEMATTR_DEFAULT) { VM_OBJECT_WLOCK(*object); vm_object_set_memattr(*object, attr); VM_OBJECT_WUNLOCK(*object); } *offset = 0; return (0); } struct cdevsw linuxcdevsw = { .d_version = D_VERSION, .d_fdopen = linux_dev_fdopen, .d_name = "lkpidev", }; static int linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags 
= file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->read != NULL) { bytes = OPW(file, td, fop->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); linux_drop_fop(ldev); return (error); } static int linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->write != NULL) { bytes = OPW(file, td, fop->write(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; error = 0; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); linux_drop_fop(ldev); return (error); } static int linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->poll != NULL) { revents = OPW(file, td, fop->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; } else { revents = 0; } linux_drop_fop(ldev); return (revents); } static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; int (*release)(struct inode *, struct linux_file *); const struct file_operations *fop; struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); if (td == NULL) td = curthread; error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); linux_get_fop(filp, &fop, &ldev); /* * Always use the real release function, if any, to avoid * leaking device resources: */ release = filp->f_op->release; if (release != NULL) error = -OPW(file, td, release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); linux_drop_fop(ldev); ldev = filp->f_cdev; if (ldev != NULL) linux_cdev_deref(ldev); linux_synchronize_rcu(RCU_TYPE_REGULAR); kfree(filp); return (error); } static int linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct fiodgname_arg *fgn; const char *p; int error, i; error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { 
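	/*
	 * Editorial sketch, not part of this change: for cmds that are not
	 * handled as FreeBSD FIO* ioctls below, linux_file_ioctl_sub() stages
	 * the kernel buffer in task->bsd_ioctl_data and hands the driver the
	 * LINUX_IOCTL_MIN_PTR cookie as its "user" pointer. A hypothetical
	 * consumer (foo_ioctl and FOO_GET_INFO are illustrative only) copies
	 * through that pointer exactly as it would on Linux:
	 *
	 *	static long
	 *	foo_ioctl(struct linux_file *file, unsigned int cmd,
	 *	    unsigned long arg)
	 *	{
	 *		struct foo_info info = {0};
	 *
	 *		if (cmd != FOO_GET_INFO)
	 *			return (-ENOTTY);
	 *		if (copy_to_user((void *)arg, &info, sizeof(info)))
	 *			return (-EFAULT);
	 *		return (0);
	 *	}
	 *
	 * copy_to_user()/copy_from_user() end up in linux_copyout()/
	 * linux_copyin(), where linux_remap_address() translates the cookie
	 * back to the staged kernel buffer.
	 */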
case FIONBIO: break; case FIOASYNC: if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; case FIOGETOWN: *(int *)data = fgetown(&filp->f_sigio); break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) { error = ENXIO; break; } fgn = data; p = devtoname(filp->f_cdev->cdev); i = strlen(p) + 1; if (i > fgn->len) { error = EINVAL; break; } error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i); break; default: error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t maxprot, int flags, struct file *fp, vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings * of any kind: */ if ((maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); return (linux_file_mmap_single(fp, fop, foff, objsize, objp, (int)prot, (flags & MAP_SHARED) ? true : false, td)); } static int linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; int error; filp = (struct linux_file *)fp->f_data; vp = filp->f_vnode; if (vp == NULL) return (EOPNOTSUPP); /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE * requests rather than doing it here. 
*/ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; linux_get_fop(filp, &fop, &ldev); error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp, &foff, fop, &object); if (error != 0) goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); out: linux_drop_fop(ldev); return (error); } static int linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct linux_file *filp; struct vnode *vp; int error; filp = (struct linux_file *)fp->f_data; if (filp->f_vnode == NULL) return (EOPNOTSUPP); vp = filp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED); VOP_UNLOCK(vp); return (error); } static int linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct linux_file *filp; struct vnode *vp; int error; filp = fp->f_data; vp = filp->f_vnode; if (vp == NULL) { error = 0; kif->kf_type = KF_TYPE_DEV; } else { vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); kif->kf_type = KF_TYPE_VNODE; FILEDESC_SLOCK(fdp); } return (error); } unsigned int linux_iminor(struct inode *inode) { struct linux_cdev *ldev; if (inode == NULL || inode->v_rdev == NULL || inode->v_rdev->si_devsw != &linuxcdevsw) return (-1U); ldev = inode->v_rdev->si_drv1; if (ldev == NULL) return (-1U); return (minor(ldev->dev)); } static int linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td) { struct linux_file *filp1, *filp2; if (fp2->f_type != DTYPE_DEV) return (3); filp1 = fp1->f_data; filp2 = fp2->f_data; return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev)); } -struct fileops linuxfileops = { +const struct fileops linuxfileops = { .fo_read = linux_file_read, .fo_write = linux_file_write, .fo_truncate = invfo_truncate, .fo_kqfilter = linux_file_kqfilter, .fo_stat = linux_file_stat, .fo_fill_kinfo = linux_file_fill_kinfo, .fo_poll = linux_file_poll, .fo_close = linux_file_close, .fo_ioctl = linux_file_ioctl, .fo_mmap = linux_file_mmap, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = linux_file_kcmp, .fo_flags = DFLAG_PASSABLE, }; /* * Hash of vmmap addresses. This is infrequently accessed and does not * need to be particularly large. This is done because we must store the * caller's idea of the map size to properly unmap. 
*/ struct vmmap { LIST_ENTRY(vmmap) vm_next; void *vm_addr; unsigned long vm_size; }; struct vmmaphd { struct vmmap *lh_first; }; #define VMMAP_HASH_SIZE 64 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; static struct mtx vmmaplock; static void vmmap_add(void *addr, unsigned long size) { struct vmmap *vmmap; vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); mtx_lock(&vmmaplock); vmmap->vm_size = size; vmmap->vm_addr = addr; LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); mtx_unlock(&vmmaplock); } static struct vmmap * vmmap_remove(void *addr) { struct vmmap *vmmap; mtx_lock(&vmmaplock); LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) if (vmmap->vm_addr == addr) break; if (vmmap) LIST_REMOVE(vmmap, vm_next); mtx_unlock(&vmmaplock); return (vmmap); } #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) void * _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) { void *addr; addr = pmap_mapdev_attr(phys_addr, size, attr); if (addr == NULL) return (NULL); vmmap_add(addr, size); return (addr); } #endif void iounmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) pmap_unmapdev(addr, vmmap->vm_size); #endif kfree(vmmap); } void * vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) { vm_offset_t off; size_t size; size = count * PAGE_SIZE; off = kva_alloc(size); if (off == 0) return (NULL); vmmap_add((void *)off, size); pmap_qenter(off, pages, count); return ((void *)off); } void vunmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); kva_free((vm_offset_t)addr, vmmap->vm_size); kfree(vmmap); } static char * devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); if (dev != NULL) p = devm_kmalloc(dev, len + 1, gfp); else p = kmalloc(len + 1, gfp); if (p != NULL) vsnprintf(p, len + 1, fmt, ap); return (p); } char * kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { return (devm_kvasprintf(NULL, gfp, fmt, ap)); } char * lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return (p); } char * kasprintf(gfp_t gfp, const char *fmt, ...) 
{ va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return (p); } static void linux_timer_callback_wrapper(void *context) { struct timer_list *timer; timer = context; /* the timer is about to be shutdown permanently */ if (timer->function == NULL) return; if (linux_set_current_flags(curthread, M_NOWAIT)) { /* try again later */ callout_reset(&timer->callout, 1, &linux_timer_callback_wrapper, timer); return; } timer->function(timer->data); } int mod_timer(struct timer_list *timer, int expires) { int ret; timer->expires = expires; ret = callout_reset(&timer->callout, linux_timer_jiffies_until(expires), &linux_timer_callback_wrapper, timer); MPASS(ret == 0 || ret == 1); return (ret == 1); } void add_timer(struct timer_list *timer) { callout_reset(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer); } void add_timer_on(struct timer_list *timer, int cpu) { callout_reset_on(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer, cpu); } int del_timer(struct timer_list *timer) { if (callout_stop(&(timer)->callout) == -1) return (0); return (1); } int del_timer_sync(struct timer_list *timer) { if (callout_drain(&(timer)->callout) == -1) return (0); return (1); } int timer_delete_sync(struct timer_list *timer) { return (del_timer_sync(timer)); } int timer_shutdown_sync(struct timer_list *timer) { timer->function = NULL; return (del_timer_sync(timer)); } /* greatest common divisor, Euclid equation */ static uint64_t lkpi_gcd_64(uint64_t a, uint64_t b) { uint64_t an; uint64_t bn; while (b != 0) { an = b; bn = a % b; a = an; b = bn; } return (a); } uint64_t lkpi_nsec2hz_rem; uint64_t lkpi_nsec2hz_div = 1000000000ULL; uint64_t lkpi_nsec2hz_max; uint64_t lkpi_usec2hz_rem; uint64_t lkpi_usec2hz_div = 1000000ULL; uint64_t lkpi_usec2hz_max; uint64_t lkpi_msec2hz_rem; uint64_t lkpi_msec2hz_div = 1000ULL; uint64_t lkpi_msec2hz_max; static void linux_timer_init(void *arg) { uint64_t gcd; /* * Compute an internal HZ value which can divide 2**32 to * avoid timer rounding problems when the tick value wraps * around 2**32: */ linux_timer_hz_mask = 1; while (linux_timer_hz_mask < (unsigned long)hz) linux_timer_hz_mask *= 2; linux_timer_hz_mask--; /* compute some internal constants */ lkpi_nsec2hz_rem = hz; lkpi_usec2hz_rem = hz; lkpi_msec2hz_rem = hz; gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div); lkpi_nsec2hz_rem /= gcd; lkpi_nsec2hz_div /= gcd; lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem; gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div); lkpi_usec2hz_rem /= gcd; lkpi_usec2hz_div /= gcd; lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem; gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div); lkpi_msec2hz_rem /= gcd; lkpi_msec2hz_div /= gcd; lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem; } SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); void linux_complete_common(struct completion *c, int all) { int wakeup_swapper; sleepq_lock(c); if (all) { c->done = UINT_MAX; wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); } else { if (c->done != UINT_MAX) c->done++; wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); } sleepq_release(c); if (wakeup_swapper) kick_proc0(); } /* * Indefinite wait for done != 0 with or without signals. 
*/ int linux_wait_for_common(struct completion *c, int flags) { struct task_struct *task; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; error = 0; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); if (flags & SLEEPQ_INTERRUPTIBLE) { DROP_GIANT(); error = -sleepq_wait_sig(c, 0); PICKUP_GIANT(); if (error != 0) { linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; goto intr; } } else { DROP_GIANT(); sleepq_wait(c, 0); PICKUP_GIANT(); } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); intr: return (error); } /* * Time limited wait for done != 0 with or without signals. */ int linux_wait_for_timeout_common(struct completion *c, int timeout, int flags) { struct task_struct *task; int end = jiffies + timeout; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); sleepq_set_timeout(c, linux_timer_jiffies_until(end)); DROP_GIANT(); if (flags & SLEEPQ_INTERRUPTIBLE) error = -sleepq_timedwait_sig(c, 0); else error = -sleepq_timedwait(c, 0); PICKUP_GIANT(); if (error != 0) { /* check for timeout */ if (error == -EWOULDBLOCK) { error = 0; /* timeout */ } else { /* signal happened */ linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; } goto done; } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); /* return how many jiffies are left */ error = linux_timer_jiffies_until(end); done: return (error); } int linux_try_wait_for_completion(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); if (c->done != 0 && c->done != UINT_MAX) c->done--; sleepq_release(c); return (isdone); } int linux_completion_done(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); sleepq_release(c); return (isdone); } static void linux_cdev_deref(struct linux_cdev *ldev) { if (refcount_release(&ldev->refs) && ldev->kobj.ktype == &linux_cdev_ktype) kfree(ldev); } static void linux_cdev_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); linux_cdev_deref(cdev); kobject_put(parent); } static void linux_cdev_static_release(struct kobject *kobj) { struct cdev *cdev; struct linux_cdev *ldev; ldev = container_of(kobj, struct linux_cdev, kobj); cdev = ldev->cdev; if (cdev != NULL) { destroy_dev(cdev); ldev->cdev = NULL; } kobject_put(kobj->parent); } int linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev) { int ret; if (dev->devt != 0) { /* Set parent kernel object. 
*/ ldev->kobj.parent = &dev->kobj; /* * Unlike Linux we require the kobject of the * character device structure to have a valid name * before calling this function: */ if (ldev->kobj.name == NULL) return (-EINVAL); ret = cdev_add(ldev, dev->devt, 1); if (ret) return (ret); } ret = device_add(dev); if (ret != 0 && dev->devt != 0) cdev_del(ldev); return (ret); } void linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev) { device_del(dev); if (dev->devt != 0) cdev_del(ldev); } static void linux_destroy_dev(struct linux_cdev *ldev) { if (ldev->cdev == NULL) return; MPASS((ldev->siref & LDEV_SI_DTR) == 0); MPASS(ldev->kobj.ktype == &linux_cdev_ktype); atomic_set_int(&ldev->siref, LDEV_SI_DTR); while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) pause("ldevdtr", hz / 4); destroy_dev(ldev->cdev); ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { .release = linux_cdev_release, }; const struct kobj_type linux_cdev_static_ktype = { .release = linux_cdev_static_release, }; static void linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; if (linkstate == LINK_STATE_UP) nb->notifier_call(nb, NETDEV_UP, &ni); else nb->notifier_call(nb, NETDEV_DOWN, &ni); } static void linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_REGISTER, &ni); } static void linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_UNREGISTER, &ni); } static void linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni); } static void linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni); } int register_netdevice_notifier(struct notifier_block *nb) { nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( iflladdr_event, linux_handle_iflladdr_event, nb, 0); return (0); } int register_inetaddr_notifier(struct notifier_block *nb) { nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( ifaddr_event, linux_handle_ifaddr_event, nb, 0); return (0); } int unregister_netdevice_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nb->tags[NETDEV_UNREGISTER]); EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]); return (0); } int unregister_inetaddr_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]); return (0); } struct list_sort_thunk { int (*cmp)(void *, 
struct list_head *, struct list_head *); void *priv; }; static inline int linux_le_cmp(const void *d1, const void *d2, void *priv) { struct list_head *le1, *le2; struct list_sort_thunk *thunk; thunk = priv; le1 = *(__DECONST(struct list_head **, d1)); le2 = *(__DECONST(struct list_head **, d2)); return ((thunk->cmp)(thunk->priv, le1, le2)); } void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { struct list_sort_thunk thunk; struct list_head **ar, *le; size_t count, i; count = 0; list_for_each(le, head) count++; ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); i = 0; list_for_each(le, head) ar[i++] = le; thunk.cmp = cmp; thunk.priv = priv; qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk); INIT_LIST_HEAD(head); for (i = 0; i < count; i++) list_add_tail(ar[i], head); free(ar, M_KMALLOC); } #if defined(__i386__) || defined(__amd64__) int linux_wbinvd_on_all_cpus(void) { pmap_invalidate_cache(); return (0); } #endif int linux_on_each_cpu(void callback(void *), void *data) { smp_rendezvous(smp_no_rendezvous_barrier, callback, smp_no_rendezvous_barrier, data); return (0); } int linux_in_atomic(void) { return ((curthread->td_pflags & TDP_NOFAULTING) != 0); } struct linux_cdev * linux_find_cdev(const char *name, unsigned major, unsigned minor) { dev_t dev = MKDEV(major, minor); struct cdev *cdev; dev_lock(); LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { struct linux_cdev *ldev = cdev->si_drv1; if (ldev->dev == dev && strcmp(kobject_name(&ldev->kobj), name) == 0) { break; } } dev_unlock(); return (cdev != NULL ? cdev->si_drv1 : NULL); } int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add(cdev, makedev(major, i), 1); if (ret != 0) break; } return (ret); } int __register_chrdev_p(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); if (ret != 0) break; } return (ret); } void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct linux_cdev *cdevp; int i; for (i = baseminor; i < baseminor + count; i++) { cdevp = linux_find_cdev(name, major, i); if (cdevp != NULL) cdev_del(cdevp); } } void linux_dump_stack(void) { #ifdef STACK struct stack st; stack_save(&st); stack_print(&st); #endif } int linuxkpi_net_ratelimit(void) { return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps, lkpi_net_maxpps)); } struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *mapping; mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); if (mapping == NULL) return (NULL); return (io_mapping_init_wc(mapping, base, size)); } /* We likely want a linuxkpi_device.c at some point. */ bool device_can_wakeup(struct device *dev) { if (dev == NULL) return (false); /* * XXX-BZ iwlwifi queries it as part of enabling WoWLAN. * Normally this would be based on a bool in dev->power.XXX. * Check such as PCI PCIM_PCAP_*PME. 
We have no way to enable this yet. * We may get away by directly calling into bsddev for as long as * we can assume PCI only avoiding changing struct device breaking KBI. */ pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__); return (false); } static void devm_device_group_remove(struct device *dev, void *p) { const struct attribute_group **dr = p; const struct attribute_group *group = *dr; sysfs_remove_group(&dev->kobj, group); } int lkpi_devm_device_add_group(struct device *dev, const struct attribute_group *group) { const struct attribute_group **dr; int ret; dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL); if (dr == NULL) return (-ENOMEM); ret = sysfs_create_group(&dev->kobj, group); if (ret == 0) { *dr = group; devres_add(dev, dr); } else devres_free(dr); return (ret); } #if defined(__i386__) || defined(__amd64__) bool linux_cpu_has_clflush; struct cpuinfo_x86 boot_cpu_data; struct cpuinfo_x86 *__cpu_data; #endif cpumask_t * lkpi_get_static_single_cpu_mask(int cpuid) { KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n", __func__, cpuid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n", __func__, cpuid)); return (static_single_cpu_mask[cpuid]); } bool lkpi_xen_initial_domain(void) { #ifdef XENHVM return (xen_initial_domain()); #else return (false); #endif } bool lkpi_xen_pv_domain(void) { #ifdef XENHVM return (xen_pv_domain()); #else return (false); #endif } static void linux_compat_init(void *arg) { struct sysctl_oid *rootoid; int i; #if defined(__i386__) || defined(__amd64__) static const uint32_t x86_vendors[X86_VENDOR_NUM] = { [X86_VENDOR_INTEL] = CPU_VENDOR_INTEL, [X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX, [X86_VENDOR_AMD] = CPU_VENDOR_AMD, [X86_VENDOR_UMC] = CPU_VENDOR_UMC, [X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR, [X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA, [X86_VENDOR_NSC] = CPU_VENDOR_NSC, [X86_VENDOR_HYGON] = CPU_VENDOR_HYGON, }; uint8_t x86_vendor = X86_VENDOR_UNKNOWN; for (i = 0; i < X86_VENDOR_NUM; i++) { if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) { x86_vendor = i; break; } } linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); boot_cpu_data.x86_clflush_size = cpu_clflush_line_size; boot_cpu_data.x86_max_cores = mp_ncpus; boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id); boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id); boot_cpu_data.x86_vendor = x86_vendor; __cpu_data = mallocarray(mp_maxid + 1, sizeof(*__cpu_data), M_KMALLOC, M_WAITOK | M_ZERO); CPU_FOREACH(i) { __cpu_data[i].x86_clflush_size = cpu_clflush_line_size; __cpu_data[i].x86_max_cores = mp_ncpus; __cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id); __cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id); __cpu_data[i].x86_vendor = x86_vendor; } #endif rw_init(&linux_vma_lock, "lkpi-vma-lock"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); kobject_init(&linux_class_root, &linux_class_ktype); kobject_set_name(&linux_class_root, "class"); linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); kobject_init(&linux_root_device.kobj, &linux_dev_ktype); kobject_set_name(&linux_root_device.kobj, "device"); linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device"); linux_root_device.bsddev = root_bus; linux_class_misc.name = "misc"; class_register(&linux_class_misc); INIT_LIST_HEAD(&pci_drivers); INIT_LIST_HEAD(&pci_devices); 
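	/*
	 * Editorial note, not part of this change: rough memory math for the
	 * single-CPU mask setup further below, assuming 64-bit words
	 * (_BITSET_BITS == 64) and CPU_SETSIZE == 256, i.e.
	 * __bitset_words(CPU_SETSIZE) == 4 (the CPU_SETSIZE value is an
	 * assumption for illustration):
	 *  - plain scheme: one cpuset_t per CPU, 4 * 8 = 32 bytes each,
	 *    so 256 CPUs would need 8192 bytes;
	 *  - overlapping scheme: 64 spans of (2 * 4 - 1) * 8 = 56 bytes,
	 *    3584 bytes total regardless of the CPU count.
	 * The crossover coded below is mp_ncpus >= 2 * _BITSET_BITS.
	 */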
spin_lock_init(&pci_lock); mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); for (i = 0; i < VMMAP_HASH_SIZE; i++) LIST_INIT(&vmmaphead[i]); init_waitqueue_head(&linux_bit_waitq); init_waitqueue_head(&linux_var_waitq); CPU_COPY(&all_cpus, &cpu_online_mask); /* * Generate a single-CPU cpumask_t for each CPU (possibly) in the system. * CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only * have itself in the cpumask, cpuid 1 only itself on entry 1, and so on. * This is used by cpumask_of() (and possibly others in the future) for, * e.g., drivers to pass hints to irq_set_affinity_hint(). */ static_single_cpu_mask = mallocarray(mp_maxid + 1, sizeof(static_single_cpu_mask), M_KMALLOC, M_WAITOK | M_ZERO); /* * When the number of CPUs reaches a threshold, we start to save memory * given the sets are static by overlapping those having their single * bit set at the same position in a bitset word. Asymptotically, this * regular scheme is in O(n²) whereas the overlapping one is in O(n) * only with n being the maximum number of CPUs, so the gain will become * huge quite quickly. The threshold for 64-bit architectures is 128 * CPUs. */ if (mp_ncpus < (2 * _BITSET_BITS)) { cpumask_t *sscm_ptr; /* * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) * * (_BITSET_BITS / 8)' bytes (for comparison with the * overlapping scheme). */ static_single_cpu_mask_lcs = mallocarray(mp_ncpus, sizeof(*static_single_cpu_mask_lcs), M_KMALLOC, M_WAITOK | M_ZERO); sscm_ptr = static_single_cpu_mask_lcs; CPU_FOREACH(i) { static_single_cpu_mask[i] = sscm_ptr++; CPU_SET(i, static_single_cpu_mask[i]); } } else { /* Pointer to a bitset word. */ __typeof(((cpuset_t *)NULL)->__bits[0]) *bwp; /* * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t' * really) with a single bit set that can be reused for all * single CPU masks by making them start at different offsets. * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before * the word having its single bit set, and the same amount * after. */ static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS, (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8), M_KMALLOC, M_WAITOK | M_ZERO); /* * We rely below on cpuset_t and the bitset generic * implementation assigning words in the '__bits' array in the * same order of bits (i.e., little-endian ordering, not to be * confused with machine endianness, which concerns bits in * words and other integers). This is an imperfect test, but it * will detect a change to big-endian ordering. */ _Static_assert( __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1, "Assumes a bitset implementation that is little-endian " "on its words"); /* Initialize the single bit of each static span. */ bwp = (__typeof(bwp))static_single_cpu_mask_lcs + (__bitset_words(CPU_SETSIZE) - 1); for (i = 0; i < _BITSET_BITS; i++) { CPU_SET(i, (cpuset_t *)bwp); bwp += (2 * __bitset_words(CPU_SETSIZE) - 1); } /* * Finally set all CPU masks to the proper word in their * relevant span. */ CPU_FOREACH(i) { bwp = (__typeof(bwp))static_single_cpu_mask_lcs; /* Find the non-zero word of the relevant span. */ bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) * (i % _BITSET_BITS) + __bitset_words(CPU_SETSIZE) - 1; /* Shift to find the CPU mask start. 
*/ bwp -= (i / _BITSET_BITS); static_single_cpu_mask[i] = (cpuset_t *)bwp; } } strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release)); } SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); static void linux_compat_uninit(void *arg) { linux_kobject_kfree_name(&linux_class_root); linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); free(static_single_cpu_mask_lcs, M_KMALLOC); free(static_single_cpu_mask, M_KMALLOC); #if defined(__i386__) || defined(__amd64__) free(__cpu_data, M_KMALLOC); #endif mtx_destroy(&vmmaplock); spin_lock_destroy(&pci_lock); rw_destroy(&linux_vma_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); /* * NOTE: Linux frequently uses "unsigned long" for pointer to integer * conversion and vice versa, where in FreeBSD "uintptr_t" would be * used. Assert these types have the same size, else some parts of the * LinuxKPI may not work like expected: */ CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); diff --git a/sys/dev/beri/beri_ring.c b/sys/dev/beri/beri_ring.c index 6d48a411da38..eed002a3dfd6 100644 --- a/sys/dev/beri/beri_ring.c +++ b/sys/dev/beri/beri_ring.c @@ -1,525 +1,525 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * SRI-Cambridge BERI soft processor <-> ARM core ring buffer. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define READ4(_sc, _reg) \ bus_read_4((_sc)->res[0], _reg) #define WRITE4(_sc, _reg, _val) \ bus_write_4((_sc)->res[0], _reg, _val) #define CDES_INT_EN (1 << 15) #define CDES_CAUSE_MASK 0x3 #define CDES_CAUSE_SHIFT 13 #define DEVNAME_MAXLEN 256 typedef struct { uint16_t cdes; uint16_t interrupt_level; uint16_t in; uint16_t out; } control_reg_t; struct beri_softc { struct resource *res[3]; bus_space_tag_t bst; bus_space_handle_t bsh; struct cdev *cdev; device_t dev; void *read_ih; void *write_ih; struct selinfo beri_rsel; struct mtx beri_mtx; int opened; char devname[DEVNAME_MAXLEN]; int control_read; int control_write; int data_read; int data_write; int data_size; }; static struct resource_spec beri_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { SYS_RES_IRQ, 0, RF_ACTIVE }, { SYS_RES_IRQ, 1, RF_ACTIVE }, { -1, 0 } }; static control_reg_t get_control_reg(struct beri_softc *sc, int dir) { uint32_t offset; uint16_t dst[4]; control_reg_t c; uint16_t *cp; int i; cp = (uint16_t *)&c; offset = dir ? sc->control_write : sc->control_read; ((uint32_t *)dst)[0] = READ4(sc, offset); ((uint32_t *)dst)[1] = READ4(sc, offset + 4); for (i = 0; i < 4; i++) cp[i] = dst[3 - i]; return (c); } static void set_control_reg(struct beri_softc *sc, int dir, control_reg_t *c) { uint32_t offset; uint16_t src[4]; uint16_t *cp; int i; cp = (uint16_t *)c; for (i = 0; i < 4; i++) src[3 - i] = cp[i]; offset = dir ? sc->control_write : sc->control_read; WRITE4(sc, offset + 0, ((uint32_t *)src)[0]); WRITE4(sc, offset + 4, ((uint32_t *)src)[1]); } static int get_stock(struct beri_softc *sc, int dir, control_reg_t *c) { uint32_t fill; fill = (c->in - c->out + sc->data_size) % sc->data_size; if (dir) return (sc->data_size - fill - 1); else return (fill); } static void beri_intr_write(void *arg) { struct beri_softc *sc; control_reg_t c; sc = arg; c = get_control_reg(sc, 1); if (c.cdes & CDES_INT_EN) { c.cdes &= ~(CDES_INT_EN); set_control_reg(sc, 1, &c); } mtx_lock(&sc->beri_mtx); selwakeuppri(&sc->beri_rsel, PZERO + 1); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } static void beri_intr_read(void *arg) { struct beri_softc *sc; control_reg_t c; sc = arg; c = get_control_reg(sc, 0); if (c.cdes & CDES_INT_EN) { c.cdes &= ~(CDES_INT_EN); set_control_reg(sc, 0, &c); } mtx_lock(&sc->beri_mtx); selwakeuppri(&sc->beri_rsel, PZERO + 1); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } static int beri_open(struct cdev *dev, int flags __unused, int fmt __unused, struct thread *td __unused) { struct beri_softc *sc; control_reg_t c; sc = dev->si_drv1; if (sc->opened) return (1); /* Setup interrupt handlers */ if (bus_setup_intr(sc->dev, sc->res[1], INTR_TYPE_BIO | INTR_MPSAFE, NULL, beri_intr_read, sc, &sc->read_ih)) { device_printf(sc->dev, "Unable to setup read intr\n"); return (1); } if (bus_setup_intr(sc->dev, sc->res[2], INTR_TYPE_BIO | INTR_MPSAFE, NULL, beri_intr_write, sc, &sc->write_ih)) { device_printf(sc->dev, "Unable to setup write intr\n"); return (1); } sc->opened = 1; /* Clear write buffer */ c = get_control_reg(sc, 1); c.in = c.out; c.cdes = 0; set_control_reg(sc, 1, &c); /* Clear read buffer */ c = get_control_reg(sc, 0); c.out = c.in; c.cdes = 0; set_control_reg(sc, 0, &c); return (0); } static int beri_close(struct cdev *dev, int flags __unused, int fmt 
__unused, struct thread *td __unused) { struct beri_softc *sc; sc = dev->si_drv1; if (sc->opened) { sc->opened = 0; /* Unsetup interrupt handlers */ bus_teardown_intr(sc->dev, sc->res[1], sc->read_ih); bus_teardown_intr(sc->dev, sc->res[2], sc->write_ih); } return (0); } static int beri_rdwr(struct cdev *dev, struct uio *uio, int ioflag) { struct beri_softc *sc; uint32_t offset; control_reg_t c; uint16_t *ptr; uint8_t *dst; int stock; int dir; int amount; int count; sc = dev->si_drv1; dir = uio->uio_rw ? 1 : 0; c = get_control_reg(sc, dir); stock = get_stock(sc, dir, &c); if (stock < uio->uio_resid) { device_printf(sc->dev, "Err: no data/space available\n"); return (1); } amount = uio->uio_resid; ptr = dir ? &c.in : &c.out; count = (sc->data_size - *ptr); offset = dir ? sc->data_write : sc->data_read; dst = (uint8_t *)(sc->bsh + offset); if (amount <= count) { uiomove(dst + *ptr, amount, uio); } else { uiomove(dst + *ptr, count, uio); uiomove(dst, (amount - count), uio); } *ptr = (*ptr + amount) % sc->data_size; set_control_reg(sc, dir, &c); return (0); } static int beri_kqread(struct knote *kn, long hint) { struct beri_softc *sc; control_reg_t c; int stock; sc = kn->kn_hook; c = get_control_reg(sc, 0); stock = get_stock(sc, 0, &c); if (stock) { kn->kn_data = stock; return (1); } kn->kn_data = 0; /* Wait at least one new byte in buffer */ c.interrupt_level = 1; /* Enable interrupts */ c.cdes |= (CDES_INT_EN); set_control_reg(sc, 0, &c); return (0); } static int beri_kqwrite(struct knote *kn, long hint) { struct beri_softc *sc; control_reg_t c; int stock; sc = kn->kn_hook; c = get_control_reg(sc, 1); stock = get_stock(sc, 1, &c); if (stock) { kn->kn_data = stock; return (1); } kn->kn_data = 0; /* Wait at least one free position in buffer */ c.interrupt_level = sc->data_size - 2; /* Enable interrupts */ c.cdes |= (CDES_INT_EN); set_control_reg(sc, 1, &c); return (0); } static void beri_kqdetach(struct knote *kn) { struct beri_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->beri_rsel.si_note, kn, 0); } -static struct filterops beri_read_filterops = { +static const struct filterops beri_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = beri_kqdetach, .f_event = beri_kqread, }; -static struct filterops beri_write_filterops = { +static const struct filterops beri_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = beri_kqdetach, .f_event = beri_kqwrite, }; static int beri_kqfilter(struct cdev *dev, struct knote *kn) { struct beri_softc *sc; sc = dev->si_drv1; switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &beri_read_filterops; break; case EVFILT_WRITE: kn->kn_fop = &beri_write_filterops; break; default: return(EINVAL); } kn->kn_hook = sc; knlist_add(&sc->beri_rsel.si_note, kn, 0); return (0); } static struct cdevsw beri_cdevsw = { .d_version = D_VERSION, .d_open = beri_open, .d_close = beri_close, .d_write = beri_rdwr, .d_read = beri_rdwr, .d_kqfilter = beri_kqfilter, .d_name = "beri ring buffer", }; static int parse_fdt(struct beri_softc *sc) { pcell_t dts_value[0]; phandle_t node; int len; if ((node = ofw_bus_get_node(sc->dev)) == -1) return (ENXIO); /* get device name */ if (OF_getprop(ofw_bus_get_node(sc->dev), "device_name", &sc->devname, sizeof(sc->devname)) <= 0) { device_printf(sc->dev, "Can't get device_name\n"); return (ENXIO); } if ((len = OF_getproplen(node, "data_size")) <= 0) return (ENXIO); OF_getencprop(node, "data_size", dts_value, len); sc->data_size = dts_value[0]; if ((len = OF_getproplen(node, "data_read")) <= 0) return (ENXIO); 
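The ring accounting used by get_stock() and beri_rdwr() above boils down to a couple of lines of modular arithmetic; a minimal standalone sketch (hypothetical buffer size, illustrative only):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
        const unsigned int size = 8;    /* made-up ring size */
        unsigned int in = 6, out = 3;   /* example indices */
        unsigned int fill, space;

        /* Bytes available to the reader. */
        fill = (in - out + size) % size;
        /* Bytes the writer may add; one slot stays free. */
        space = size - fill - 1;
        assert(fill == 3 && space == 4);
        printf("fill=%u space=%u\n", fill, space);
        return (0);
}

Keeping one slot permanently unused is what lets the driver treat "in == out" as unambiguously empty without a separate element counter.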
OF_getencprop(node, "data_read", dts_value, len); sc->data_read = dts_value[0]; if ((len = OF_getproplen(node, "data_write")) <= 0) return (ENXIO); OF_getencprop(node, "data_write", dts_value, len); sc->data_write = dts_value[0]; if ((len = OF_getproplen(node, "control_read")) <= 0) return (ENXIO); OF_getencprop(node, "control_read", dts_value, len); sc->control_read = dts_value[0]; if ((len = OF_getproplen(node, "control_write")) <= 0) return (ENXIO); OF_getencprop(node, "control_write", dts_value, len); sc->control_write = dts_value[0]; return (0); } static int beri_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "sri-cambridge,beri-ring")) return (ENXIO); device_set_desc(dev, "SRI-Cambridge BERI ring buffer"); return (BUS_PROBE_DEFAULT); } static int beri_attach(device_t dev) { struct beri_softc *sc; sc = device_get_softc(dev); sc->dev = dev; if (bus_alloc_resources(dev, beri_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } /* Memory interface */ sc->bst = rman_get_bustag(sc->res[0]); sc->bsh = rman_get_bushandle(sc->res[0]); if (parse_fdt(sc)) { device_printf(sc->dev, "Can't get FDT values\n"); return (ENXIO); } sc->cdev = make_dev(&beri_cdevsw, 0, UID_ROOT, GID_WHEEL, S_IRWXU, "%s", sc->devname); if (sc->cdev == NULL) { device_printf(dev, "Failed to create character device.\n"); return (ENXIO); } sc->cdev->si_drv1 = sc; mtx_init(&sc->beri_mtx, "beri_mtx", NULL, MTX_DEF); knlist_init_mtx(&sc->beri_rsel.si_note, &sc->beri_mtx); return (0); } static device_method_t beri_methods[] = { DEVMETHOD(device_probe, beri_probe), DEVMETHOD(device_attach, beri_attach), { 0, 0 } }; static driver_t beri_driver = { "beri_ring", beri_methods, sizeof(struct beri_softc), }; DRIVER_MODULE(beri_ring, simplebus, beri_driver, 0, 0); diff --git a/sys/dev/cyapa/cyapa.c b/sys/dev/cyapa/cyapa.c index 307cd4d35b2e..50fa4faa560a 100644 --- a/sys/dev/cyapa/cyapa.c +++ b/sys/dev/cyapa/cyapa.c @@ -1,1818 +1,1818 @@ /* * Copyright (c) 2014 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon and was subsequently ported, * modified and enhanced for FreeBSD by Michael Gmelin . * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of The DragonFly Project nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific, prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * CYAPA - Cypress APA trackpad with I2C Interface driver * * Based on DragonFlyBSD's cyapa driver, which referenced the linux * cyapa.c driver to figure out the bootstrapping and commands. * * Unable to locate any datasheet for the device. * * * Trackpad layout: * * 2/3 1/3 * +--------------------+------------+ * | | Middle | * | | Button | * | Left | | * | Button +------------+ * | | Right | * | | Button | * +--------------------+............| * | Thumb/Button Area | 15% * +---------------------------------+ * * * FEATURES * * IMPS/2 emulation - Emulates the IntelliMouse protocol. * * Jitter supression - Implements 2-pixel hysteresis with memory. * * Jump detecion - Detect jumps caused by touchpad. * * Two finger scrolling - Use two fingers for Z axis scrolling. * * Button down/2nd finger - While one finger clicks and holds down the * touchpad, the second one can be used to move * the mouse cursor. Useful for drawing or * selecting text. * * Thumb/Button Area - The lower 15%* of the trackpad will not affect * the mouse cursor position. This allows for high * precision clicking, by controlling the cursor * with the index finger and pushing/holding the * pad down with the thumb. * * can be changed using sysctl * * Track-pad button - Push physical button. Left 2/3rds of the pad * will issue a LEFT button event, upper right * corner will issue a MIDDLE button event, * lower right corner will issue a RIGHT button * event. Optional tap support can be enabled * and configured using sysctl. * * WARNINGS * * These trackpads get confused when three or more fingers are down on the * same horizontal axis and will start to glitch the finger detection. * Removing your hand for a few seconds will allow the trackpad to * recalibrate. Generally speaking, when using three or more fingers * please try to place at least one finger off-axis (a little above or * below) the other two. 
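The button-zone split sketched in the layout above can be made concrete with a tiny standalone example (hypothetical resolution, illustrative only; it mirrors the single-finger click case handled later in cyapa_raw_input(), and does not model the thumb area):

#include <assert.h>

enum zone { ZONE_LEFT, ZONE_MIDDLE, ZONE_RIGHT };

/* Left 2/3rds -> left button, upper right -> middle, lower right -> right. */
static enum zone
click_zone(int x, int y, int resx, int resy)
{
        if (x < resx * 2 / 3)
                return (ZONE_LEFT);
        else if (y < resy / 2)
                return (ZONE_MIDDLE);
        else
                return (ZONE_RIGHT);
}

int
main(void)
{
        const int resx = 1200, resy = 600;      /* made-up resolution */

        assert(click_zone(100, 300, resx, resy) == ZONE_LEFT);
        assert(click_zone(1000, 100, resx, resy) == ZONE_MIDDLE);
        assert(click_zone(1000, 500, resx, resy) == ZONE_RIGHT);
        return (0);
}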
*/ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EVDEV_SUPPORT #include #include #endif #include "iicbus_if.h" #include "bus_if.h" #include "device_if.h" #define CYAPA_BUFSIZE 128 /* power of 2 */ #define CYAPA_BUFMASK (CYAPA_BUFSIZE - 1) #define ZSCALE 15 #define TIME_TO_IDLE (hz * 10) #define TIME_TO_RESET (hz * 3) static MALLOC_DEFINE(M_CYAPA, "cyapa", "CYAPA device data"); struct cyapa_fifo { int rindex; int windex; char buf[CYAPA_BUFSIZE]; }; struct cyapa_softc { device_t dev; int count; /* >0 if device opened */ struct cdev *devnode; struct selinfo selinfo; struct mtx mutex; struct intr_config_hook intr_hook; #ifdef EVDEV_SUPPORT struct evdev_dev *evdev; #endif int cap_resx; int cap_resy; int cap_phyx; int cap_phyy; uint8_t cap_buttons; int detaching; /* driver is detaching */ int poll_thread_running; /* poll thread is running */ /* PS/2 mouse emulation */ int track_x; /* current tracking */ int track_y; int track_z; int track_z_ticks; uint16_t track_but; char track_id; /* first finger id */ int track_nfingers; int delta_x; /* accumulation -> report */ int delta_y; int delta_z; int fuzz_x; int fuzz_y; int fuzz_z; int touch_x; /* touch down coordinates */ int touch_y; int touch_z; int finger1_ticks; int finger2_ticks; int finger3_ticks; uint16_t reported_but; struct cyapa_fifo rfifo; /* device->host */ struct cyapa_fifo wfifo; /* host->device */ uint8_t ps2_cmd; /* active p2_cmd waiting for data */ uint8_t ps2_acked; int active_tick; int data_signal; int blocked; int isselect; int reporting_mode; /* 0=disabled 1=enabled */ int scaling_mode; /* 0=1:1 1=2:1 */ int remote_mode; /* 0 for streaming mode */ int zenabled; /* z-axis enabled (mode 1 or 2) */ mousehw_t hw; /* hardware information */ mousemode_t mode; /* mode */ int poll_ticks; }; struct cyapa_cdevpriv { struct cyapa_softc *sc; }; #define CYPOLL_SHUTDOWN 0x0001 static void cyapa_poll_thread(void *arg); static int cyapa_raw_input(struct cyapa_softc *sc, struct cyapa_regs *regs, int freq); static void cyapa_set_power_mode(struct cyapa_softc *sc, int mode); static int fifo_empty(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static size_t fifo_ready(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static char *fifo_read(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n); static char *fifo_write(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n); static uint8_t fifo_read_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static void fifo_write_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo, uint8_t c); static size_t fifo_space(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static void fifo_reset(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static int cyapa_fuzz(int delta, int *fuzz); static int cyapa_idle_freq = 1; SYSCTL_INT(_debug, OID_AUTO, cyapa_idle_freq, CTLFLAG_RW, &cyapa_idle_freq, 0, "Scan frequency in idle mode"); static int cyapa_slow_freq = 20; SYSCTL_INT(_debug, OID_AUTO, cyapa_slow_freq, CTLFLAG_RW, &cyapa_slow_freq, 0, "Scan frequency in slow mode "); static int cyapa_norm_freq = 100; SYSCTL_INT(_debug, OID_AUTO, cyapa_norm_freq, CTLFLAG_RW, &cyapa_norm_freq, 0, "Normal scan frequency"); static int cyapa_minpressure = 12; SYSCTL_INT(_debug, OID_AUTO, cyapa_minpressure, CTLFLAG_RW, &cyapa_minpressure, 0, "Minimum pressure to detect finger"); static int cyapa_enable_tapclick = 
0; SYSCTL_INT(_debug, OID_AUTO, cyapa_enable_tapclick, CTLFLAG_RW, &cyapa_enable_tapclick, 0, "Enable tap to click"); static int cyapa_tapclick_min_ticks = 1; SYSCTL_INT(_debug, OID_AUTO, cyapa_tapclick_min_ticks, CTLFLAG_RW, &cyapa_tapclick_min_ticks, 0, "Minimum tap duration for click"); static int cyapa_tapclick_max_ticks = 8; SYSCTL_INT(_debug, OID_AUTO, cyapa_tapclick_max_ticks, CTLFLAG_RW, &cyapa_tapclick_max_ticks, 0, "Maximum tap duration for click"); static int cyapa_move_min_ticks = 4; SYSCTL_INT(_debug, OID_AUTO, cyapa_move_min_ticks, CTLFLAG_RW, &cyapa_move_min_ticks, 0, "Minimum ticks before cursor position is changed"); static int cyapa_scroll_wait_ticks = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_scroll_wait_ticks, CTLFLAG_RW, &cyapa_scroll_wait_ticks, 0, "Wait N ticks before starting to scroll"); static int cyapa_scroll_stick_ticks = 15; SYSCTL_INT(_debug, OID_AUTO, cyapa_scroll_stick_ticks, CTLFLAG_RW, &cyapa_scroll_stick_ticks, 0, "Prevent cursor move on single finger for N ticks after scroll"); static int cyapa_thumbarea_percent = 15; SYSCTL_INT(_debug, OID_AUTO, cyapa_thumbarea_percent, CTLFLAG_RW, &cyapa_thumbarea_percent, 0, "Size of bottom thumb area in percent"); static int cyapa_debug = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_debug, CTLFLAG_RW, &cyapa_debug, 0, "Enable debugging"); static int cyapa_reset = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_reset, CTLFLAG_RW, &cyapa_reset, 0, "Reset track pad"); static int cyapa_read_bytes(device_t dev, uint8_t reg, uint8_t *val, int cnt) { uint16_t addr = iicbus_get_addr(dev); struct iic_msg msgs[] = { { addr, IIC_M_WR | IIC_M_NOSTOP, 1, ® }, { addr, IIC_M_RD, cnt, val }, }; return (iicbus_transfer(dev, msgs, nitems(msgs))); } static int cyapa_write_bytes(device_t dev, uint8_t reg, const uint8_t *val, int cnt) { uint16_t addr = iicbus_get_addr(dev); struct iic_msg msgs[] = { { addr, IIC_M_WR | IIC_M_NOSTOP, 1, ® }, { addr, IIC_M_WR | IIC_M_NOSTART, cnt, __DECONST(uint8_t *, val) }, }; return (iicbus_transfer(dev, msgs, nitems(msgs))); } static void cyapa_lock(struct cyapa_softc *sc) { mtx_lock(&sc->mutex); } static void cyapa_unlock(struct cyapa_softc *sc) { mtx_unlock(&sc->mutex); } #define CYAPA_LOCK_ASSERT(sc) mtx_assert(&(sc)->mutex, MA_OWNED); /* * Notify if possible receive data ready. Must be called * with sc->mutex held (cyapa_lock(sc)). */ static void cyapa_notify(struct cyapa_softc *sc) { CYAPA_LOCK_ASSERT(sc); if (sc->data_signal || !fifo_empty(sc, &sc->rfifo)) { KNOTE_LOCKED(&sc->selinfo.si_note, 0); if (sc->blocked || sc->isselect) { if (sc->blocked) { sc->blocked = 0; wakeup(&sc->blocked); } if (sc->isselect) { sc->isselect = 0; selwakeup(&sc->selinfo); } } } } /* * Initialize the device */ static int init_device(device_t dev, struct cyapa_cap *cap, int probe) { static char bl_exit[] = { 0x00, 0xff, 0xa5, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; static char bl_deactivate[] = { 0x00, 0xff, 0x3b, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; struct cyapa_boot_regs boot; int error; int retries; /* Get status */ error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (error) goto done; /* * Bootstrap the device if necessary. It can take up to 2 seconds * for the device to fully initialize. */ retries = 20; while ((boot.stat & CYAPA_STAT_RUNNING) == 0 && retries > 0) { if (boot.boot & CYAPA_BOOT_BUSY) { /* Busy, wait loop. 
*/ } else if (boot.error & CYAPA_ERROR_BOOTLOADER) { /* Magic */ error = cyapa_write_bytes(dev, CMD_BOOT_STATUS, bl_deactivate, sizeof(bl_deactivate)); if (error) goto done; } else { /* Magic */ error = cyapa_write_bytes(dev, CMD_BOOT_STATUS, bl_exit, sizeof(bl_exit)); if (error) goto done; } pause("cyapab1", (hz * 2) / 10); --retries; error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (error) goto done; } if (retries == 0) { device_printf(dev, "Unable to bring device out of bootstrap\n"); error = ENXIO; goto done; } /* Check identity */ if (cap) { error = cyapa_read_bytes(dev, CMD_QUERY_CAPABILITIES, (void *)cap, sizeof(*cap)); if (strncmp(cap->prod_ida, "CYTRA", 5) != 0) { device_printf(dev, "Product ID \"%5.5s\" mismatch\n", cap->prod_ida); error = ENXIO; } } error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (probe == 0) /* official init */ device_printf(dev, "cyapa init status %02x\n", boot.stat); else if (probe == 2) device_printf(dev, "cyapa reset status %02x\n", boot.stat); done: if (error) device_printf(dev, "Unable to initialize\n"); return (error); } /* * Start the polling thread */ static void cyapa_start(void *xdev) { struct cyapa_softc *sc; device_t dev = xdev; sc = device_get_softc(dev); config_intrhook_disestablish(&sc->intr_hook); /* Setup input event tracking */ cyapa_set_power_mode(sc, CMD_POWER_MODE_IDLE); /* Start the polling thread */ kthread_add(cyapa_poll_thread, sc, NULL, NULL, 0, 0, "cyapa-poll"); } static int cyapa_probe(device_t); static int cyapa_attach(device_t); static int cyapa_detach(device_t); static void cyapa_cdevpriv_dtor(void*); static device_method_t cyapa_methods[] = { /* device interface */ DEVMETHOD(device_probe, cyapa_probe), DEVMETHOD(device_attach, cyapa_attach), DEVMETHOD(device_detach, cyapa_detach), DEVMETHOD_END }; static driver_t cyapa_driver = { "cyapa", cyapa_methods, sizeof(struct cyapa_softc), }; static d_open_t cyapaopen; static d_ioctl_t cyapaioctl; static d_read_t cyaparead; static d_write_t cyapawrite; static d_kqfilter_t cyapakqfilter; static d_poll_t cyapapoll; static struct cdevsw cyapa_cdevsw = { .d_version = D_VERSION, .d_open = cyapaopen, .d_ioctl = cyapaioctl, .d_read = cyaparead, .d_write = cyapawrite, .d_kqfilter = cyapakqfilter, .d_poll = cyapapoll, }; static int cyapa_probe(device_t dev) { struct cyapa_cap cap; int addr; int error; addr = iicbus_get_addr(dev); /* * 0x67 - cypress trackpad on the acer c720 * (other devices might use other ids). 
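One detail worth calling out when comparing the comment with the check that follows: FreeBSD's iicbus convention is to carry the left-shifted (8-bit) slave address, so the 7-bit address 0x67 mentioned above shows up in the code as 0xce. A one-line standalone illustration (illustrative only):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint8_t addr7 = 0x67;   /* 7-bit I2C address from the comment */

        /* 8-bit form as used on the iicbus side. */
        assert((uint8_t)(addr7 << 1) == 0xce);
        return (0);
}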
*/ if (addr != 0xce) return (ENXIO); error = init_device(dev, &cap, 1); if (error != 0) return (ENXIO); device_set_desc(dev, "Cypress APA I2C Trackpad"); return (BUS_PROBE_VENDOR); } static int cyapa_attach(device_t dev) { struct cyapa_softc *sc; struct cyapa_cap cap; int unit; int addr; sc = device_get_softc(dev); sc->reporting_mode = 1; unit = device_get_unit(dev); addr = iicbus_get_addr(dev); if (init_device(dev, &cap, 0)) return (ENXIO); mtx_init(&sc->mutex, "cyapa", NULL, MTX_DEF); sc->dev = dev; knlist_init_mtx(&sc->selinfo.si_note, &sc->mutex); sc->cap_resx = ((cap.max_abs_xy_high << 4) & 0x0F00) | cap.max_abs_x_low; sc->cap_resy = ((cap.max_abs_xy_high << 8) & 0x0F00) | cap.max_abs_y_low; sc->cap_phyx = ((cap.phy_siz_xy_high << 4) & 0x0F00) | cap.phy_siz_x_low; sc->cap_phyy = ((cap.phy_siz_xy_high << 8) & 0x0F00) | cap.phy_siz_y_low; sc->cap_buttons = cap.buttons >> 3 & (CYAPA_FNGR_LEFT | CYAPA_FNGR_RIGHT | CYAPA_FNGR_MIDDLE); device_printf(dev, "%5.5s-%6.6s-%2.2s buttons=%c%c%c res=%dx%d\n", cap.prod_ida, cap.prod_idb, cap.prod_idc, ((sc->cap_buttons & CYAPA_FNGR_LEFT) ? 'L' : '-'), ((sc->cap_buttons & CYAPA_FNGR_MIDDLE) ? 'M' : '-'), ((sc->cap_buttons & CYAPA_FNGR_RIGHT) ? 'R' : '-'), sc->cap_resx, sc->cap_resy); sc->hw.buttons = 5; sc->hw.iftype = MOUSE_IF_PS2; sc->hw.type = MOUSE_MOUSE; sc->hw.model = MOUSE_MODEL_INTELLI; sc->hw.hwid = addr; sc->mode.protocol = MOUSE_PROTO_PS2; sc->mode.rate = 100; sc->mode.resolution = 4; sc->mode.accelfactor = 1; sc->mode.level = 0; sc->mode.packetsize = MOUSE_PS2_PACKETSIZE; sc->intr_hook.ich_func = cyapa_start; sc->intr_hook.ich_arg = sc->dev; #ifdef EVDEV_SUPPORT sc->evdev = evdev_alloc(); evdev_set_name(sc->evdev, device_get_desc(sc->dev)); evdev_set_phys(sc->evdev, device_get_nameunit(sc->dev)); evdev_set_id(sc->evdev, BUS_I2C, 0, 0, 1); evdev_set_flag(sc->evdev, EVDEV_FLAG_MT_STCOMPAT); evdev_set_flag(sc->evdev, EVDEV_FLAG_MT_AUTOREL); evdev_support_event(sc->evdev, EV_SYN); evdev_support_event(sc->evdev, EV_ABS); evdev_support_event(sc->evdev, EV_KEY); evdev_support_prop(sc->evdev, INPUT_PROP_POINTER); if (sc->cap_buttons & CYAPA_FNGR_LEFT) evdev_support_key(sc->evdev, BTN_LEFT); if (sc->cap_buttons & CYAPA_FNGR_RIGHT) evdev_support_key(sc->evdev, BTN_RIGHT); if (sc->cap_buttons & CYAPA_FNGR_MIDDLE) evdev_support_key(sc->evdev, BTN_MIDDLE); if (sc->cap_buttons == CYAPA_FNGR_LEFT) evdev_support_prop(sc->evdev, INPUT_PROP_BUTTONPAD); evdev_support_abs(sc->evdev, ABS_MT_SLOT, 0, CYAPA_MAX_MT - 1, 0, 0, 0); evdev_support_abs(sc->evdev, ABS_MT_TRACKING_ID, -1, 15, 0, 0, 0); evdev_support_abs(sc->evdev, ABS_MT_POSITION_X, 0, sc->cap_resx, 0, 0, sc->cap_phyx != 0 ? sc->cap_resx / sc->cap_phyx : 0); evdev_support_abs(sc->evdev, ABS_MT_POSITION_Y, 0, sc->cap_resy, 0, 0, sc->cap_phyy != 0 ? 
sc->cap_resy / sc->cap_phyy : 0); evdev_support_abs(sc->evdev, ABS_MT_PRESSURE, 0, 255, 0, 0, 0); if (evdev_register(sc->evdev) != 0) { mtx_destroy(&sc->mutex); return (ENOMEM); } #endif /* Postpone start of the polling thread until sleep is available */ if (config_intrhook_establish(&sc->intr_hook) != 0) { #ifdef EVDEV_SUPPORT evdev_free(sc->evdev); #endif mtx_destroy(&sc->mutex); return (ENOMEM); } sc->devnode = make_dev(&cyapa_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, "cyapa%d", unit); sc->devnode->si_drv1 = sc; return (0); } static int cyapa_detach(device_t dev) { struct cyapa_softc *sc; sc = device_get_softc(dev); /* Cleanup poller thread */ cyapa_lock(sc); while (sc->poll_thread_running) { sc->detaching = 1; mtx_sleep(&sc->detaching, &sc->mutex, PCATCH, "cyapadet", hz); } cyapa_unlock(sc); destroy_dev(sc->devnode); knlist_clear(&sc->selinfo.si_note, 0); seldrain(&sc->selinfo); knlist_destroy(&sc->selinfo.si_note); #ifdef EVDEV_SUPPORT evdev_free(sc->evdev); #endif mtx_destroy(&sc->mutex); return (0); } /* * USER DEVICE I/O FUNCTIONS */ static int cyapaopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct cyapa_cdevpriv *priv; int error; priv = malloc(sizeof(*priv), M_CYAPA, M_WAITOK | M_ZERO); priv->sc = dev->si_drv1; error = devfs_set_cdevpriv(priv, cyapa_cdevpriv_dtor); if (error == 0) { cyapa_lock(priv->sc); priv->sc->count++; cyapa_unlock(priv->sc); } else free(priv, M_CYAPA); return (error); } static void cyapa_cdevpriv_dtor(void *data) { struct cyapa_cdevpriv *priv; priv = data; KASSERT(priv != NULL, ("cyapa cdevpriv should not be NULL!")); cyapa_lock(priv->sc); priv->sc->count--; cyapa_unlock(priv->sc); free(priv, M_CYAPA); } static int cyaparead(struct cdev *dev, struct uio *uio, int ioflag) { struct cyapa_softc *sc; int error; int didread; size_t n; char* ptr; sc = dev->si_drv1; /* If buffer is empty, load a new event if it is ready */ cyapa_lock(sc); again: if (fifo_empty(sc, &sc->rfifo) && (sc->data_signal || sc->delta_x || sc->delta_y || sc->track_but != sc->reported_but)) { uint8_t c0; uint16_t but; int delta_x; int delta_y; int delta_z; /* Accumulate delta_x, delta_y */ sc->data_signal = 0; delta_x = sc->delta_x; delta_y = sc->delta_y; delta_z = sc->delta_z; if (delta_x > 255) { delta_x = 255; sc->data_signal = 1; } if (delta_x < -256) { delta_x = -256; sc->data_signal = 1; } if (delta_y > 255) { delta_y = 255; sc->data_signal = 1; } if (delta_y < -256) { delta_y = -256; sc->data_signal = 1; } if (delta_z > 255) { delta_z = 255; sc->data_signal = 1; } if (delta_z < -256) { delta_z = -256; sc->data_signal = 1; } but = sc->track_but; /* Adjust baseline for next calculation */ sc->delta_x -= delta_x; sc->delta_y -= delta_y; sc->delta_z -= delta_z; sc->reported_but = but; /* * Fuzz reduces movement jitter by introducing some * hysteresis. It operates without cumulative error so * if you swish around quickly and return your finger to * where it started, so to will the mouse. 
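The effect of this hysteresis is easiest to see with concrete numbers. The standalone sketch below (illustrative only) uses a local copy of the helper that appears further down as cyapa_fuzz(): single-count jitter is swallowed completely, while a larger move gives up at most one count.

#include <assert.h>

/* Local copy of the hysteresis helper (cyapa_fuzz()), for illustration. */
static int
fuzz(int delta, int *fuzzp)
{
        int f = *fuzzp;

        if (f >= 0 && delta < 0) {
                ++delta;
                --f;
        } else if (f <= 0 && delta > 0) {
                --delta;
                ++f;
        }
        *fuzzp = f;
        return (delta);
}

int
main(void)
{
        int f = 0, d;

        /* +1/-1 jitter is absorbed entirely... */
        d = fuzz(+1, &f);
        assert(d == 0 && f == 1);
        d = fuzz(-1, &f);
        assert(d == 0 && f == 0);
        /* ...while a real move only loses a single count. */
        d = fuzz(+5, &f);
        assert(d == 4 && f == 1);
        return (0);
}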
*/ delta_x = cyapa_fuzz(delta_x, &sc->fuzz_x); delta_y = cyapa_fuzz(delta_y, &sc->fuzz_y); delta_z = cyapa_fuzz(delta_z, &sc->fuzz_z); /* * Generate report */ c0 = 0; if (delta_x < 0) c0 |= 0x10; if (delta_y < 0) c0 |= 0x20; c0 |= 0x08; if (but & CYAPA_FNGR_LEFT) c0 |= 0x01; if (but & CYAPA_FNGR_MIDDLE) c0 |= 0x04; if (but & CYAPA_FNGR_RIGHT) c0 |= 0x02; fifo_write_char(sc, &sc->rfifo, c0); fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_x); fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_y); switch(sc->zenabled) { case 1: /* Z axis all 8 bits */ fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_z); break; case 2: /* * Z axis low 4 bits + 4th button and 5th button * (high 2 bits must be left 0). Auto-scale * delta_z to fit to avoid a wrong-direction * overflow (don't try to retain the remainder). */ while (delta_z > 7 || delta_z < -8) delta_z >>= 1; c0 = (uint8_t)delta_z & 0x0F; fifo_write_char(sc, &sc->rfifo, c0); break; default: /* basic PS/2 */ break; } cyapa_notify(sc); } /* Blocking / Non-blocking */ error = 0; didread = (uio->uio_resid == 0); while ((ioflag & IO_NDELAY) == 0 && fifo_empty(sc, &sc->rfifo)) { if (sc->data_signal) goto again; sc->blocked = 1; error = mtx_sleep(&sc->blocked, &sc->mutex, PCATCH, "cyablk", 0); if (error) break; } /* Return any buffered data */ while (error == 0 && uio->uio_resid && (n = fifo_ready(sc, &sc->rfifo)) > 0) { if (n > uio->uio_resid) n = uio->uio_resid; ptr = fifo_read(sc, &sc->rfifo, 0); cyapa_unlock(sc); error = uiomove(ptr, n, uio); cyapa_lock(sc); if (error) break; fifo_read(sc, &sc->rfifo, n); didread = 1; } cyapa_unlock(sc); if (error == 0 && didread == 0) { error = EWOULDBLOCK; } return (didread ? 0 : error); } static int cyapawrite(struct cdev *dev, struct uio *uio, int ioflag) { struct cyapa_softc *sc; int error; int cmd_completed; size_t n; uint8_t c0; char* ptr; sc = dev->si_drv1; again: /* * Copy data from userland. This will also cross-over the end * of the fifo and keep filling. 
*/ cyapa_lock(sc); while ((n = fifo_space(sc, &sc->wfifo)) > 0 && uio->uio_resid) { if (n > uio->uio_resid) n = uio->uio_resid; ptr = fifo_write(sc, &sc->wfifo, 0); cyapa_unlock(sc); error = uiomove(ptr, n, uio); cyapa_lock(sc); if (error) break; fifo_write(sc, &sc->wfifo, n); } /* Handle commands */ cmd_completed = (fifo_ready(sc, &sc->wfifo) != 0); while (fifo_ready(sc, &sc->wfifo) && cmd_completed && error == 0) { if (sc->ps2_cmd == 0) sc->ps2_cmd = fifo_read_char(sc, &sc->wfifo); switch(sc->ps2_cmd) { case 0xE6: /* SET SCALING 1:1 */ sc->scaling_mode = 0; fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE7: /* SET SCALING 2:1 */ sc->scaling_mode = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE8: /* SET RESOLUTION +1 byte */ if (sc->ps2_acked == 0) { sc->ps2_acked = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); } if (fifo_ready(sc, &sc->wfifo) == 0) { cmd_completed = 0; break; } sc->mode.resolution = fifo_read_char(sc, &sc->wfifo); fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE9: /* * STATUS REQUEST * * byte1: * bit 7 0 * bit 6 Mode (1=remote mode, 0=stream mode) * bit 5 Enable (data reporting enabled) * bit 4 Scaling (0=1:1 1=2:1) * bit 3 0 * bit 2 LEFT BUTTON (1 if pressed) * bit 1 MIDDLE BUTTON (1 if pressed) * bit 0 RIGHT BUTTON (1 if pressed) * * byte2: resolution counts/mm * byte3: sample rate */ c0 = 0; if (sc->remote_mode) c0 |= 0x40; if (sc->reporting_mode) c0 |= 0x20; if (sc->scaling_mode) c0 |= 0x10; if (sc->track_but & CYAPA_FNGR_LEFT) c0 |= 0x04; if (sc->track_but & CYAPA_FNGR_MIDDLE) c0 |= 0x02; if (sc->track_but & CYAPA_FNGR_RIGHT) c0 |= 0x01; fifo_write_char(sc, &sc->rfifo, 0xFA); fifo_write_char(sc, &sc->rfifo, c0); fifo_write_char(sc, &sc->rfifo, 0x00); fifo_write_char(sc, &sc->rfifo, 100); break; case 0xEA: /* Set stream mode and reset movement counters */ sc->remote_mode = 0; fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xEB: /* * Read Data (if in remote mode). If not in remote * mode force an event. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->data_signal = 1; break; case 0xEC: /* Reset Wrap Mode (ignored) */ fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xEE: /* Set Wrap Mode (ignored) */ fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xF0: /* Set Remote Mode */ sc->remote_mode = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xF2: /* * Get Device ID * * If we send 0x00 - normal PS/2 mouse, no Z-axis * * If we send 0x03 - Intellimouse, data packet has * an additional Z movement byte (8 bits signed). * (also reset movement counters) * * If we send 0x04 - Now includes z-axis and the * 4th and 5th mouse buttons. */ fifo_write_char(sc, &sc->rfifo, 0xFA); switch(sc->zenabled) { case 1: fifo_write_char(sc, &sc->rfifo, 0x03); break; case 2: fifo_write_char(sc, &sc->rfifo, 0x04); break; default: fifo_write_char(sc, &sc->rfifo, 0x00); break; } sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xF3: /* * Set Sample Rate * * byte1: the sample rate */ if (sc->ps2_acked == 0) { sc->ps2_acked = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); } if (fifo_ready(sc, &sc->wfifo) == 0) { cmd_completed = 0; break; } sc->mode.rate = fifo_read_char(sc, &sc->wfifo); fifo_write_char(sc, &sc->rfifo, 0xFA); /* * zenabling sequence: 200,100,80 (device id 0x03) * 200,200,80 (device id 0x04) * * We support id 0x03 (no 4th or 5th button). * We support id 0x04 (w/ 4th and 5th button). 
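The rate-"knock" sequences listed above are easier to follow as a small state machine. The sketch below (standalone, illustrative only; it models the zenabled transitions in this case) shows how 200,100,80 selects the wheel-only protocol and 200,200,80 the wheel-plus-buttons one:

#include <assert.h>

/* Model of the sample-rate knock: returns 1 for id 0x03, 2 for id 0x04. */
static int
knock(const int *rates, int n)
{
        int z = 0, i;

        for (i = 0; i < n; i++) {
                if (z == 0 && rates[i] == 200)
                        z = -1;
                else if (z == -1 && rates[i] == 100)
                        z = -2;
                else if (z == -1 && rates[i] == 200)
                        z = -3;
                else if (z == -2 && rates[i] == 80)
                        z = 1;          /* z-axis mode */
                else if (z == -3 && rates[i] == 80)
                        z = 2;          /* z-axis + buttons 4/5 */
        }
        return (z);
}

int
main(void)
{
        const int wheel[] = { 200, 100, 80 };
        const int fivebtn[] = { 200, 200, 80 };

        assert(knock(wheel, 3) == 1);
        assert(knock(fivebtn, 3) == 2);
        return (0);
}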
*/ if (sc->zenabled == 0 && sc->mode.rate == 200) sc->zenabled = -1; else if (sc->zenabled == -1 && sc->mode.rate == 100) sc->zenabled = -2; else if (sc->zenabled == -1 && sc->mode.rate == 200) sc->zenabled = -3; else if (sc->zenabled == -2 && sc->mode.rate == 80) sc->zenabled = 1; /* z-axis mode */ else if (sc->zenabled == -3 && sc->mode.rate == 80) sc->zenabled = 2; /* z-axis+but4/5 */ if (sc->mode.level) sc->zenabled = 1; break; case 0xF4: /* Enable data reporting. Only effects stream mode. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->reporting_mode = 1; break; case 0xF5: /* * Disable data reporting. Only effects stream mode * and is ignored right now. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->reporting_mode = 1; break; case 0xF6: /* * SET DEFAULTS * * (reset sampling rate, resolution, scaling and * enter stream mode) */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->mode.rate = 100; sc->mode.resolution = 4; sc->scaling_mode = 0; sc->reporting_mode = 1; sc->remote_mode = 0; sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; /* signal */ break; case 0xFE: /* * RESEND * * Force a resend by guaranteeing that reported_but * differs from track_but. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->data_signal = 1; break; case 0xFF: /* * RESET */ fifo_reset(sc, &sc->rfifo); /* should we do this? */ fifo_reset(sc, &sc->wfifo); /* should we do this? */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; sc->zenabled = 0; sc->mode.level = 0; break; default: printf("unknown command %02x\n", sc->ps2_cmd); break; } if (cmd_completed) { sc->ps2_cmd = 0; sc->ps2_acked = 0; } cyapa_notify(sc); } cyapa_unlock(sc); if (error == 0 && (cmd_completed || uio->uio_resid)) goto again; return (error); } static void cyapafiltdetach(struct knote *); static int cyapafilt(struct knote *, long); -static struct filterops cyapa_filtops = { +static const struct filterops cyapa_filtops = { .f_isfd = 1, .f_detach = cyapafiltdetach, .f_event = cyapafilt }; static int cyapakqfilter(struct cdev *dev, struct knote *kn) { struct cyapa_softc *sc; struct knlist *knlist; sc = dev->si_drv1; switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &cyapa_filtops; kn->kn_hook = (void *)sc; break; default: return (EOPNOTSUPP); } knlist = &sc->selinfo.si_note; knlist_add(knlist, kn, 0); return (0); } static int cyapapoll(struct cdev *dev, int events, struct thread *td) { struct cyapa_softc *sc; int revents; sc = dev->si_drv1; revents = 0; cyapa_lock(sc); if (events & (POLLIN | POLLRDNORM)) { if (sc->data_signal || !fifo_empty(sc, &sc->rfifo)) revents = events & (POLLIN | POLLRDNORM); else { sc->isselect = 1; selrecord(td, &sc->selinfo); } } cyapa_unlock(sc); return (revents); } static void cyapafiltdetach(struct knote *kn) { struct cyapa_softc *sc; struct knlist *knlist; sc = (struct cyapa_softc *)kn->kn_hook; knlist = &sc->selinfo.si_note; knlist_remove(knlist, kn, 0); } static int cyapafilt(struct knote *kn, long hint) { struct cyapa_softc *sc; int ready; sc = (struct cyapa_softc *)kn->kn_hook; cyapa_lock(sc); ready = fifo_ready(sc, &sc->rfifo) || sc->data_signal; cyapa_unlock(sc); return (ready); } static int cyapaioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct cyapa_softc *sc; int error; sc = dev->si_drv1; error = 0; cyapa_lock(sc); switch (cmd) { case MOUSE_GETHWINFO: *(mousehw_t *)data = sc->hw; if (sc->mode.level == 0) ((mousehw_t *)data)->model = MOUSE_MODEL_GENERIC; break; case MOUSE_GETMODE: *(mousemode_t *)data = sc->mode; ((mousemode_t 
*)data)->resolution = MOUSE_RES_LOW - sc->mode.resolution; switch (sc->mode.level) { case 0: ((mousemode_t *)data)->protocol = MOUSE_PROTO_PS2; ((mousemode_t *)data)->packetsize = MOUSE_PS2_PACKETSIZE; break; case 2: ((mousemode_t *)data)->protocol = MOUSE_PROTO_PS2; ((mousemode_t *)data)->packetsize = MOUSE_PS2_PACKETSIZE + 1; break; } break; case MOUSE_GETLEVEL: *(int *)data = sc->mode.level; break; case MOUSE_SETLEVEL: if ((*(int *)data < 0) && (*(int *)data > 2)) { error = EINVAL; break; } sc->mode.level = *(int *)data ? 2 : 0; sc->zenabled = sc->mode.level ? 1 : 0; break; default: error = ENOTTY; break; } cyapa_unlock(sc); return (error); } /* * MAJOR SUPPORT FUNCTIONS */ static void cyapa_poll_thread(void *arg) { struct cyapa_softc *sc; struct cyapa_regs regs; device_t bus; /* iicbus */ int error; int freq; int isidle; int pstate; int npstate; int last_reset; sc = arg; freq = cyapa_norm_freq; isidle = 0; pstate = CMD_POWER_MODE_IDLE; last_reset = ticks; bus = device_get_parent(sc->dev); cyapa_lock(sc); sc->poll_thread_running = 1; while (!sc->detaching) { cyapa_unlock(sc); error = iicbus_request_bus(bus, sc->dev, IIC_WAIT); if (error == 0) { error = cyapa_read_bytes(sc->dev, CMD_DEV_STATUS, (void *)®s, sizeof(regs)); if (error == 0) { isidle = cyapa_raw_input(sc, ®s, freq); } /* * For some reason the device can crap-out. If it * drops back into bootstrap mode try to reinitialize * it. */ if (cyapa_reset || ((regs.stat & CYAPA_STAT_RUNNING) == 0 && (unsigned)(ticks - last_reset) > TIME_TO_RESET)) { cyapa_reset = 0; last_reset = ticks; init_device(sc->dev, NULL, 2); } iicbus_release_bus(bus, sc->dev); } pause("cyapw", hz / freq); ++sc->poll_ticks; if (sc->count == 0) { freq = cyapa_idle_freq; npstate = CMD_POWER_MODE_IDLE; } else if (isidle) { freq = cyapa_slow_freq; npstate = CMD_POWER_MODE_IDLE; } else { freq = cyapa_norm_freq; npstate = CMD_POWER_MODE_FULL; } if (pstate != npstate) { pstate = npstate; cyapa_set_power_mode(sc, pstate); if (cyapa_debug) { switch(pstate) { case CMD_POWER_MODE_OFF: printf("cyapa: power off\n"); break; case CMD_POWER_MODE_IDLE: printf("cyapa: power idle\n"); break; case CMD_POWER_MODE_FULL: printf("cyapa: power full\n"); break; } } } cyapa_lock(sc); } sc->poll_thread_running = 0; cyapa_unlock(sc); kthread_exit(); } static int cyapa_raw_input(struct cyapa_softc *sc, struct cyapa_regs *regs, int freq) { int nfingers; int afingers; /* actual fingers after culling */ int i; int j; int isidle; int thumbarea_begin; int seen_thumb; int x; int y; int z; int newfinger; int lessfingers; int click_x; int click_y; uint16_t but; /* high bits used for simulated but4/but5 */ thumbarea_begin = sc->cap_resy - ((sc->cap_resy * cyapa_thumbarea_percent) / 100); click_x = click_y = 0; /* * If the device is not running the rest of the status * means something else, set fingers to 0. */ if ((regs->stat & CYAPA_STAT_RUNNING) == 0) { regs->fngr = 0; } /* Process fingers/movement */ nfingers = CYAPA_FNGR_NUMFINGERS(regs->fngr); afingers = nfingers; if (cyapa_debug) { printf("stat %02x buttons %c%c%c nfngrs=%d ", regs->stat, ((regs->fngr & CYAPA_FNGR_LEFT) ? 'L' : '-'), ((regs->fngr & CYAPA_FNGR_MIDDLE) ? 'M' : '-'), ((regs->fngr & CYAPA_FNGR_RIGHT) ? 
'R' : '-'), nfingers); } #ifdef EVDEV_SUPPORT if (evdev_rcpt_mask & EVDEV_RCPT_HW_MOUSE) { for (i = 0; i < nfingers; ++i) { int slot = evdev_mt_id_to_slot( sc->evdev, regs->touch[i].id); if (slot == -1) { if (cyapa_debug) printf("Slot overflow for i=%d\n", regs->touch[i].id); continue; } evdev_push_abs(sc->evdev, ABS_MT_SLOT, slot); evdev_push_abs(sc->evdev, ABS_MT_TRACKING_ID, regs->touch[i].id); evdev_push_abs(sc->evdev, ABS_MT_POSITION_X, CYAPA_TOUCH_X(regs, i)); evdev_push_abs(sc->evdev, ABS_MT_POSITION_Y, CYAPA_TOUCH_Y(regs, i)); evdev_push_abs(sc->evdev, ABS_MT_PRESSURE, CYAPA_TOUCH_P(regs, i)); } if (sc->cap_buttons & CYAPA_FNGR_LEFT) evdev_push_key(sc->evdev, BTN_LEFT, regs->fngr & CYAPA_FNGR_LEFT); if (sc->cap_buttons & CYAPA_FNGR_RIGHT) evdev_push_key(sc->evdev, BTN_RIGHT, regs->fngr & CYAPA_FNGR_RIGHT); if (sc->cap_buttons & CYAPA_FNGR_MIDDLE) evdev_push_key(sc->evdev, BTN_MIDDLE, regs->fngr & CYAPA_FNGR_MIDDLE); evdev_sync(sc->evdev); } #endif seen_thumb = 0; for (i = 0; i < afingers; ) { if (cyapa_debug) { printf(" [x=%04d y=%04d p=%d i=%d]", CYAPA_TOUCH_X(regs, i), CYAPA_TOUCH_Y(regs, i), CYAPA_TOUCH_P(regs, i), regs->touch[i].id); } if ((CYAPA_TOUCH_Y(regs, i) > thumbarea_begin && seen_thumb) || CYAPA_TOUCH_P(regs, i) < cyapa_minpressure) { --afingers; if (i < afingers) { regs->touch[i] = regs->touch[i+1]; continue; } } else { if (CYAPA_TOUCH_Y(regs, i) > thumbarea_begin) seen_thumb = 1; } ++i; } nfingers = afingers; /* Tracking for local solutions */ cyapa_lock(sc); /* * Track timing for finger-downs. Used to detect false-3-finger * button-down. */ switch(afingers) { case 0: break; case 1: if (sc->track_nfingers == 0) sc->finger1_ticks = sc->poll_ticks; break; case 2: if (sc->track_nfingers <= 0) sc->finger1_ticks = sc->poll_ticks; if (sc->track_nfingers <= 1) sc->finger2_ticks = sc->poll_ticks; break; case 3: default: if (sc->track_nfingers <= 0) sc->finger1_ticks = sc->poll_ticks; if (sc->track_nfingers <= 1) sc->finger2_ticks = sc->poll_ticks; if (sc->track_nfingers <= 2) sc->finger3_ticks = sc->poll_ticks; break; } newfinger = sc->track_nfingers < afingers; lessfingers = sc->track_nfingers > afingers; sc->track_nfingers = afingers; /* * Lookup and track finger indexes in the touch[] array. */ if (afingers == 0) { click_x = sc->track_x; click_y = sc->track_y; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; sc->fuzz_x = 0; sc->fuzz_y = 0; sc->fuzz_z = 0; sc->touch_x = -1; sc->touch_y = -1; sc->touch_z = -1; sc->track_id = -1; sc->track_but = 0; i = 0; j = 0; } else { /* * The id assigned on touch can move around in the array, * find it. If that finger is lifted up, assign some other * finger for mouse tracking and reset track_x and track_y * to avoid a mouse jump. * * If >= 2 fingers are down be sure not to assign i and * j to the same index. */ for (i = 0; i < nfingers; ++i) { if (sc->track_id == regs->touch[i].id) break; } if (i == nfingers) { i = 0; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; while (CYAPA_TOUCH_Y(regs, i) >= thumbarea_begin && i < nfingers) ++i; if (i == nfingers) { i = 0; } sc->track_id = regs->touch[i].id; } else if ((sc->track_but || CYAPA_TOUCH_Y(regs, i) >= thumbarea_begin) && newfinger && afingers == 2) { j = regs->touch[0].id == sc->track_id ? 
1 : 0; if (CYAPA_TOUCH_Y(regs, j) < thumbarea_begin) { i = j; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; sc->track_id = regs->touch[i].id; } } } /* Two finger scrolling - reset after timeout */ if (sc->track_z != -1 && afingers != 2 && (sc->poll_ticks - sc->track_z_ticks) > cyapa_scroll_stick_ticks) { sc->track_z = -1; sc->track_z_ticks = 0; } /* Initiate two finger scrolling */ if (!(regs->fngr & CYAPA_FNGR_LEFT) && ((afingers && sc->track_z != -1) || (afingers == 2 && CYAPA_TOUCH_Y(regs, 0) < thumbarea_begin && CYAPA_TOUCH_Y(regs, 1) < thumbarea_begin))) { if (afingers == 2 && (sc->poll_ticks - sc->finger2_ticks) > cyapa_scroll_wait_ticks) { z = (CYAPA_TOUCH_Y(regs, 0) + CYAPA_TOUCH_Y(regs, 1)) >> 1; sc->delta_z += z / ZSCALE - sc->track_z; if (sc->track_z == -1) { sc->delta_z = 0; } if (sc->touch_z == -1) sc->touch_z = z; /* not used atm */ sc->track_z = z / ZSCALE; sc->track_z_ticks = sc->poll_ticks; } } else if (afingers) { /* Normal pad position reporting */ x = CYAPA_TOUCH_X(regs, i); y = CYAPA_TOUCH_Y(regs, i); click_x = x; click_y = y; if (sc->track_x != -1 && sc->track_y < thumbarea_begin && (afingers > 1 || (sc->poll_ticks - sc->finger1_ticks) >= cyapa_move_min_ticks || freq < cyapa_norm_freq)) { sc->delta_x += x - sc->track_x; sc->delta_y -= y - sc->track_y; if (sc->delta_x > sc->cap_resx) sc->delta_x = sc->cap_resx; if (sc->delta_x < -sc->cap_resx) sc->delta_x = -sc->cap_resx; if (sc->delta_y > sc->cap_resy) sc->delta_y = sc->cap_resy; if (sc->delta_y < -sc->cap_resy) sc->delta_y = -sc->cap_resy; if (abs(sc->delta_y) > sc->cap_resy / 2 || abs(sc->delta_x) > sc->cap_resx / 2) { if (cyapa_debug) printf("Detected jump by %i %i\n", sc->delta_x, sc->delta_y); sc->delta_x = sc->delta_y = 0; } } if (sc->touch_x == -1) { sc->touch_x = x; sc->touch_y = y; } sc->track_x = x; sc->track_y = y; } /* Select finger (L = 2/3x, M = 1/3u, R = 1/3d) */ int is_tapclick = (cyapa_enable_tapclick && lessfingers && afingers == 0 && sc->poll_ticks - sc->finger1_ticks >= cyapa_tapclick_min_ticks && sc->poll_ticks - sc->finger1_ticks < cyapa_tapclick_max_ticks); if (regs->fngr & CYAPA_FNGR_LEFT || is_tapclick) { if (sc->track_but) { but = sc->track_but; } else if (afingers == 1) { if (click_x < sc->cap_resx * 2 / 3) but = CYAPA_FNGR_LEFT; else if (click_y < sc->cap_resy / 2) but = CYAPA_FNGR_MIDDLE; else but = CYAPA_FNGR_RIGHT; } else if (is_tapclick) { if (click_x < sc->cap_resx * 2 / 3 || cyapa_enable_tapclick < 2) but = CYAPA_FNGR_LEFT; else if (click_y < sc->cap_resy / 2 && cyapa_enable_tapclick > 2) but = CYAPA_FNGR_MIDDLE; else but = CYAPA_FNGR_RIGHT; } else { but = CYAPA_FNGR_LEFT; } } else { but = 0; } /* * Detect state change from last reported state and * determine if we have gone idle. 
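A side note on the idle test that follows: the elapsed-time comparisons are written as unsigned differences so they keep giving the right answer when the tick counter wraps around. A minimal standalone illustration of the idea (using uint32_t rather than the kernel's ticks variable):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t then = UINT32_MAX - 5;         /* just before wrap-around */
        uint32_t now = 4;                       /* 10 ticks later */

        /* Modular arithmetic still yields the elapsed count. */
        assert(now - then == 10);
        return (0);
}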
*/ sc->track_but = but; if (sc->delta_x || sc->delta_y || sc->delta_z || sc->track_but != sc->reported_but) { sc->active_tick = ticks; if (sc->remote_mode == 0 && sc->reporting_mode) sc->data_signal = 1; isidle = 0; } else if ((unsigned)(ticks - sc->active_tick) >= TIME_TO_IDLE) { sc->active_tick = ticks - TIME_TO_IDLE; /* prevent overflow */ isidle = 1; } else { isidle = 0; } cyapa_notify(sc); cyapa_unlock(sc); if (cyapa_debug) printf("%i >> %i << %i\n", isidle, sc->track_id, sc->delta_y); return (isidle); } static void cyapa_set_power_mode(struct cyapa_softc *sc, int mode) { uint8_t data; device_t bus; int error; bus = device_get_parent(sc->dev); error = iicbus_request_bus(bus, sc->dev, IIC_WAIT); if (error == 0) { error = cyapa_read_bytes(sc->dev, CMD_POWER_MODE, &data, 1); data = (data & ~0xFC) | mode; if (error == 0) { error = cyapa_write_bytes(sc->dev, CMD_POWER_MODE, &data, 1); } iicbus_release_bus(bus, sc->dev); } } /* * FIFO FUNCTIONS */ /* * Returns non-zero if the fifo is empty */ static int fifo_empty(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { CYAPA_LOCK_ASSERT(sc); return (fifo->rindex == fifo->windex); } /* * Returns the number of characters available for reading from * the fifo without wrapping the fifo buffer. */ static size_t fifo_ready(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { size_t n; CYAPA_LOCK_ASSERT(sc); n = CYAPA_BUFSIZE - (fifo->rindex & CYAPA_BUFMASK); if (n > (size_t)(fifo->windex - fifo->rindex)) n = (size_t)(fifo->windex - fifo->rindex); return (n); } /* * Returns a read pointer into the fifo and then bumps * rindex. The FIFO must have at least 'n' characters in * it. The value (n) can cause the index to wrap but users * of the buffer should never supply a value for (n) that wraps * the buffer. */ static char * fifo_read(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n) { char *ptr; CYAPA_LOCK_ASSERT(sc); if (n > (CYAPA_BUFSIZE - (fifo->rindex & CYAPA_BUFMASK))) { printf("fifo_read: overflow\n"); return (fifo->buf); } ptr = fifo->buf + (fifo->rindex & CYAPA_BUFMASK); fifo->rindex += n; return (ptr); } static uint8_t fifo_read_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { uint8_t c; CYAPA_LOCK_ASSERT(sc); if (fifo->rindex == fifo->windex) { printf("fifo_read_char: overflow\n"); c = 0; } else { c = fifo->buf[fifo->rindex & CYAPA_BUFMASK]; ++fifo->rindex; } return (c); } /* * Write a character to the FIFO. The character will be discarded * if the FIFO is full. */ static void fifo_write_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo, uint8_t c) { CYAPA_LOCK_ASSERT(sc); if (fifo->windex - fifo->rindex < CYAPA_BUFSIZE) { fifo->buf[fifo->windex & CYAPA_BUFMASK] = c; ++fifo->windex; } } /* * Return the amount of space available for writing without wrapping * the fifo. 
*/ static size_t fifo_space(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { size_t n; CYAPA_LOCK_ASSERT(sc); n = CYAPA_BUFSIZE - (fifo->windex & CYAPA_BUFMASK); if (n > (size_t)(CYAPA_BUFSIZE - (fifo->windex - fifo->rindex))) n = (size_t)(CYAPA_BUFSIZE - (fifo->windex - fifo->rindex)); return (n); } static char * fifo_write(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n) { char *ptr; CYAPA_LOCK_ASSERT(sc); ptr = fifo->buf + (fifo->windex & CYAPA_BUFMASK); fifo->windex += n; return (ptr); } static void fifo_reset(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { CYAPA_LOCK_ASSERT(sc); fifo->rindex = 0; fifo->windex = 0; } /* * Fuzz handling */ static int cyapa_fuzz(int delta, int *fuzzp) { int fuzz; fuzz = *fuzzp; if (fuzz >= 0 && delta < 0) { ++delta; --fuzz; } else if (fuzz <= 0 && delta > 0) { --delta; ++fuzz; } *fuzzp = fuzz; return (delta); } DRIVER_MODULE(cyapa, iicbus, cyapa_driver, NULL, NULL); MODULE_DEPEND(cyapa, iicbus, IICBUS_MINVER, IICBUS_PREFVER, IICBUS_MAXVER); #ifdef EVDEV_SUPPORT MODULE_DEPEND(cyapa, evdev, 1, 1, 1); #endif MODULE_VERSION(cyapa, 1); diff --git a/sys/dev/evdev/cdev.c b/sys/dev/evdev/cdev.c index c9a8258a03a9..9fe1299a0937 100644 --- a/sys/dev/evdev/cdev.c +++ b/sys/dev/evdev/cdev.c @@ -1,951 +1,951 @@ /*- * Copyright (c) 2014 Jakub Wojciech Klama * Copyright (c) 2015-2016 Vladimir Kondratyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include struct input_event32 { struct timeval32 time; uint16_t type; uint16_t code; int32_t value; }; #endif #ifdef EVDEV_DEBUG #define debugf(client, fmt, args...) printf("evdev cdev: "fmt"\n", ##args) #else #define debugf(client, fmt, args...) 
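For context on how the per-client queue below is consumed, here is a minimal userland reader (illustrative only; the device path is just an example) that pulls whole struct input_event records, the smallest unit evdev_read() will hand out for a non-zero read:

#include <dev/evdev/input.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct input_event ev;
        int fd;

        fd = open("/dev/input/event0", O_RDONLY);       /* example path */
        if (fd == -1)
                err(1, "open");
        /* Read one whole event record at a time. */
        while (read(fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev))
                printf("type=%#x code=%#x value=%d\n",
                    (unsigned int)ev.type, (unsigned int)ev.code,
                    (int)ev.value);
        close(fd);
        return (0);
}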
#endif #define DEF_RING_REPORTS 8 static d_open_t evdev_open; static d_read_t evdev_read; static d_write_t evdev_write; static d_ioctl_t evdev_ioctl; static d_poll_t evdev_poll; static d_kqfilter_t evdev_kqfilter; static int evdev_kqread(struct knote *kn, long hint); static void evdev_kqdetach(struct knote *kn); static void evdev_dtor(void *); static int evdev_ioctl_eviocgbit(struct evdev_dev *, int, int, caddr_t, struct thread *); static void evdev_client_filter_queue(struct evdev_client *, uint16_t); static struct cdevsw evdev_cdevsw = { .d_version = D_VERSION, .d_open = evdev_open, .d_read = evdev_read, .d_write = evdev_write, .d_ioctl = evdev_ioctl, .d_poll = evdev_poll, .d_kqfilter = evdev_kqfilter, .d_name = "evdev", }; -static struct filterops evdev_cdev_filterops = { +static const struct filterops evdev_cdev_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = evdev_kqdetach, .f_event = evdev_kqread, }; static int evdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; size_t buffer_size; int ret; if (evdev == NULL) return (ENODEV); /* Initialize client structure */ buffer_size = evdev->ev_report_size * DEF_RING_REPORTS; client = malloc(offsetof(struct evdev_client, ec_buffer) + sizeof(struct input_event) * buffer_size, M_EVDEV, M_WAITOK | M_ZERO); /* Initialize ring buffer */ client->ec_buffer_size = buffer_size; client->ec_buffer_head = 0; client->ec_buffer_tail = 0; client->ec_buffer_ready = 0; client->ec_evdev = evdev; mtx_init(&client->ec_buffer_mtx, "evclient", "evdev", MTX_DEF); knlist_init_mtx(&client->ec_selp.si_note, &client->ec_buffer_mtx); ret = EVDEV_LIST_LOCK_SIG(evdev); if (ret != 0) goto out; /* Avoid race with evdev_unregister */ if (dev->si_drv1 == NULL) ret = ENODEV; else ret = evdev_register_client(evdev, client); EVDEV_LIST_UNLOCK(evdev); out: if (ret == 0) ret = devfs_set_cdevpriv(client, evdev_dtor); else client->ec_revoked = true; if (ret != 0) { debugf(client, "cannot register evdev client"); evdev_dtor(client); } return (ret); } static void evdev_dtor(void *data) { struct evdev_client *client = (struct evdev_client *)data; EVDEV_LIST_LOCK(client->ec_evdev); if (!client->ec_revoked) evdev_dispose_client(client->ec_evdev, client); EVDEV_LIST_UNLOCK(client->ec_evdev); if (client->ec_evdev->ev_lock_type != EV_LOCK_MTX) epoch_wait_preempt(INPUT_EPOCH); knlist_clear(&client->ec_selp.si_note, 0); seldrain(&client->ec_selp); knlist_destroy(&client->ec_selp.si_note); funsetown(&client->ec_sigio); mtx_destroy(&client->ec_buffer_mtx); free(client, M_EVDEV); } static int evdev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct evdev_client *client; union { struct input_event t; #ifdef COMPAT_FREEBSD32 struct input_event32 t32; #endif } event; struct input_event *head; size_t evsize; int ret = 0; int remaining; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); debugf(client, "read %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); if (client->ec_revoked) return (ENODEV); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) evsize = sizeof(struct input_event32); else #endif evsize = sizeof(struct input_event); /* Zero-sized reads are allowed for error checking */ if (uio->uio_resid != 0 && uio->uio_resid < evsize) return (EINVAL); remaining = uio->uio_resid / evsize; EVDEV_CLIENT_LOCKQ(client); if (EVDEV_CLIENT_EMPTYQ(client)) { if (ioflag & O_NONBLOCK) ret = EWOULDBLOCK; else { if (remaining != 0) { client->ec_blocked = true; ret = 
mtx_sleep(client, &client->ec_buffer_mtx, PCATCH, "evread", 0); if (ret == 0 && client->ec_revoked) ret = ENODEV; } } } while (ret == 0 && !EVDEV_CLIENT_EMPTYQ(client) && remaining > 0) { head = client->ec_buffer + client->ec_buffer_head; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&event.t32, sizeof(struct input_event32)); TV_CP(*head, event.t32, time); CP(*head, event.t32, type); CP(*head, event.t32, code); CP(*head, event.t32, value); } else #endif bcopy(head, &event.t, evsize); client->ec_buffer_head = (client->ec_buffer_head + 1) % client->ec_buffer_size; remaining--; EVDEV_CLIENT_UNLOCKQ(client); ret = uiomove(&event, evsize, uio); EVDEV_CLIENT_LOCKQ(client); } EVDEV_CLIENT_UNLOCKQ(client); return (ret); } static int evdev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; union { struct input_event t; #ifdef COMPAT_FREEBSD32 struct input_event32 t32; #endif } event; size_t evsize; int ret = 0; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); debugf(client, "write %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); if (client->ec_revoked || evdev == NULL) return (ENODEV); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) evsize = sizeof(struct input_event32); else #endif evsize = sizeof(struct input_event); if (uio->uio_resid % evsize != 0) { debugf(client, "write size not multiple of input_event size"); return (EINVAL); } while (uio->uio_resid > 0 && ret == 0) { ret = uiomove(&event, evsize, uio); if (ret == 0) { #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ret = evdev_inject_event(evdev, event.t32.type, event.t32.code, event.t32.value); else #endif ret = evdev_inject_event(evdev, event.t.type, event.t.code, event.t.value); } } return (ret); } static int evdev_poll(struct cdev *dev, int events, struct thread *td) { struct evdev_client *client; int ret; int revents = 0; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (POLLNVAL); debugf(client, "poll by thread %d", td->td_tid); if (client->ec_revoked) return (POLLHUP); if (events & (POLLIN | POLLRDNORM)) { EVDEV_CLIENT_LOCKQ(client); if (!EVDEV_CLIENT_EMPTYQ(client)) revents = events & (POLLIN | POLLRDNORM); else { client->ec_selected = true; selrecord(td, &client->ec_selp); } EVDEV_CLIENT_UNLOCKQ(client); } return (revents); } static int evdev_kqfilter(struct cdev *dev, struct knote *kn) { struct evdev_client *client; int ret; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); if (client->ec_revoked) return (ENODEV); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &evdev_cdev_filterops; break; default: return(EINVAL); } kn->kn_hook = (caddr_t)client; knlist_add(&client->ec_selp.si_note, kn, 0); return (0); } static int evdev_kqread(struct knote *kn, long hint) { struct evdev_client *client; int ret; client = (struct evdev_client *)kn->kn_hook; EVDEV_CLIENT_LOCKQ_ASSERT(client); if (client->ec_revoked) { kn->kn_flags |= EV_EOF; ret = 1; } else { kn->kn_data = EVDEV_CLIENT_SIZEQ(client) * sizeof(struct input_event); ret = !EVDEV_CLIENT_EMPTYQ(client); } return (ret); } static void evdev_kqdetach(struct knote *kn) { struct evdev_client *client; client = (struct evdev_client *)kn->kn_hook; knlist_remove(&client->ec_selp.si_note, kn, 0); } static int evdev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; struct input_keymap_entry *ke; struct epoch_tracker 
et; int ret, len, limit, type_num; uint32_t code; size_t nvalues; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); if (client->ec_revoked || evdev == NULL) return (ENODEV); /* * Fix evdev state corrupted with discarding of kdb events. * EVIOCGKEY and EVIOCGLED ioctls can suffer from this. */ if (evdev->ev_kdb_active) { EVDEV_LOCK(evdev); if (evdev->ev_kdb_active) { evdev->ev_kdb_active = false; if (evdev->ev_lock_type == EV_LOCK_EXT_EPOCH) epoch_enter_preempt(INPUT_EPOCH, &et); evdev_restore_after_kdb(evdev); if (evdev->ev_lock_type == EV_LOCK_EXT_EPOCH) epoch_exit_preempt(INPUT_EPOCH, &et); } EVDEV_UNLOCK(evdev); } /* file I/O ioctl handling */ switch (cmd) { case FIOSETOWN: return (fsetown(*(int *)data, &client->ec_sigio)); case FIOGETOWN: *(int *)data = fgetown(&client->ec_sigio); return (0); case FIONBIO: return (0); case FIOASYNC: if (*(int *)data) client->ec_async = true; else client->ec_async = false; return (0); case FIONREAD: EVDEV_CLIENT_LOCKQ(client); *(int *)data = EVDEV_CLIENT_SIZEQ(client) * sizeof(struct input_event); EVDEV_CLIENT_UNLOCKQ(client); return (0); } len = IOCPARM_LEN(cmd); debugf(client, "ioctl called: cmd=0x%08lx, data=%p", cmd, data); /* evdev fixed-length ioctls handling */ switch (cmd) { case EVIOCGVERSION: *(int *)data = EV_VERSION; return (0); case EVIOCGID: debugf(client, "EVIOCGID: bus=%d vendor=0x%04x product=0x%04x", evdev->ev_id.bustype, evdev->ev_id.vendor, evdev->ev_id.product); memcpy(data, &evdev->ev_id, sizeof(struct input_id)); return (0); case EVIOCGREP: if (!evdev_event_supported(evdev, EV_REP)) return (ENOTSUP); memcpy(data, evdev->ev_rep, sizeof(evdev->ev_rep)); return (0); case EVIOCSREP: if (!evdev_event_supported(evdev, EV_REP)) return (ENOTSUP); evdev_inject_event(evdev, EV_REP, REP_DELAY, ((int *)data)[0]); evdev_inject_event(evdev, EV_REP, REP_PERIOD, ((int *)data)[1]); return (0); case EVIOCGKEYCODE: /* Fake unsupported ioctl */ return (0); case EVIOCGKEYCODE_V2: if (evdev->ev_methods == NULL || evdev->ev_methods->ev_get_keycode == NULL) return (ENOTSUP); ke = (struct input_keymap_entry *)data; evdev->ev_methods->ev_get_keycode(evdev, ke); return (0); case EVIOCSKEYCODE: /* Fake unsupported ioctl */ return (0); case EVIOCSKEYCODE_V2: if (evdev->ev_methods == NULL || evdev->ev_methods->ev_set_keycode == NULL) return (ENOTSUP); ke = (struct input_keymap_entry *)data; evdev->ev_methods->ev_set_keycode(evdev, ke); return (0); case EVIOCGABS(0) ... EVIOCGABS(ABS_MAX): if (evdev->ev_absinfo == NULL) return (EINVAL); memcpy(data, &evdev->ev_absinfo[cmd - EVIOCGABS(0)], sizeof(struct input_absinfo)); return (0); case EVIOCSABS(0) ... 
EVIOCSABS(ABS_MAX): if (evdev->ev_absinfo == NULL) return (EINVAL); code = cmd - EVIOCSABS(0); /* mt-slot number can not be changed */ if (code == ABS_MT_SLOT) return (EINVAL); EVDEV_LOCK(evdev); evdev_set_absinfo(evdev, code, (struct input_absinfo *)data); EVDEV_UNLOCK(evdev); return (0); case EVIOCSFF: case EVIOCRMFF: case EVIOCGEFFECTS: /* Fake unsupported ioctls */ return (0); case EVIOCGRAB: EVDEV_LOCK(evdev); if (*(int *)data) ret = evdev_grab_client(evdev, client); else ret = evdev_release_client(evdev, client); EVDEV_UNLOCK(evdev); return (ret); case EVIOCREVOKE: if (*(int *)data != 0) return (EINVAL); EVDEV_LIST_LOCK(evdev); if (dev->si_drv1 != NULL && !client->ec_revoked) { evdev_dispose_client(evdev, client); evdev_revoke_client(client); } EVDEV_LIST_UNLOCK(evdev); return (0); case EVIOCSCLOCKID: switch (*(int *)data) { case CLOCK_REALTIME: client->ec_clock_id = EV_CLOCK_REALTIME; return (0); case CLOCK_MONOTONIC: client->ec_clock_id = EV_CLOCK_MONOTONIC; return (0); default: return (EINVAL); } } /* evdev variable-length ioctls handling */ switch (IOCBASECMD(cmd)) { case EVIOCGNAME(0): /* Linux evdev does not terminate truncated strings with 0 */ limit = MIN(strlen(evdev->ev_name) + 1, len); memcpy(data, evdev->ev_name, limit); td->td_retval[0] = limit; return (0); case EVIOCGPHYS(0): if (evdev->ev_shortname[0] == 0) return (ENOENT); limit = MIN(strlen(evdev->ev_shortname) + 1, len); memcpy(data, evdev->ev_shortname, limit); td->td_retval[0] = limit; return (0); case EVIOCGUNIQ(0): if (evdev->ev_serial[0] == 0) return (ENOENT); limit = MIN(strlen(evdev->ev_serial) + 1, len); memcpy(data, evdev->ev_serial, limit); td->td_retval[0] = limit; return (0); case EVIOCGPROP(0): limit = MIN(len, bitstr_size(INPUT_PROP_CNT)); memcpy(data, evdev->ev_prop_flags, limit); td->td_retval[0] = limit; return (0); case EVIOCGMTSLOTS(0): /* EVIOCGMTSLOTS always returns 0 on success */ if (evdev->ev_mt == NULL) return (EINVAL); if (len < sizeof(uint32_t)) return (EINVAL); code = *(uint32_t *)data; if (!ABS_IS_MT(code)) return (EINVAL); nvalues = MIN(len / sizeof(int32_t) - 1, MAXIMAL_MT_SLOT(evdev) + 1); for (int i = 0; i < nvalues; i++) ((int32_t *)data)[i + 1] = evdev_mt_get_value(evdev, i, code); return (0); case EVIOCGKEY(0): limit = MIN(len, bitstr_size(KEY_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_KEY); memcpy(data, evdev->ev_key_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGLED(0): limit = MIN(len, bitstr_size(LED_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_LED); memcpy(data, evdev->ev_led_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGSND(0): limit = MIN(len, bitstr_size(SND_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_SND); memcpy(data, evdev->ev_snd_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGSW(0): limit = MIN(len, bitstr_size(SW_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_SW); memcpy(data, evdev->ev_sw_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGBIT(0, 0) ... 
EVIOCGBIT(EV_MAX, 0): type_num = IOCBASECMD(cmd) - EVIOCGBIT(0, 0); debugf(client, "EVIOCGBIT(%d): data=%p, len=%d", type_num, data, len); return (evdev_ioctl_eviocgbit(evdev, type_num, len, data, td)); } return (EINVAL); } static int evdev_ioctl_eviocgbit(struct evdev_dev *evdev, int type, int len, caddr_t data, struct thread *td) { unsigned long *bitmap; int limit; switch (type) { case 0: bitmap = evdev->ev_type_flags; limit = EV_CNT; break; case EV_KEY: bitmap = evdev->ev_key_flags; limit = KEY_CNT; break; case EV_REL: bitmap = evdev->ev_rel_flags; limit = REL_CNT; break; case EV_ABS: bitmap = evdev->ev_abs_flags; limit = ABS_CNT; break; case EV_MSC: bitmap = evdev->ev_msc_flags; limit = MSC_CNT; break; case EV_LED: bitmap = evdev->ev_led_flags; limit = LED_CNT; break; case EV_SND: bitmap = evdev->ev_snd_flags; limit = SND_CNT; break; case EV_SW: bitmap = evdev->ev_sw_flags; limit = SW_CNT; break; case EV_FF: /* * We don't support EV_FF now, so let's * just fake it returning only zeros. */ bzero(data, len); td->td_retval[0] = len; return (0); default: return (ENOTTY); } /* * Clear ioctl data buffer in case it's bigger than * bitmap size */ bzero(data, len); limit = bitstr_size(limit); len = MIN(limit, len); memcpy(data, bitmap, len); td->td_retval[0] = len; return (0); } void evdev_revoke_client(struct evdev_client *client) { EVDEV_LIST_LOCK_ASSERT(client->ec_evdev); client->ec_revoked = true; } void evdev_notify_event(struct evdev_client *client) { EVDEV_CLIENT_LOCKQ_ASSERT(client); if (client->ec_blocked) { client->ec_blocked = false; wakeup(client); } if (client->ec_selected) { client->ec_selected = false; selwakeup(&client->ec_selp); } KNOTE_LOCKED(&client->ec_selp.si_note, 0); if (client->ec_async && client->ec_sigio != NULL) pgsigio(&client->ec_sigio, SIGIO, 0); } int evdev_cdev_create(struct evdev_dev *evdev) { struct make_dev_args mda; int ret, unit = 0; make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &evdev_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = evdev; /* Try to coexist with cuse-backed input/event devices */ while ((ret = make_dev_s(&mda, &evdev->ev_cdev, "input/event%d", unit)) == EEXIST) unit++; if (ret == 0) evdev->ev_unit = unit; return (ret); } int evdev_cdev_destroy(struct evdev_dev *evdev) { destroy_dev(evdev->ev_cdev); return (0); } static void evdev_client_gettime(struct evdev_client *client, struct timeval *tv) { switch (client->ec_clock_id) { case EV_CLOCK_BOOTTIME: /* * XXX: FreeBSD does not support true POSIX monotonic clock. * So aliase EV_CLOCK_BOOTTIME to EV_CLOCK_MONOTONIC. */ case EV_CLOCK_MONOTONIC: microuptime(tv); break; case EV_CLOCK_REALTIME: default: microtime(tv); break; } } void evdev_client_push(struct evdev_client *client, uint16_t type, uint16_t code, int32_t value) { struct timeval time; size_t count, head, tail, ready; EVDEV_CLIENT_LOCKQ_ASSERT(client); head = client->ec_buffer_head; tail = client->ec_buffer_tail; ready = client->ec_buffer_ready; count = client->ec_buffer_size; /* If queue is full drop its content and place SYN_DROPPED event */ if ((tail + 1) % count == head) { debugf(client, "client %p: buffer overflow", client); head = (tail + count - 1) % count; client->ec_buffer[head] = (struct input_event) { .type = EV_SYN, .code = SYN_DROPPED, .value = 0 }; /* * XXX: Here is a small race window from now till the end of * report. The queue is empty but client has been already * notified of data readyness. Can be fixed in two ways: * 1. 
Implement bulk insert so queue lock would not be dropped * till the SYN_REPORT event. * 2. Insert SYN_REPORT just now and skip remaining events */ client->ec_buffer_head = head; client->ec_buffer_ready = head; } client->ec_buffer[tail].type = type; client->ec_buffer[tail].code = code; client->ec_buffer[tail].value = value; client->ec_buffer_tail = (tail + 1) % count; /* Allow users to read events only after report has been completed */ if (type == EV_SYN && code == SYN_REPORT) { evdev_client_gettime(client, &time); for (; ready != client->ec_buffer_tail; ready = (ready + 1) % count) client->ec_buffer[ready].time = time; client->ec_buffer_ready = client->ec_buffer_tail; } } void evdev_client_dumpqueue(struct evdev_client *client) { struct input_event *event; size_t i, head, tail, ready, size; head = client->ec_buffer_head; tail = client->ec_buffer_tail; ready = client->ec_buffer_ready; size = client->ec_buffer_size; printf("evdev client: %p\n", client); printf("event queue: head=%zu ready=%zu tail=%zu size=%zu\n", head, ready, tail, size); printf("queue contents:\n"); for (i = 0; i < size; i++) { event = &client->ec_buffer[i]; printf("%zu: ", i); if (i < head || i > tail) printf("unused\n"); else printf("type=%d code=%d value=%d ", event->type, event->code, event->value); if (i == head) printf("<- head\n"); else if (i == tail) printf("<- tail\n"); else if (i == ready) printf("<- ready\n"); else printf("\n"); } } static void evdev_client_filter_queue(struct evdev_client *client, uint16_t type) { struct input_event *event; size_t head, tail, count, i; bool last_was_syn = false; EVDEV_CLIENT_LOCKQ(client); i = head = client->ec_buffer_head; tail = client->ec_buffer_tail; count = client->ec_buffer_size; client->ec_buffer_ready = client->ec_buffer_tail; while (i != client->ec_buffer_tail) { event = &client->ec_buffer[i]; i = (i + 1) % count; /* Skip event of given type */ if (event->type == type) continue; /* Remove empty SYN_REPORT events */ if (event->type == EV_SYN && event->code == SYN_REPORT) { if (last_was_syn) continue; else client->ec_buffer_ready = (tail + 1) % count; } /* Rewrite entry */ memcpy(&client->ec_buffer[tail], event, sizeof(struct input_event)); last_was_syn = (event->type == EV_SYN && event->code == SYN_REPORT); tail = (tail + 1) % count; } client->ec_buffer_head = i; client->ec_buffer_tail = tail; EVDEV_CLIENT_UNLOCKQ(client); } diff --git a/sys/dev/evdev/uinput.c b/sys/dev/evdev/uinput.c index 3bf0e91b7360..9ac9fee8a157 100644 --- a/sys/dev/evdev/uinput.c +++ b/sys/dev/evdev/uinput.c @@ -1,714 +1,714 @@ /*- * Copyright (c) 2014 Jakub Wojciech Klama * Copyright (c) 2015-2016 Vladimir Kondratyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UINPUT_DEBUG #define debugf(state, fmt, args...) printf("uinput: " fmt "\n", ##args) #else #define debugf(state, fmt, args...) #endif #define UINPUT_BUFFER_SIZE 16 #define UINPUT_LOCK(state) sx_xlock(&(state)->ucs_lock) #define UINPUT_UNLOCK(state) sx_unlock(&(state)->ucs_lock) #define UINPUT_LOCK_ASSERT(state) sx_assert(&(state)->ucs_lock, SA_LOCKED) #define UINPUT_EMPTYQ(state) \ ((state)->ucs_buffer_head == (state)->ucs_buffer_tail) enum uinput_state { UINPUT_NEW = 0, UINPUT_CONFIGURED, UINPUT_RUNNING }; static evdev_event_t uinput_ev_event; static d_open_t uinput_open; static d_read_t uinput_read; static d_write_t uinput_write; static d_ioctl_t uinput_ioctl; static d_poll_t uinput_poll; static d_kqfilter_t uinput_kqfilter; static void uinput_dtor(void *); static int uinput_kqread(struct knote *kn, long hint); static void uinput_kqdetach(struct knote *kn); static struct cdevsw uinput_cdevsw = { .d_version = D_VERSION, .d_open = uinput_open, .d_read = uinput_read, .d_write = uinput_write, .d_ioctl = uinput_ioctl, .d_poll = uinput_poll, .d_kqfilter = uinput_kqfilter, .d_name = "uinput", }; static struct cdev *uinput_cdev; -static struct evdev_methods uinput_ev_methods = { +static const struct evdev_methods uinput_ev_methods = { .ev_open = NULL, .ev_close = NULL, .ev_event = uinput_ev_event, }; -static struct filterops uinput_filterops = { +static const struct filterops uinput_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = uinput_kqdetach, .f_event = uinput_kqread, }; struct uinput_cdev_state { enum uinput_state ucs_state; struct evdev_dev * ucs_evdev; struct sx ucs_lock; size_t ucs_buffer_head; size_t ucs_buffer_tail; struct selinfo ucs_selp; bool ucs_blocked; bool ucs_selected; struct input_event ucs_buffer[UINPUT_BUFFER_SIZE]; }; static void uinput_enqueue_event(struct uinput_cdev_state *, uint16_t, uint16_t, int32_t); static int uinput_setup_provider(struct uinput_cdev_state *, struct uinput_user_dev *); static int uinput_cdev_create(void); static void uinput_notify(struct uinput_cdev_state *); static void uinput_knllock(void *arg) { struct sx *sx = arg; sx_xlock(sx); } static void uinput_knlunlock(void *arg) { struct sx *sx = arg; sx_unlock(sx); } static void uinput_knl_assert_lock(void *arg, int what) { if (what == LA_LOCKED) sx_assert((struct sx*)arg, SA_XLOCKED); else sx_assert((struct sx*)arg, SA_UNLOCKED); } static void uinput_ev_event(struct evdev_dev *evdev, uint16_t type, uint16_t code, int32_t value) { struct uinput_cdev_state *state = evdev_get_softc(evdev); if (type == EV_LED) evdev_push_event(evdev, type, code, value); UINPUT_LOCK(state); if (state->ucs_state == UINPUT_RUNNING) { uinput_enqueue_event(state, type, code, value); uinput_notify(state); } UINPUT_UNLOCK(state); } static void uinput_enqueue_event(struct uinput_cdev_state *state, uint16_t type, uint16_t code, 
int32_t value) { size_t head, tail; UINPUT_LOCK_ASSERT(state); head = state->ucs_buffer_head; tail = (state->ucs_buffer_tail + 1) % UINPUT_BUFFER_SIZE; microtime(&state->ucs_buffer[tail].time); state->ucs_buffer[tail].type = type; state->ucs_buffer[tail].code = code; state->ucs_buffer[tail].value = value; state->ucs_buffer_tail = tail; /* If queue is full remove oldest event */ if (tail == head) { debugf(state, "state %p: buffer overflow", state); head = (head + 1) % UINPUT_BUFFER_SIZE; state->ucs_buffer_head = head; } } static int uinput_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct uinput_cdev_state *state; state = malloc(sizeof(struct uinput_cdev_state), M_EVDEV, M_WAITOK | M_ZERO); state->ucs_evdev = evdev_alloc(); sx_init(&state->ucs_lock, "uinput"); knlist_init(&state->ucs_selp.si_note, &state->ucs_lock, uinput_knllock, uinput_knlunlock, uinput_knl_assert_lock); devfs_set_cdevpriv(state, uinput_dtor); return (0); } static void uinput_dtor(void *data) { struct uinput_cdev_state *state = (struct uinput_cdev_state *)data; evdev_free(state->ucs_evdev); knlist_clear(&state->ucs_selp.si_note, 0); seldrain(&state->ucs_selp); knlist_destroy(&state->ucs_selp.si_note); sx_destroy(&state->ucs_lock); free(data, M_EVDEV); } static int uinput_read(struct cdev *dev, struct uio *uio, int ioflag) { struct uinput_cdev_state *state; struct input_event *event; int remaining, ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "read %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); /* Zero-sized reads are allowed for error checking */ if (uio->uio_resid != 0 && uio->uio_resid < sizeof(struct input_event)) return (EINVAL); remaining = uio->uio_resid / sizeof(struct input_event); UINPUT_LOCK(state); if (state->ucs_state != UINPUT_RUNNING) ret = EINVAL; if (ret == 0 && UINPUT_EMPTYQ(state)) { if (ioflag & O_NONBLOCK) ret = EWOULDBLOCK; else { if (remaining != 0) { state->ucs_blocked = true; ret = sx_sleep(state, &state->ucs_lock, PCATCH, "uiread", 0); } } } while (ret == 0 && !UINPUT_EMPTYQ(state) && remaining > 0) { event = &state->ucs_buffer[state->ucs_buffer_head]; state->ucs_buffer_head = (state->ucs_buffer_head + 1) % UINPUT_BUFFER_SIZE; remaining--; ret = uiomove(event, sizeof(struct input_event), uio); } UINPUT_UNLOCK(state); return (ret); } static int uinput_write(struct cdev *dev, struct uio *uio, int ioflag) { struct uinput_cdev_state *state; struct uinput_user_dev userdev; struct input_event event; int ret = 0; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "write %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); UINPUT_LOCK(state); if (state->ucs_state != UINPUT_RUNNING) { /* Process written struct uinput_user_dev */ if (uio->uio_resid != sizeof(struct uinput_user_dev)) { debugf(state, "write size not multiple of " "struct uinput_user_dev size"); ret = EINVAL; } else { ret = uiomove(&userdev, sizeof(struct uinput_user_dev), uio); if (ret == 0) uinput_setup_provider(state, &userdev); } } else { /* Process written event */ if (uio->uio_resid % sizeof(struct input_event) != 0) { debugf(state, "write size not multiple of " "struct input_event size"); ret = EINVAL; } while (ret == 0 && uio->uio_resid > 0) { uiomove(&event, sizeof(struct input_event), uio); ret = evdev_push_event(state->ucs_evdev, event.type, event.code, event.value); } } UINPUT_UNLOCK(state); return (ret); } static int uinput_setup_dev(struct uinput_cdev_state *state, struct input_id *id, char *name, 
uint32_t ff_effects_max) { if (name[0] == 0) return (EINVAL); evdev_set_name(state->ucs_evdev, name); evdev_set_id(state->ucs_evdev, id->bustype, id->vendor, id->product, id->version); state->ucs_state = UINPUT_CONFIGURED; return (0); } static int uinput_setup_provider(struct uinput_cdev_state *state, struct uinput_user_dev *udev) { struct input_absinfo absinfo; int i, ret; debugf(state, "setup_provider called, udev=%p", udev); ret = uinput_setup_dev(state, &udev->id, udev->name, udev->ff_effects_max); if (ret) return (ret); bzero(&absinfo, sizeof(struct input_absinfo)); for (i = 0; i < ABS_CNT; i++) { if (!bit_test(state->ucs_evdev->ev_abs_flags, i)) continue; absinfo.minimum = udev->absmin[i]; absinfo.maximum = udev->absmax[i]; absinfo.fuzz = udev->absfuzz[i]; absinfo.flat = udev->absflat[i]; evdev_set_absinfo(state->ucs_evdev, i, &absinfo); } return (0); } static int uinput_poll(struct cdev *dev, int events, struct thread *td) { struct uinput_cdev_state *state; int revents = 0; if (devfs_get_cdevpriv((void **)&state) != 0) return (POLLNVAL); debugf(state, "poll by thread %d", td->td_tid); /* Always allow write */ if (events & (POLLOUT | POLLWRNORM)) revents |= (events & (POLLOUT | POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { UINPUT_LOCK(state); if (!UINPUT_EMPTYQ(state)) revents = events & (POLLIN | POLLRDNORM); else { state->ucs_selected = true; selrecord(td, &state->ucs_selp); } UINPUT_UNLOCK(state); } return (revents); } static int uinput_kqfilter(struct cdev *dev, struct knote *kn) { struct uinput_cdev_state *state; int ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &uinput_filterops; break; default: return(EINVAL); } kn->kn_hook = (caddr_t)state; knlist_add(&state->ucs_selp.si_note, kn, 0); return (0); } static int uinput_kqread(struct knote *kn, long hint) { struct uinput_cdev_state *state; int ret; state = (struct uinput_cdev_state *)kn->kn_hook; UINPUT_LOCK_ASSERT(state); ret = !UINPUT_EMPTYQ(state); return (ret); } static void uinput_kqdetach(struct knote *kn) { struct uinput_cdev_state *state; state = (struct uinput_cdev_state *)kn->kn_hook; knlist_remove(&state->ucs_selp.si_note, kn, 0); } static void uinput_notify(struct uinput_cdev_state *state) { UINPUT_LOCK_ASSERT(state); if (state->ucs_blocked) { state->ucs_blocked = false; wakeup(state); } if (state->ucs_selected) { state->ucs_selected = false; selwakeup(&state->ucs_selp); } KNOTE_LOCKED(&state->ucs_selp.si_note, 0); } static int uinput_ioctl_sub(struct uinput_cdev_state *state, u_long cmd, caddr_t data) { struct uinput_setup *us; struct uinput_abs_setup *uabs; int ret, len, intdata; char buf[NAMELEN]; UINPUT_LOCK_ASSERT(state); len = IOCPARM_LEN(cmd); if ((cmd & IOC_DIRMASK) == IOC_VOID && len == sizeof(int)) intdata = *(int *)data; switch (IOCBASECMD(cmd)) { case UI_GET_SYSNAME(0): if (state->ucs_state != UINPUT_RUNNING) return (ENOENT); if (len == 0) return (EINVAL); snprintf(data, len, "event%d", state->ucs_evdev->ev_unit); return (0); } switch (cmd) { case UI_DEV_CREATE: if (state->ucs_state != UINPUT_CONFIGURED) return (EINVAL); evdev_set_methods(state->ucs_evdev, state, &uinput_ev_methods); evdev_set_flag(state->ucs_evdev, EVDEV_FLAG_SOFTREPEAT); evdev_set_flag(state->ucs_evdev, EVDEV_FLAG_MT_KEEPID); ret = evdev_register(state->ucs_evdev); if (ret == 0) state->ucs_state = UINPUT_RUNNING; return (ret); case UI_DEV_DESTROY: if (state->ucs_state != UINPUT_RUNNING) return (0); evdev_unregister(state->ucs_evdev); 
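/*
 * The sequence in uinput_notify() above (also used by evdev_notify_event()
 * and gpioc_interrupt_handler() in this patch) is the usual way a character
 * device wakes every kind of waiter once data has been queued: blocking
 * readers, poll()/select(), kqueue and SIGIO.  A minimal standalone sketch
 * of that pattern follows; it is illustrative only and not part of this
 * change, and the mydev_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>

struct mydev_softc {
	struct mtx	 md_mtx;	/* protects queue state and the flags */
	struct selinfo	 md_selp;	/* poll()/select() and kqueue waiters */
	struct sigio	*md_sigio;	/* set up via FIOSETOWN/FIOASYNC */
	bool		 md_blocked;	/* a thread sleeps in read() */
	bool		 md_selected;	/* a thread was recorded by selrecord() */
	bool		 md_have_data;	/* the driver's queue is non-empty */
};

/*
 * Called with md_mtx held whenever new data has been queued.  For
 * KNOTE_LOCKED() to be legal, md_selp.si_note must have been tied to the
 * same mutex at attach time with knlist_init_mtx().
 */
static void
mydev_notify(struct mydev_softc *sc)
{
	mtx_assert(&sc->md_mtx, MA_OWNED);

	if (sc->md_blocked) {			/* blocking read() */
		sc->md_blocked = false;
		wakeup(sc);
	}
	if (sc->md_selected) {			/* poll()/select() */
		sc->md_selected = false;
		selwakeup(&sc->md_selp);
	}
	KNOTE_LOCKED(&sc->md_selp.si_note, 0);	/* kqueue EVFILT_READ */
	if (sc->md_sigio != NULL)		/* asynchronous SIGIO */
		pgsigio(&sc->md_sigio, SIGIO, 0);
}

/* The matching d_poll handler records the waiter when nothing is ready. */
static int
mydev_poll(struct cdev *dev, int events, struct thread *td)
{
	struct mydev_softc *sc = dev->si_drv1;
	int revents = 0;

	mtx_lock(&sc->md_mtx);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->md_have_data)
			revents = events & (POLLIN | POLLRDNORM);
		else {
			sc->md_selected = true;
			selrecord(td, &sc->md_selp);
		}
	}
	mtx_unlock(&sc->md_mtx);
	return (revents);
}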
bzero(state->ucs_evdev, sizeof(struct evdev_dev)); state->ucs_state = UINPUT_NEW; return (0); case UI_DEV_SETUP: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); us = (struct uinput_setup *)data; return (uinput_setup_dev(state, &us->id, us->name, us->ff_effects_max)); case UI_ABS_SETUP: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); uabs = (struct uinput_abs_setup *)data; if (uabs->code > ABS_MAX) return (EINVAL); evdev_set_abs_bit(state->ucs_evdev, uabs->code); evdev_set_absinfo(state->ucs_evdev, uabs->code, &uabs->absinfo); return (0); case UI_SET_EVBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > EV_MAX || intdata < 0) return (EINVAL); evdev_support_event(state->ucs_evdev, intdata); return (0); case UI_SET_KEYBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > KEY_MAX || intdata < 0) return (EINVAL); evdev_support_key(state->ucs_evdev, intdata); return (0); case UI_SET_RELBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > REL_MAX || intdata < 0) return (EINVAL); evdev_support_rel(state->ucs_evdev, intdata); return (0); case UI_SET_ABSBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > ABS_MAX || intdata < 0) return (EINVAL); evdev_set_abs_bit(state->ucs_evdev, intdata); return (0); case UI_SET_MSCBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > MSC_MAX || intdata < 0) return (EINVAL); evdev_support_msc(state->ucs_evdev, intdata); return (0); case UI_SET_LEDBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > LED_MAX || intdata < 0) return (EINVAL); evdev_support_led(state->ucs_evdev, intdata); return (0); case UI_SET_SNDBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > SND_MAX || intdata < 0) return (EINVAL); evdev_support_snd(state->ucs_evdev, intdata); return (0); case UI_SET_FFBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > FF_MAX || intdata < 0) return (EINVAL); /* Fake unsupported ioctl */ return (0); case UI_SET_PHYS: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); ret = copyinstr(*(void **)data, buf, sizeof(buf), NULL); /* Linux returns EINVAL when string does not fit the buffer */ if (ret == ENAMETOOLONG) ret = EINVAL; if (ret != 0) return (ret); evdev_set_phys(state->ucs_evdev, buf); return (0); case UI_SET_BSDUNIQ: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); ret = copyinstr(*(void **)data, buf, sizeof(buf), NULL); if (ret != 0) return (ret); evdev_set_serial(state->ucs_evdev, buf); return (0); case UI_SET_SWBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > SW_MAX || intdata < 0) return (EINVAL); evdev_support_sw(state->ucs_evdev, intdata); return (0); case UI_SET_PROPBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > INPUT_PROP_MAX || intdata < 0) return (EINVAL); evdev_support_prop(state->ucs_evdev, intdata); return (0); case UI_BEGIN_FF_UPLOAD: case UI_END_FF_UPLOAD: case UI_BEGIN_FF_ERASE: case UI_END_FF_ERASE: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); /* Fake unsupported ioctl */ return (0); case UI_GET_VERSION: *(unsigned int *)data = UINPUT_VERSION; return (0); } return (EINVAL); } static int uinput_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct uinput_cdev_state *state; int ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "ioctl called: cmd=0x%08lx, data=%p", cmd, data); UINPUT_LOCK(state); ret = uinput_ioctl_sub(state, cmd, data); UINPUT_UNLOCK(state); return (ret); } static int uinput_cdev_create(void) { struct make_dev_args mda; int ret; make_dev_args_init(&mda); 
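/*
 * For reference, the ioctls handled by uinput_ioctl_sub() above are driven
 * from userland roughly as follows.  This is an illustrative sketch only,
 * not part of this change: it assumes the headers installed as
 * <dev/evdev/input.h> and <dev/evdev/uinput.h>, a loaded uinput module,
 * and sufficient privileges to open /dev/uinput.
 */
#include <sys/ioctl.h>
#include <dev/evdev/input.h>
#include <dev/evdev/uinput.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static void
emit(int fd, uint16_t type, uint16_t code, int32_t value)
{
	struct input_event ie;

	memset(&ie, 0, sizeof(ie));	/* the kernel timestamps on SYN_REPORT */
	ie.type = type;
	ie.code = code;
	ie.value = value;
	if (write(fd, &ie, sizeof(ie)) != sizeof(ie))
		err(1, "write");
}

int
main(void)
{
	struct uinput_setup us;
	int fd;

	fd = open("/dev/uinput", O_RDWR);
	if (fd < 0)
		err(1, "open /dev/uinput");

	/* Declare the event types and codes the virtual device can emit. */
	if (ioctl(fd, UI_SET_EVBIT, EV_KEY) != 0 ||
	    ioctl(fd, UI_SET_KEYBIT, KEY_A) != 0)
		err(1, "UI_SET_*BIT");

	/* Name the device and register it (UINPUT_CONFIGURED, then RUNNING). */
	memset(&us, 0, sizeof(us));
	us.id.bustype = BUS_VIRTUAL;
	strlcpy(us.name, "example virtual keyboard", sizeof(us.name));
	if (ioctl(fd, UI_DEV_SETUP, &us) != 0 || ioctl(fd, UI_DEV_CREATE) != 0)
		err(1, "UI_DEV_SETUP/UI_DEV_CREATE");

	/* Press and release 'a'; each report ends with EV_SYN/SYN_REPORT. */
	emit(fd, EV_KEY, KEY_A, 1);
	emit(fd, EV_SYN, SYN_REPORT, 0);
	emit(fd, EV_KEY, KEY_A, 0);
	emit(fd, EV_SYN, SYN_REPORT, 0);

	ioctl(fd, UI_DEV_DESTROY);
	close(fd);
	return (0);
}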
mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &uinput_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; ret = make_dev_s(&mda, &uinput_cdev, "uinput"); return (ret); } static int uinput_cdev_destroy(void) { destroy_dev(uinput_cdev); return (0); } static int uinput_modevent(module_t mod __unused, int cmd, void *data) { int ret = 0; switch (cmd) { case MOD_LOAD: ret = uinput_cdev_create(); break; case MOD_UNLOAD: ret = uinput_cdev_destroy(); break; case MOD_SHUTDOWN: break; default: ret = EINVAL; break; } return (ret); } DEV_MODULE(uinput, uinput_modevent, NULL); MODULE_VERSION(uinput, 1); MODULE_DEPEND(uinput, evdev, 1, 1, 1); diff --git a/sys/dev/gpio/gpioc.c b/sys/dev/gpio/gpioc.c index 6fb79fa8d751..067a43617f11 100644 --- a/sys/dev/gpio/gpioc.c +++ b/sys/dev/gpio/gpioc.c @@ -1,1063 +1,1063 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gpio_if.h" #include "gpiobus_if.h" #undef GPIOC_DEBUG #ifdef GPIOC_DEBUG #define dprintf printf #define ddevice_printf device_printf #else #define dprintf(x, arg...) #define ddevice_printf(dev, x, arg...) 
#endif struct gpioc_softc { device_t sc_dev; /* gpiocX dev */ device_t sc_pdev; /* gpioX dev */ struct cdev *sc_ctl_dev; /* controller device */ int sc_unit; int sc_npins; struct gpioc_pin_intr *sc_pin_intr; }; struct gpioc_pin_intr { struct gpioc_softc *sc; gpio_pin_t pin; bool config_locked; int intr_rid; struct resource *intr_res; void *intr_cookie; struct mtx mtx; SLIST_HEAD(gpioc_privs_list, gpioc_privs) privs; }; struct gpioc_cdevpriv { struct gpioc_softc *sc; struct selinfo selinfo; bool async; uint8_t report_option; struct sigio *sigio; struct mtx mtx; struct gpioc_pin_event *events; int numevents; int evidx_head; int evidx_tail; SLIST_HEAD(gpioc_pins_list, gpioc_pins) pins; }; struct gpioc_privs { struct gpioc_cdevpriv *priv; SLIST_ENTRY(gpioc_privs) next; }; struct gpioc_pins { struct gpioc_pin_intr *pin; int eventcount; int firstevent; SLIST_ENTRY(gpioc_pins) next; }; struct gpioc_pin_event { struct gpioc_pins *privpin; sbintime_t event_time; bool event_pin_state; }; static MALLOC_DEFINE(M_GPIOC, "gpioc", "gpioc device data"); static int gpioc_allocate_pin_intr(struct gpioc_pin_intr*, uint32_t); static int gpioc_release_pin_intr(struct gpioc_pin_intr*); static int gpioc_attach_priv_pin(struct gpioc_cdevpriv*, struct gpioc_pin_intr*); static int gpioc_detach_priv_pin(struct gpioc_cdevpriv*, struct gpioc_pin_intr*); static bool gpioc_intr_reconfig_allowed(struct gpioc_cdevpriv*, struct gpioc_pin_intr *intr_conf); static uint32_t gpioc_get_intr_config(struct gpioc_softc*, struct gpioc_cdevpriv*, uint32_t pin); static int gpioc_set_intr_config(struct gpioc_softc*, struct gpioc_cdevpriv*, uint32_t, uint32_t); static void gpioc_interrupt_handler(void*); static int gpioc_kqread(struct knote*, long); static void gpioc_kqdetach(struct knote*); static int gpioc_probe(device_t dev); static int gpioc_attach(device_t dev); static int gpioc_detach(device_t dev); static void gpioc_cdevpriv_dtor(void*); static d_open_t gpioc_open; static d_read_t gpioc_read; static d_ioctl_t gpioc_ioctl; static d_poll_t gpioc_poll; static d_kqfilter_t gpioc_kqfilter; static struct cdevsw gpioc_cdevsw = { .d_version = D_VERSION, .d_open = gpioc_open, .d_read = gpioc_read, .d_ioctl = gpioc_ioctl, .d_poll = gpioc_poll, .d_kqfilter = gpioc_kqfilter, .d_name = "gpioc", }; -static struct filterops gpioc_read_filterops = { +static const struct filterops gpioc_read_filterops = { .f_isfd = true, .f_attach = NULL, .f_detach = gpioc_kqdetach, .f_event = gpioc_kqread, .f_touch = NULL }; static struct gpioc_pin_event * next_head_event(struct gpioc_cdevpriv *priv) { struct gpioc_pin_event *rv; rv = &priv->events[priv->evidx_head++]; if (priv->evidx_head == priv->numevents) priv->evidx_head = 0; return (rv); } static struct gpioc_pin_event * next_tail_event(struct gpioc_cdevpriv *priv) { struct gpioc_pin_event *rv; rv = &priv->events[priv->evidx_tail++]; if (priv->evidx_tail == priv->numevents) priv->evidx_tail = 0; return (rv); } static size_t number_of_events(struct gpioc_cdevpriv *priv) { if (priv->evidx_head >= priv->evidx_tail) return (priv->evidx_head - priv->evidx_tail); else return (priv->numevents + priv->evidx_head - priv->evidx_tail); } static int gpioc_allocate_pin_intr(struct gpioc_pin_intr *intr_conf, uint32_t flags) { int err; intr_conf->config_locked = true; mtx_unlock(&intr_conf->mtx); intr_conf->intr_res = gpio_alloc_intr_resource(intr_conf->pin->dev, &intr_conf->intr_rid, RF_ACTIVE, intr_conf->pin, flags); if (intr_conf->intr_res == NULL) { err = ENXIO; goto error_exit; } err = 
bus_setup_intr(intr_conf->pin->dev, intr_conf->intr_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, gpioc_interrupt_handler, intr_conf, &intr_conf->intr_cookie); if (err != 0) goto error_exit; intr_conf->pin->flags = flags; error_exit: mtx_lock(&intr_conf->mtx); intr_conf->config_locked = false; wakeup(&intr_conf->config_locked); return (err); } static int gpioc_release_pin_intr(struct gpioc_pin_intr *intr_conf) { int err; intr_conf->config_locked = true; mtx_unlock(&intr_conf->mtx); if (intr_conf->intr_cookie != NULL) { err = bus_teardown_intr(intr_conf->pin->dev, intr_conf->intr_res, intr_conf->intr_cookie); if (err != 0) goto error_exit; else intr_conf->intr_cookie = NULL; } if (intr_conf->intr_res != NULL) { err = bus_release_resource(intr_conf->pin->dev, SYS_RES_IRQ, intr_conf->intr_rid, intr_conf->intr_res); if (err != 0) goto error_exit; else { intr_conf->intr_rid = 0; intr_conf->intr_res = NULL; } } intr_conf->pin->flags = 0; err = 0; error_exit: mtx_lock(&intr_conf->mtx); intr_conf->config_locked = false; wakeup(&intr_conf->config_locked); return (err); } static int gpioc_attach_priv_pin(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link; struct gpioc_pins *pin_link; unsigned int consistency_a __diagused; unsigned int consistency_b __diagused; consistency_a = 0; consistency_b = 0; mtx_assert(&intr_conf->mtx, MA_OWNED); mtx_lock(&priv->mtx); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv == priv) consistency_a++; } KASSERT(consistency_a <= 1, ("inconsistent links between pin config and cdevpriv")); SLIST_FOREACH(pin_link, &priv->pins, next) { if (pin_link->pin == intr_conf) consistency_b++; } KASSERT(consistency_a == consistency_b, ("inconsistent links between pin config and cdevpriv")); if (consistency_a == 1 && consistency_b == 1) { mtx_unlock(&priv->mtx); return (EEXIST); } priv_link = malloc(sizeof(struct gpioc_privs), M_GPIOC, M_NOWAIT | M_ZERO); if (priv_link == NULL) { mtx_unlock(&priv->mtx); return (ENOMEM); } pin_link = malloc(sizeof(struct gpioc_pins), M_GPIOC, M_NOWAIT | M_ZERO); if (pin_link == NULL) { mtx_unlock(&priv->mtx); return (ENOMEM); } priv_link->priv = priv; pin_link->pin = intr_conf; SLIST_INSERT_HEAD(&intr_conf->privs, priv_link, next); SLIST_INSERT_HEAD(&priv->pins, pin_link, next); mtx_unlock(&priv->mtx); return (0); } static int gpioc_detach_priv_pin(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link, *priv_link_temp; struct gpioc_pins *pin_link, *pin_link_temp; unsigned int consistency_a __diagused; unsigned int consistency_b __diagused; consistency_a = 0; consistency_b = 0; mtx_assert(&intr_conf->mtx, MA_OWNED); mtx_lock(&priv->mtx); SLIST_FOREACH_SAFE(priv_link, &intr_conf->privs, next, priv_link_temp) { if (priv_link->priv == priv) { SLIST_REMOVE(&intr_conf->privs, priv_link, gpioc_privs, next); free(priv_link, M_GPIOC); consistency_a++; } } KASSERT(consistency_a <= 1, ("inconsistent links between pin config and cdevpriv")); SLIST_FOREACH_SAFE(pin_link, &priv->pins, next, pin_link_temp) { if (pin_link->pin == intr_conf) { /* * If the pin we're removing has events in the priv's * event fifo, we can't leave dangling pointers from * those events to the gpioc_pins struct we're about to * free. We also can't remove random items and leave * holes in the events fifo, so just empty it out. 
*/ if (pin_link->eventcount > 0) { priv->evidx_head = priv->evidx_tail = 0; } SLIST_REMOVE(&priv->pins, pin_link, gpioc_pins, next); free(pin_link, M_GPIOC); consistency_b++; } } KASSERT(consistency_a == consistency_b, ("inconsistent links between pin config and cdevpriv")); mtx_unlock(&priv->mtx); return (0); } static bool gpioc_intr_reconfig_allowed(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link; mtx_assert(&intr_conf->mtx, MA_OWNED); if (SLIST_EMPTY(&intr_conf->privs)) return (true); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv != priv) return (false); } return (true); } static uint32_t gpioc_get_intr_config(struct gpioc_softc *sc, struct gpioc_cdevpriv *priv, uint32_t pin) { struct gpioc_pin_intr *intr_conf = &sc->sc_pin_intr[pin]; struct gpioc_privs *priv_link; uint32_t flags; flags = intr_conf->pin->flags; if (flags == 0) return (0); mtx_lock(&intr_conf->mtx); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv == priv) { flags |= GPIO_INTR_ATTACHED; break; } } mtx_unlock(&intr_conf->mtx); return (flags); } static int gpioc_set_intr_config(struct gpioc_softc *sc, struct gpioc_cdevpriv *priv, uint32_t pin, uint32_t flags) { struct gpioc_pin_intr *intr_conf = &sc->sc_pin_intr[pin]; int res; res = 0; if (intr_conf->pin->flags == 0 && flags == 0) { /* No interrupt configured and none requested: Do nothing. */ return (0); } mtx_lock(&intr_conf->mtx); while (intr_conf->config_locked == true) mtx_sleep(&intr_conf->config_locked, &intr_conf->mtx, 0, "gpicfg", 0); if (intr_conf->pin->flags == 0 && flags != 0) { /* * No interrupt is configured, but one is requested: Allocate * and setup interrupt on the according pin. */ res = gpioc_allocate_pin_intr(intr_conf, flags); if (res == 0) res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } else if (intr_conf->pin->flags == flags) { /* * Same interrupt requested as already configured: Attach the * cdevpriv to the corresponding pin. */ res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } else if (intr_conf->pin->flags != 0 && flags == 0) { /* * Interrupt configured, but none requested: Teardown and * release the pin when no other cdevpriv is attached. Otherwise * just detach pin and cdevpriv from each other. */ if (gpioc_intr_reconfig_allowed(priv, intr_conf)) { res = gpioc_release_pin_intr(intr_conf); } if (res == 0) res = gpioc_detach_priv_pin(priv, intr_conf); } else { /* * Other flag requested than configured: Reconfigure when no * other cdevpriv is are attached to the pin. */ if (!gpioc_intr_reconfig_allowed(priv, intr_conf)) res = EBUSY; else { res = gpioc_release_pin_intr(intr_conf); if (res == 0) res = gpioc_allocate_pin_intr(intr_conf, flags); if (res == 0) res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } } mtx_unlock(&intr_conf->mtx); return (res); } static void gpioc_interrupt_handler(void *arg) { struct gpioc_pin_intr *intr_conf; struct gpioc_privs *privs; struct gpioc_softc *sc; sbintime_t evtime; uint32_t pin_state; intr_conf = arg; sc = intr_conf->sc; /* Capture time and pin state first. */ evtime = sbinuptime(); if (intr_conf->pin->flags & GPIO_INTR_EDGE_BOTH) GPIO_PIN_GET(sc->sc_pdev, intr_conf->pin->pin, &pin_state); else if (intr_conf->pin->flags & GPIO_INTR_EDGE_RISING) pin_state = true; else pin_state = false; mtx_lock(&intr_conf->mtx); if (intr_conf->config_locked == true) { ddevice_printf(sc->sc_dev, "Interrupt configuration in " "progress. 
Discarding interrupt on pin %d.\n", intr_conf->pin->pin); mtx_unlock(&intr_conf->mtx); return; } if (SLIST_EMPTY(&intr_conf->privs)) { ddevice_printf(sc->sc_dev, "No file descriptor associated with " "occurred interrupt on pin %d.\n", intr_conf->pin->pin); mtx_unlock(&intr_conf->mtx); return; } SLIST_FOREACH(privs, &intr_conf->privs, next) { struct gpioc_cdevpriv *priv = privs->priv; struct gpioc_pins *privpin; struct gpioc_pin_event *event; mtx_lock(&priv->mtx); SLIST_FOREACH(privpin, &priv->pins, next) { if (privpin->pin == intr_conf) break; } if (privpin == NULL) { /* Should be impossible. */ ddevice_printf(sc->sc_dev, "Cannot find privpin\n"); mtx_unlock(&priv->mtx); continue; } if (priv->report_option == GPIO_EVENT_REPORT_DETAIL) { event = next_head_event(priv); /* If head is overtaking tail, advance tail. */ if (priv->evidx_head == priv->evidx_tail) next_tail_event(priv); } else { if (privpin->eventcount > 0) event = &priv->events[privpin->firstevent + 1]; else { privpin->firstevent = priv->evidx_head; event = next_head_event(priv); event->privpin = privpin; event->event_time = evtime; event->event_pin_state = pin_state; event = next_head_event(priv); } ++privpin->eventcount; } event->privpin = privpin; event->event_time = evtime; event->event_pin_state = pin_state; wakeup(priv); selwakeup(&priv->selinfo); KNOTE_LOCKED(&priv->selinfo.si_note, 0); if (priv->async == true && priv->sigio != NULL) pgsigio(&priv->sigio, SIGIO, 0); mtx_unlock(&priv->mtx); } mtx_unlock(&intr_conf->mtx); } static int gpioc_probe(device_t dev) { device_set_desc(dev, "GPIO controller"); return (0); } static int gpioc_attach(device_t dev) { int err; struct gpioc_softc *sc; struct make_dev_args devargs; sc = device_get_softc(dev); sc->sc_dev = dev; sc->sc_pdev = device_get_parent(dev); sc->sc_unit = device_get_unit(dev); err = GPIO_PIN_MAX(sc->sc_pdev, &sc->sc_npins); sc->sc_npins++; /* Number of pins is one more than max pin number. 
*/ if (err != 0) return (err); sc->sc_pin_intr = malloc(sizeof(struct gpioc_pin_intr) * sc->sc_npins, M_GPIOC, M_WAITOK | M_ZERO); for (int i = 0; i < sc->sc_npins; i++) { sc->sc_pin_intr[i].pin = malloc(sizeof(struct gpiobus_pin), M_GPIOC, M_WAITOK | M_ZERO); sc->sc_pin_intr[i].sc = sc; sc->sc_pin_intr[i].pin->pin = i; sc->sc_pin_intr[i].pin->dev = sc->sc_pdev; mtx_init(&sc->sc_pin_intr[i].mtx, "gpioc pin", NULL, MTX_DEF); SLIST_INIT(&sc->sc_pin_intr[i].privs); } make_dev_args_init(&devargs); devargs.mda_devsw = &gpioc_cdevsw; devargs.mda_uid = UID_ROOT; devargs.mda_gid = GID_WHEEL; devargs.mda_mode = 0600; devargs.mda_si_drv1 = sc; err = make_dev_s(&devargs, &sc->sc_ctl_dev, "gpioc%d", sc->sc_unit); if (err != 0) { device_printf(dev, "Failed to create gpioc%d", sc->sc_unit); return (ENXIO); } return (0); } static int gpioc_detach(device_t dev) { struct gpioc_softc *sc = device_get_softc(dev); int err; if (sc->sc_ctl_dev) destroy_dev(sc->sc_ctl_dev); for (int i = 0; i < sc->sc_npins; i++) { mtx_destroy(&sc->sc_pin_intr[i].mtx); free(sc->sc_pin_intr[i].pin, M_GPIOC); } free(sc->sc_pin_intr, M_GPIOC); if ((err = bus_generic_detach(dev)) != 0) return (err); return (0); } static void gpioc_cdevpriv_dtor(void *data) { struct gpioc_cdevpriv *priv; struct gpioc_privs *priv_link, *priv_link_temp; struct gpioc_pins *pin_link, *pin_link_temp; unsigned int consistency __diagused; priv = data; SLIST_FOREACH_SAFE(pin_link, &priv->pins, next, pin_link_temp) { consistency = 0; mtx_lock(&pin_link->pin->mtx); while (pin_link->pin->config_locked == true) mtx_sleep(&pin_link->pin->config_locked, &pin_link->pin->mtx, 0, "gpicfg", 0); SLIST_FOREACH_SAFE(priv_link, &pin_link->pin->privs, next, priv_link_temp) { if (priv_link->priv == priv) { SLIST_REMOVE(&pin_link->pin->privs, priv_link, gpioc_privs, next); free(priv_link, M_GPIOC); consistency++; } } KASSERT(consistency == 1, ("inconsistent links between pin config and cdevpriv")); if (gpioc_intr_reconfig_allowed(priv, pin_link->pin)) { gpioc_release_pin_intr(pin_link->pin); } mtx_unlock(&pin_link->pin->mtx); SLIST_REMOVE(&priv->pins, pin_link, gpioc_pins, next); free(pin_link, M_GPIOC); } wakeup(&priv); knlist_clear(&priv->selinfo.si_note, 0); seldrain(&priv->selinfo); knlist_destroy(&priv->selinfo.si_note); funsetown(&priv->sigio); mtx_destroy(&priv->mtx); free(priv->events, M_GPIOC); free(data, M_GPIOC); } static int gpioc_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct gpioc_cdevpriv *priv; int err; priv = malloc(sizeof(*priv), M_GPIOC, M_WAITOK | M_ZERO); priv->sc = dev->si_drv1; priv->report_option = GPIO_EVENT_REPORT_DETAIL; err = devfs_set_cdevpriv(priv, gpioc_cdevpriv_dtor); if (err != 0) { gpioc_cdevpriv_dtor(priv); return (err); } mtx_init(&priv->mtx, "gpioc priv", NULL, MTX_DEF); knlist_init_mtx(&priv->selinfo.si_note, &priv->mtx); /* * Allocate a circular buffer for events. The scheme we use for summary * reporting assumes there will always be a pair of events available to * record the first/last events on any pin, so we allocate 2 * npins. * Even though we actually default to detailed event reporting, 2 * * npins isn't a horrible fifo size for that either. 
*/ priv->numevents = priv->sc->sc_npins * 2; priv->events = malloc(priv->numevents * sizeof(struct gpio_event_detail), M_GPIOC, M_WAITOK | M_ZERO); return (0); } static int gpioc_read(struct cdev *dev, struct uio *uio, int ioflag) { struct gpioc_cdevpriv *priv; struct gpioc_pin_event *event; union { struct gpio_event_summary sum; struct gpio_event_detail evt; uint8_t data[1]; } recbuf; size_t recsize; int err; if ((err = devfs_get_cdevpriv((void **)&priv)) != 0) return (err); if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) recsize = sizeof(struct gpio_event_summary); else recsize = sizeof(struct gpio_event_detail); if (uio->uio_resid < recsize) return (EINVAL); mtx_lock(&priv->mtx); while (priv->evidx_head == priv->evidx_tail) { if (SLIST_EMPTY(&priv->pins)) { err = ENXIO; break; } else if (ioflag & O_NONBLOCK) { err = EWOULDBLOCK; break; } else { err = mtx_sleep(priv, &priv->mtx, PCATCH, "gpintr", 0); if (err != 0) break; } } while (err == 0 && uio->uio_resid >= recsize && priv->evidx_tail != priv->evidx_head) { event = next_tail_event(priv); if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) { recbuf.sum.gp_first_time = event->event_time; recbuf.sum.gp_pin = event->privpin->pin->pin->pin; recbuf.sum.gp_count = event->privpin->eventcount; recbuf.sum.gp_first_state = event->event_pin_state; event = next_tail_event(priv); recbuf.sum.gp_last_time = event->event_time; recbuf.sum.gp_last_state = event->event_pin_state; event->privpin->eventcount = 0; event->privpin->firstevent = 0; } else { recbuf.evt.gp_time = event->event_time; recbuf.evt.gp_pin = event->privpin->pin->pin->pin; recbuf.evt.gp_pinstate = event->event_pin_state; } mtx_unlock(&priv->mtx); err = uiomove(recbuf.data, recsize, uio); mtx_lock(&priv->mtx); } mtx_unlock(&priv->mtx); return (err); } static int gpioc_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag, struct thread *td) { device_t bus; int max_pin, res; struct gpioc_softc *sc = cdev->si_drv1; struct gpioc_cdevpriv *priv; struct gpio_pin pin; struct gpio_req req; struct gpio_access_32 *a32; struct gpio_config_32 *c32; struct gpio_event_config *evcfg; uint32_t caps, intrflags; bus = GPIO_GET_BUS(sc->sc_pdev); if (bus == NULL) return (EINVAL); switch (cmd) { case GPIOMAXPIN: max_pin = -1; res = GPIO_PIN_MAX(sc->sc_pdev, &max_pin); bcopy(&max_pin, arg, sizeof(max_pin)); break; case GPIOGETCONFIG: bcopy(arg, &pin, sizeof(pin)); dprintf("get config pin %d\n", pin.gp_pin); res = GPIO_PIN_GETFLAGS(sc->sc_pdev, pin.gp_pin, &pin.gp_flags); /* Fail early */ if (res) break; res = devfs_get_cdevpriv((void **)&priv); if (res) break; pin.gp_flags |= gpioc_get_intr_config(sc, priv, pin.gp_pin); GPIO_PIN_GETCAPS(sc->sc_pdev, pin.gp_pin, &pin.gp_caps); GPIOBUS_PIN_GETNAME(bus, pin.gp_pin, pin.gp_name); bcopy(&pin, arg, sizeof(pin)); break; case GPIOSETCONFIG: bcopy(arg, &pin, sizeof(pin)); dprintf("set config pin %d\n", pin.gp_pin); res = devfs_get_cdevpriv((void **)&priv); if (res != 0) break; res = GPIO_PIN_GETCAPS(sc->sc_pdev, pin.gp_pin, &caps); if (res != 0) break; res = gpio_check_flags(caps, pin.gp_flags); if (res != 0) break; intrflags = pin.gp_flags & GPIO_INTR_MASK; /* * We can do only edge interrupts, and only if the * hardware supports that interrupt type on that pin. 
*/ switch (intrflags) { case GPIO_INTR_NONE: break; case GPIO_INTR_EDGE_RISING: case GPIO_INTR_EDGE_FALLING: case GPIO_INTR_EDGE_BOTH: if ((intrflags & caps) == 0) res = EOPNOTSUPP; break; default: res = EINVAL; break; } if (res != 0) break; res = GPIO_PIN_SETFLAGS(sc->sc_pdev, pin.gp_pin, (pin.gp_flags & ~GPIO_INTR_MASK)); if (res != 0) break; res = gpioc_set_intr_config(sc, priv, pin.gp_pin, intrflags); break; case GPIOGET: bcopy(arg, &req, sizeof(req)); res = GPIO_PIN_GET(sc->sc_pdev, req.gp_pin, &req.gp_value); dprintf("read pin %d -> %d\n", req.gp_pin, req.gp_value); bcopy(&req, arg, sizeof(req)); break; case GPIOSET: bcopy(arg, &req, sizeof(req)); res = GPIO_PIN_SET(sc->sc_pdev, req.gp_pin, req.gp_value); dprintf("write pin %d -> %d\n", req.gp_pin, req.gp_value); break; case GPIOTOGGLE: bcopy(arg, &req, sizeof(req)); dprintf("toggle pin %d\n", req.gp_pin); res = GPIO_PIN_TOGGLE(sc->sc_pdev, req.gp_pin); break; case GPIOSETNAME: bcopy(arg, &pin, sizeof(pin)); dprintf("set name on pin %d\n", pin.gp_pin); res = GPIOBUS_PIN_SETNAME(bus, pin.gp_pin, pin.gp_name); break; case GPIOACCESS32: a32 = (struct gpio_access_32 *)arg; res = GPIO_PIN_ACCESS_32(sc->sc_pdev, a32->first_pin, a32->clear_pins, a32->change_pins, &a32->orig_pins); break; case GPIOCONFIG32: c32 = (struct gpio_config_32 *)arg; res = GPIO_PIN_CONFIG_32(sc->sc_pdev, c32->first_pin, c32->num_pins, c32->pin_flags); break; case GPIOCONFIGEVENTS: evcfg = (struct gpio_event_config *)arg; res = devfs_get_cdevpriv((void **)&priv); if (res != 0) break; /* If any pins have been configured, changes aren't allowed. */ if (!SLIST_EMPTY(&priv->pins)) { res = EINVAL; break; } if (evcfg->gp_report_type != GPIO_EVENT_REPORT_DETAIL && evcfg->gp_report_type != GPIO_EVENT_REPORT_SUMMARY) { res = EINVAL; break; } priv->report_option = evcfg->gp_report_type; /* Reallocate the events buffer if the user wants it bigger. */ if (priv->report_option == GPIO_EVENT_REPORT_DETAIL && priv->numevents < evcfg->gp_fifo_size) { free(priv->events, M_GPIOC); priv->numevents = evcfg->gp_fifo_size; priv->events = malloc(priv->numevents * sizeof(struct gpio_event_detail), M_GPIOC, M_WAITOK | M_ZERO); priv->evidx_head = priv->evidx_tail = 0; } break; case FIONBIO: /* * This dummy handler is necessary to prevent fcntl() * from failing. The actual handling of non-blocking IO * is done using the O_NONBLOCK ioflag passed to the * read() syscall. 
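 *
 * For reference, the pin interrupt machinery above is consumed from
 * userland roughly as follows (an illustrative sketch, not part of this
 * change; it relies on the GPIOSETCONFIG ioctl and gpio_event_detail
 * layout from <sys/gpio.h> used in this file, a controller whose pin 4
 * supports edge interrupts, and the default detail-mode reporting):
 *
 *	#include <sys/types.h>
 *	#include <sys/gpio.h>
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct gpio_pin cfg;
 *		struct gpio_event_detail ev;
 *		int fd;
 *
 *		fd = open("/dev/gpioc0", O_RDWR);
 *		if (fd < 0)
 *			err(1, "open /dev/gpioc0");
 *
 *		// Request both-edge interrupts on the (hypothetical) pin 4.
 *		memset(&cfg, 0, sizeof(cfg));
 *		cfg.gp_pin = 4;
 *		cfg.gp_flags = GPIO_PIN_INPUT | GPIO_INTR_EDGE_BOTH;
 *		if (ioctl(fd, GPIOSETCONFIG, &cfg) != 0)
 *			err(1, "GPIOSETCONFIG");
 *
 *		// Each blocking read() returns one detail record per edge;
 *		// open with O_NONBLOCK to get EWOULDBLOCK instead of sleeping.
 *		for (;;) {
 *			if (read(fd, &ev, sizeof(ev)) != sizeof(ev))
 *				err(1, "read");
 *			printf("pin %d state %d at %jd\n", (int)ev.gp_pin,
 *			    (int)ev.gp_pinstate, (intmax_t)ev.gp_time);
 *		}
 *	}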
*/ res = 0; break; case FIOASYNC: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) { if (*(int *)arg == FASYNC) priv->async = true; else priv->async = false; } break; case FIOGETOWN: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) *(int *)arg = fgetown(&priv->sigio); break; case FIOSETOWN: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) res = fsetown(*(int *)arg, &priv->sigio); break; default: return (ENOTTY); break; } return (res); } static int gpioc_poll(struct cdev *dev, int events, struct thread *td) { struct gpioc_cdevpriv *priv; int err; int revents; revents = 0; err = devfs_get_cdevpriv((void **)&priv); if (err != 0) { revents = POLLERR; return (revents); } if (SLIST_EMPTY(&priv->pins)) { revents = POLLHUP; return (revents); } if (events & (POLLIN | POLLRDNORM)) { if (priv->evidx_head != priv->evidx_tail) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &priv->selinfo); } return (revents); } static int gpioc_kqfilter(struct cdev *dev, struct knote *kn) { struct gpioc_cdevpriv *priv; struct knlist *knlist; int err; err = devfs_get_cdevpriv((void **)&priv); if (err != 0) return err; if (SLIST_EMPTY(&priv->pins)) return (ENXIO); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &gpioc_read_filterops; kn->kn_hook = (void *)priv; break; default: return (EOPNOTSUPP); } knlist = &priv->selinfo.si_note; knlist_add(knlist, kn, 0); return (0); } static int gpioc_kqread(struct knote *kn, long hint) { struct gpioc_cdevpriv *priv = kn->kn_hook; size_t recsize; if (SLIST_EMPTY(&priv->pins)) { kn->kn_flags |= EV_EOF; return (1); } else { if (priv->evidx_head != priv->evidx_tail) { if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) recsize = sizeof(struct gpio_event_summary); else recsize = sizeof(struct gpio_event_detail); kn->kn_data = recsize * number_of_events(priv); return (1); } } return (0); } static void gpioc_kqdetach(struct knote *kn) { struct gpioc_cdevpriv *priv = kn->kn_hook; struct knlist *knlist = &priv->selinfo.si_note; knlist_remove(knlist, kn, 0); } static device_method_t gpioc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, gpioc_probe), DEVMETHOD(device_attach, gpioc_attach), DEVMETHOD(device_detach, gpioc_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), DEVMETHOD_END }; driver_t gpioc_driver = { "gpioc", gpioc_methods, sizeof(struct gpioc_softc) }; DRIVER_MODULE(gpioc, gpio, gpioc_driver, 0, 0); MODULE_VERSION(gpioc, 1); diff --git a/sys/dev/hid/hidraw.c b/sys/dev/hid/hidraw.c index 6a05b633cfc8..618a6d2d5c31 100644 --- a/sys/dev/hid/hidraw.c +++ b/sys/dev/hid/hidraw.c @@ -1,1018 +1,1018 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * Copyright (c) 2020 Vladimir Kondratyev * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * HID spec: http://www.usb.org/developers/devclass_docs/HID1_11.pdf */ #include #include "opt_hid.h" #include #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HID_DEBUG_VAR hidraw_debug #include #include #include #ifdef HID_DEBUG static int hidraw_debug = 0; static SYSCTL_NODE(_hw_hid, OID_AUTO, hidraw, CTLFLAG_RW, 0, "HID raw interface"); SYSCTL_INT(_hw_hid_hidraw, OID_AUTO, debug, CTLFLAG_RWTUN, &hidraw_debug, 0, "Debug level"); #endif #define HIDRAW_INDEX 0xFF /* Arbitrary high value */ #define HIDRAW_LOCAL_BUFSIZE 64 /* Size of on-stack buffer. */ #define HIDRAW_LOCAL_ALLOC(local_buf, size) \ (sizeof(local_buf) > (size) ? (local_buf) : \ malloc((size), M_DEVBUF, M_ZERO | M_WAITOK)) #define HIDRAW_LOCAL_FREE(local_buf, buf) \ if ((local_buf) != (buf)) { \ free((buf), M_DEVBUF); \ } struct hidraw_softc { device_t sc_dev; /* base device */ struct mtx sc_mtx; /* hidbus private mutex */ struct hid_rdesc_info *sc_rdesc; const struct hid_device_info *sc_hw; uint8_t *sc_q; hid_size_t *sc_qlen; int sc_head; int sc_tail; int sc_sleepcnt; struct selinfo sc_rsel; struct proc *sc_async; /* process that wants SIGIO */ struct { /* driver state */ bool open:1; /* device is open */ bool aslp:1; /* waiting for device data in read() */ bool sel:1; /* waiting for device data in poll() */ bool quiet:1; /* Ignore input data */ bool immed:1; /* return read data immediately */ bool uhid:1; /* driver switched in to uhid mode */ bool lock:1; /* input queue sleepable lock */ bool flush:1; /* do not wait for data in read() */ } sc_state; int sc_fflags; /* access mode for open lifetime */ struct cdev *dev; }; #ifdef COMPAT_FREEBSD32 struct hidraw_gen_descriptor32 { uint32_t hgd_data; /* void * */ uint16_t hgd_lang_id; uint16_t hgd_maxlen; uint16_t hgd_actlen; uint16_t hgd_offset; uint8_t hgd_config_index; uint8_t hgd_string_index; uint8_t hgd_iface_index; uint8_t hgd_altif_index; uint8_t hgd_endpt_index; uint8_t hgd_report_type; uint8_t reserved[8]; }; #define HIDRAW_GET_REPORT_DESC32 \ _IOC_NEWTYPE(HIDRAW_GET_REPORT_DESC, struct hidraw_gen_descriptor32) #define HIDRAW_GET_REPORT32 \ _IOC_NEWTYPE(HIDRAW_GET_REPORT, struct hidraw_gen_descriptor32) #define HIDRAW_SET_REPORT_DESC32 \ _IOC_NEWTYPE(HIDRAW_SET_REPORT_DESC, struct hidraw_gen_descriptor32) #define HIDRAW_SET_REPORT32 \ _IOC_NEWTYPE(HIDRAW_SET_REPORT, struct hidraw_gen_descriptor32) #endif static d_open_t hidraw_open; static d_read_t hidraw_read; static d_write_t hidraw_write; static d_ioctl_t hidraw_ioctl; static d_poll_t hidraw_poll; static d_kqfilter_t hidraw_kqfilter; static d_priv_dtor_t hidraw_dtor; static struct cdevsw 
hidraw_cdevsw = { .d_version = D_VERSION, .d_open = hidraw_open, .d_read = hidraw_read, .d_write = hidraw_write, .d_ioctl = hidraw_ioctl, .d_poll = hidraw_poll, .d_kqfilter = hidraw_kqfilter, .d_name = "hidraw", }; static hid_intr_t hidraw_intr; static device_identify_t hidraw_identify; static device_probe_t hidraw_probe; static device_attach_t hidraw_attach; static device_detach_t hidraw_detach; static int hidraw_kqread(struct knote *, long); static void hidraw_kqdetach(struct knote *); static void hidraw_notify(struct hidraw_softc *); -static struct filterops hidraw_filterops_read = { +static const struct filterops hidraw_filterops_read = { .f_isfd = 1, .f_detach = hidraw_kqdetach, .f_event = hidraw_kqread, }; static void hidraw_identify(driver_t *driver, device_t parent) { device_t child; if (device_find_child(parent, "hidraw", -1) == NULL) { child = BUS_ADD_CHILD(parent, 0, "hidraw", device_get_unit(parent)); if (child != NULL) hidbus_set_index(child, HIDRAW_INDEX); } } static int hidraw_probe(device_t self) { if (hidbus_get_index(self) != HIDRAW_INDEX) return (ENXIO); hidbus_set_desc(self, "Raw HID Device"); return (BUS_PROBE_GENERIC); } static int hidraw_attach(device_t self) { struct hidraw_softc *sc = device_get_softc(self); struct make_dev_args mda; int error; sc->sc_dev = self; sc->sc_rdesc = hidbus_get_rdesc_info(self); sc->sc_hw = hid_get_device_info(self); /* Hidraw mode does not require report descriptor to work */ if (sc->sc_rdesc->data == NULL || sc->sc_rdesc->len == 0) device_printf(self, "no report descriptor\n"); mtx_init(&sc->sc_mtx, "hidraw lock", NULL, MTX_DEF); knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx); make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK; mda.mda_devsw = &hidraw_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_OPERATOR; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->dev, "hidraw%d", device_get_unit(self)); if (error) { device_printf(self, "Can not create character device\n"); hidraw_detach(self); return (error); } #ifdef HIDRAW_MAKE_UHID_ALIAS (void)make_dev_alias(sc->dev, "uhid%d", device_get_unit(self)); #endif hidbus_set_lock(self, &sc->sc_mtx); hidbus_set_intr(self, hidraw_intr, sc); return (0); } static int hidraw_detach(device_t self) { struct hidraw_softc *sc = device_get_softc(self); DPRINTF("sc=%p\n", sc); if (sc->dev != NULL) { mtx_lock(&sc->sc_mtx); sc->dev->si_drv1 = NULL; /* Wake everyone */ hidraw_notify(sc); mtx_unlock(&sc->sc_mtx); destroy_dev(sc->dev); } knlist_clear(&sc->sc_rsel.si_note, 0); knlist_destroy(&sc->sc_rsel.si_note); seldrain(&sc->sc_rsel); mtx_destroy(&sc->sc_mtx); return (0); } void hidraw_intr(void *context, void *buf, hid_size_t len) { struct hidraw_softc *sc = context; int next; DPRINTFN(5, "len=%d\n", len); DPRINTFN(5, "data = %*D\n", len, buf, " "); next = (sc->sc_tail + 1) % HIDRAW_BUFFER_SIZE; if (sc->sc_state.quiet || next == sc->sc_head) return; bcopy(buf, sc->sc_q + sc->sc_tail * sc->sc_rdesc->rdsize, len); /* Make sure we don't process old data */ if (len < sc->sc_rdesc->rdsize) bzero(sc->sc_q + sc->sc_tail * sc->sc_rdesc->rdsize + len, sc->sc_rdesc->isize - len); sc->sc_qlen[sc->sc_tail] = len; sc->sc_tail = next; hidraw_notify(sc); } static inline int hidraw_lock_queue(struct hidraw_softc *sc, bool flush) { int error = 0; mtx_assert(&sc->sc_mtx, MA_OWNED); if (flush) sc->sc_state.flush = true; ++sc->sc_sleepcnt; while (sc->sc_state.lock && error == 0) { /* Flush is requested. 
Wakeup all readers and forbid sleeps */ if (flush && sc->sc_state.aslp) { sc->sc_state.aslp = false; DPRINTFN(5, "waking %p\n", &sc->sc_q); wakeup(&sc->sc_q); } error = mtx_sleep(&sc->sc_sleepcnt, &sc->sc_mtx, PZERO | PCATCH, "hidrawio", 0); } --sc->sc_sleepcnt; if (flush) sc->sc_state.flush = false; if (error == 0) sc->sc_state.lock = true; return (error); } static inline void hidraw_unlock_queue(struct hidraw_softc *sc) { mtx_assert(&sc->sc_mtx, MA_OWNED); KASSERT(sc->sc_state.lock, ("input buffer is not locked")); if (sc->sc_sleepcnt != 0) wakeup_one(&sc->sc_sleepcnt); sc->sc_state.lock = false; } static int hidraw_open(struct cdev *dev, int flag, int mode, struct thread *td) { struct hidraw_softc *sc; int error; sc = dev->si_drv1; if (sc == NULL) return (ENXIO); DPRINTF("sc=%p\n", sc); mtx_lock(&sc->sc_mtx); if (sc->sc_state.open) { mtx_unlock(&sc->sc_mtx); return (EBUSY); } sc->sc_state.open = true; mtx_unlock(&sc->sc_mtx); error = devfs_set_cdevpriv(sc, hidraw_dtor); if (error != 0) { mtx_lock(&sc->sc_mtx); sc->sc_state.open = false; mtx_unlock(&sc->sc_mtx); return (error); } sc->sc_q = malloc(sc->sc_rdesc->rdsize * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); sc->sc_qlen = malloc(sizeof(hid_size_t) * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); /* Set up interrupt pipe. */ sc->sc_state.immed = false; sc->sc_async = 0; sc->sc_state.uhid = false; /* hidraw mode is default */ sc->sc_state.quiet = false; sc->sc_head = sc->sc_tail = 0; sc->sc_fflags = flag; hid_intr_start(sc->sc_dev); return (0); } static void hidraw_dtor(void *data) { struct hidraw_softc *sc = data; DPRINTF("sc=%p\n", sc); /* Disable interrupts. */ hid_intr_stop(sc->sc_dev); sc->sc_tail = sc->sc_head = 0; sc->sc_async = 0; free(sc->sc_q, M_DEVBUF); free(sc->sc_qlen, M_DEVBUF); sc->sc_q = NULL; mtx_lock(&sc->sc_mtx); sc->sc_state.open = false; mtx_unlock(&sc->sc_mtx); } static int hidraw_read(struct cdev *dev, struct uio *uio, int flag) { struct hidraw_softc *sc; size_t length; int error; DPRINTFN(1, "\n"); sc = dev->si_drv1; if (sc == NULL) return (EIO); mtx_lock(&sc->sc_mtx); error = dev->si_drv1 == NULL ? EIO : hidraw_lock_queue(sc, false); if (error != 0) { mtx_unlock(&sc->sc_mtx); return (error); } if (sc->sc_state.immed) { mtx_unlock(&sc->sc_mtx); DPRINTFN(1, "immed\n"); error = hid_get_report(sc->sc_dev, sc->sc_q, sc->sc_rdesc->isize, NULL, HID_INPUT_REPORT, sc->sc_rdesc->iid); if (error == 0) error = uiomove(sc->sc_q, sc->sc_rdesc->isize, uio); mtx_lock(&sc->sc_mtx); goto exit; } while (sc->sc_tail == sc->sc_head && !sc->sc_state.flush) { if (flag & O_NONBLOCK) { error = EWOULDBLOCK; goto exit; } sc->sc_state.aslp = true; DPRINTFN(5, "sleep on %p\n", &sc->sc_q); error = mtx_sleep(&sc->sc_q, &sc->sc_mtx, PZERO | PCATCH, "hidrawrd", 0); DPRINTFN(5, "woke, error=%d\n", error); if (dev->si_drv1 == NULL) error = EIO; if (error) { sc->sc_state.aslp = false; goto exit; } } while (sc->sc_tail != sc->sc_head && uio->uio_resid > 0) { length = min(uio->uio_resid, sc->sc_state.uhid ? sc->sc_rdesc->isize : sc->sc_qlen[sc->sc_head]); mtx_unlock(&sc->sc_mtx); /* Copy the data to the user process. */ DPRINTFN(5, "got %lu chars\n", (u_long)length); error = uiomove(sc->sc_q + sc->sc_head * sc->sc_rdesc->rdsize, length, uio); mtx_lock(&sc->sc_mtx); if (error != 0) goto exit; /* Remove a small chunk from the input queue. */ sc->sc_head = (sc->sc_head + 1) % HIDRAW_BUFFER_SIZE; /* * In uhid mode transfer as many chunks as possible. Hidraw * packets are transferred one by one due to different length. 
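/*
 * Editorial aside (illustrative only, not part of this patch): a minimal
 * userspace sketch of the path implemented by hidraw_read().  The device
 * path and buffer size are assumptions for the example; in hidraw mode each
 * read() returns a single queued input report of its actual length, and a
 * blocking caller sleeps in "hidrawrd" until hidraw_intr() queues data.
 */
#if 0	/* example only, never compiled into the driver */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	unsigned char report[64];	/* size is an assumption */
	ssize_t n;
	int fd;

	fd = open("/dev/hidraw0", O_RDONLY);	/* path is an assumption */
	if (fd < 0)
		return (1);
	n = read(fd, report, sizeof(report));	/* blocks until a report arrives */
	if (n > 0)
		printf("received %zd byte input report\n", n);
	close(fd);
	return (0);
}
#endif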
*/ if (!sc->sc_state.uhid) goto exit; } exit: hidraw_unlock_queue(sc); mtx_unlock(&sc->sc_mtx); return (error); } static int hidraw_write(struct cdev *dev, struct uio *uio, int flag) { uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE], *buf; struct hidraw_softc *sc; int error; int size; size_t buf_offset; uint8_t id = 0; DPRINTFN(1, "\n"); sc = dev->si_drv1; if (sc == NULL) return (EIO); if (sc->sc_rdesc->osize == 0) return (EOPNOTSUPP); buf_offset = 0; if (sc->sc_state.uhid) { size = sc->sc_rdesc->osize; if (uio->uio_resid != size) return (EINVAL); } else { size = uio->uio_resid; if (size < 2) return (EINVAL); /* Strip leading 0 if the device doesnt use numbered reports */ error = uiomove(&id, 1, uio); if (error) return (error); if (id != 0) buf_offset++; else size--; /* Check if underlying driver could process this request */ if (size > sc->sc_rdesc->wrsize) return (ENOBUFS); } buf = HIDRAW_LOCAL_ALLOC(local_buf, size); buf[0] = id; error = uiomove(buf + buf_offset, uio->uio_resid, uio); if (error == 0) error = hid_write(sc->sc_dev, buf, size); HIDRAW_LOCAL_FREE(local_buf, buf); return (error); } #ifdef COMPAT_FREEBSD32 static void update_hgd32(const struct hidraw_gen_descriptor *hgd, struct hidraw_gen_descriptor32 *hgd32) { /* Don't update hgd_data pointer */ CP(*hgd, *hgd32, hgd_lang_id); CP(*hgd, *hgd32, hgd_maxlen); CP(*hgd, *hgd32, hgd_actlen); CP(*hgd, *hgd32, hgd_offset); CP(*hgd, *hgd32, hgd_config_index); CP(*hgd, *hgd32, hgd_string_index); CP(*hgd, *hgd32, hgd_iface_index); CP(*hgd, *hgd32, hgd_altif_index); CP(*hgd, *hgd32, hgd_endpt_index); CP(*hgd, *hgd32, hgd_report_type); /* Don't update reserved */ } #endif static int hidraw_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE]; #ifdef COMPAT_FREEBSD32 struct hidraw_gen_descriptor local_hgd; struct hidraw_gen_descriptor32 *hgd32 = NULL; #endif void *buf; struct hidraw_softc *sc; struct hidraw_device_info *hdi; struct hidraw_gen_descriptor *hgd; struct hidraw_report_descriptor *hrd; struct hidraw_devinfo *hd; const char *devname; uint32_t size; int id, len; int error = 0; DPRINTFN(2, "cmd=%lx\n", cmd); sc = dev->si_drv1; if (sc == NULL) return (EIO); hgd = (struct hidraw_gen_descriptor *)addr; #ifdef COMPAT_FREEBSD32 switch (cmd) { case HIDRAW_GET_REPORT_DESC32: case HIDRAW_GET_REPORT32: case HIDRAW_SET_REPORT_DESC32: case HIDRAW_SET_REPORT32: cmd = _IOC_NEWTYPE(cmd, struct hidraw_gen_descriptor); hgd32 = (struct hidraw_gen_descriptor32 *)addr; hgd = &local_hgd; PTRIN_CP(*hgd32, *hgd, hgd_data); CP(*hgd32, *hgd, hgd_lang_id); CP(*hgd32, *hgd, hgd_maxlen); CP(*hgd32, *hgd, hgd_actlen); CP(*hgd32, *hgd, hgd_offset); CP(*hgd32, *hgd, hgd_config_index); CP(*hgd32, *hgd, hgd_string_index); CP(*hgd32, *hgd, hgd_iface_index); CP(*hgd32, *hgd, hgd_altif_index); CP(*hgd32, *hgd, hgd_endpt_index); CP(*hgd32, *hgd, hgd_report_type); /* Don't copy reserved */ break; } #endif /* fixed-length ioctls handling */ switch (cmd) { case FIONBIO: /* All handled in the upper FS layer. */ return (0); case FIOASYNC: mtx_lock(&sc->sc_mtx); if (*(int *)addr) { if (sc->sc_async == NULL) { sc->sc_async = td->td_proc; DPRINTF("FIOASYNC %p\n", sc->sc_async); } else error = EBUSY; } else sc->sc_async = NULL; mtx_unlock(&sc->sc_mtx); return (error); /* XXX this is not the most general solution. 
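/*
 * Editorial aside (illustrative only, not part of this patch): the
 * HIDRAW_LOCAL_ALLOC()/HIDRAW_LOCAL_FREE() pair defined earlier in this file
 * avoids a malloc() for small transfers such as the hidraw_write() above: it
 * hands back the caller's stack buffer whenever sizeof(local_buf) is strictly
 * greater than the requested size and falls back to M_DEVBUF otherwise, and
 * the free side only frees when the pointer differs from the stack buffer,
 * so both paths unwind identically:
 *
 *	uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE];
 *	void *buf = HIDRAW_LOCAL_ALLOC(local_buf, len); /- heap iff len >= 64 -/
 *	/- ... use up to len bytes of buf ... -/
 *	HIDRAW_LOCAL_FREE(local_buf, buf);              /- no-op in the stack case -/
 */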
*/ case TIOCSPGRP: mtx_lock(&sc->sc_mtx); if (sc->sc_async == NULL) error = EINVAL; else if (*(int *)addr != sc->sc_async->p_pgid) error = EPERM; mtx_unlock(&sc->sc_mtx); return (error); case HIDRAW_GET_REPORT_DESC: if (sc->sc_rdesc->data == NULL || sc->sc_rdesc->len == 0) return (EOPNOTSUPP); mtx_lock(&sc->sc_mtx); sc->sc_state.uhid = true; mtx_unlock(&sc->sc_mtx); if (sc->sc_rdesc->len > hgd->hgd_maxlen) { size = hgd->hgd_maxlen; } else { size = sc->sc_rdesc->len; } hgd->hgd_actlen = size; #ifdef COMPAT_FREEBSD32 if (hgd32 != NULL) update_hgd32(hgd, hgd32); #endif if (hgd->hgd_data == NULL) return (0); /* descriptor length only */ return (copyout(sc->sc_rdesc->data, hgd->hgd_data, size)); case HIDRAW_SET_REPORT_DESC: if (!(sc->sc_fflags & FWRITE)) return (EPERM); /* check privileges */ error = priv_check(curthread, PRIV_DRIVER); if (error) return (error); /* Stop interrupts and clear input report buffer */ mtx_lock(&sc->sc_mtx); sc->sc_tail = sc->sc_head = 0; error = hidraw_lock_queue(sc, true); if (error == 0) sc->sc_state.quiet = true; mtx_unlock(&sc->sc_mtx); if (error != 0) return (error); buf = HIDRAW_LOCAL_ALLOC(local_buf, hgd->hgd_maxlen); error = copyin(hgd->hgd_data, buf, hgd->hgd_maxlen); if (error == 0) { bus_topo_lock(); error = hid_set_report_descr(sc->sc_dev, buf, hgd->hgd_maxlen); bus_topo_unlock(); } HIDRAW_LOCAL_FREE(local_buf, buf); /* Realloc hidraw input queue */ if (error == 0) sc->sc_q = realloc(sc->sc_q, sc->sc_rdesc->rdsize * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); /* Start interrupts again */ mtx_lock(&sc->sc_mtx); sc->sc_state.quiet = false; hidraw_unlock_queue(sc); mtx_unlock(&sc->sc_mtx); return (error); case HIDRAW_SET_IMMED: if (!(sc->sc_fflags & FREAD)) return (EPERM); if (*(int *)addr) { /* XXX should read into ibuf, but does it matter? */ size = sc->sc_rdesc->isize; buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_report(sc->sc_dev, buf, size, NULL, HID_INPUT_REPORT, sc->sc_rdesc->iid); HIDRAW_LOCAL_FREE(local_buf, buf); if (error) return (EOPNOTSUPP); mtx_lock(&sc->sc_mtx); sc->sc_state.immed = true; mtx_unlock(&sc->sc_mtx); } else { mtx_lock(&sc->sc_mtx); sc->sc_state.immed = false; mtx_unlock(&sc->sc_mtx); } return (0); case HIDRAW_GET_REPORT: if (!(sc->sc_fflags & FREAD)) return (EPERM); switch (hgd->hgd_report_type) { case HID_INPUT_REPORT: size = sc->sc_rdesc->isize; id = sc->sc_rdesc->iid; break; case HID_OUTPUT_REPORT: size = sc->sc_rdesc->osize; id = sc->sc_rdesc->oid; break; case HID_FEATURE_REPORT: size = sc->sc_rdesc->fsize; id = sc->sc_rdesc->fid; break; default: return (EINVAL); } if (id != 0) { error = copyin(hgd->hgd_data, &id, 1); if (error != 0) return (error); } size = MIN(hgd->hgd_maxlen, size); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_report(sc->sc_dev, buf, size, NULL, hgd->hgd_report_type, id); if (!error) error = copyout(buf, hgd->hgd_data, size); HIDRAW_LOCAL_FREE(local_buf, buf); #ifdef COMPAT_FREEBSD32 /* * HIDRAW_GET_REPORT is declared _IOWR, but hgd is not written * so we don't call update_hgd32(). 
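/*
 * Editorial aside (illustrative only, not part of this patch): sketch of a
 * userspace HIDRAW_GET_REPORT call against the handler above.  The header
 * paths and the choice of a feature report are assumptions for the example;
 * when the report descriptor defines numbered reports, the handler copies
 * the requested id from the first data byte, as the copyin() above shows.
 */
#if 0	/* example only, never compiled into the driver */
#include <sys/ioctl.h>
#include <dev/hid/hid.h>	/* header paths are assumptions */
#include <dev/hid/hidraw.h>
#include <string.h>

static int
get_feature_report(int fd, unsigned char *buf, size_t len, unsigned char id)
{
	struct hidraw_gen_descriptor hgd;

	memset(&hgd, 0, sizeof(hgd));
	hgd.hgd_report_type = HID_FEATURE_REPORT;
	hgd.hgd_maxlen = (uint16_t)len;
	hgd.hgd_data = buf;
	buf[0] = id;	/* consumed only for numbered reports */
	return (ioctl(fd, HIDRAW_GET_REPORT, &hgd));
}
#endif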
*/ #endif return (error); case HIDRAW_SET_REPORT: if (!(sc->sc_fflags & FWRITE)) return (EPERM); switch (hgd->hgd_report_type) { case HID_INPUT_REPORT: size = sc->sc_rdesc->isize; id = sc->sc_rdesc->iid; break; case HID_OUTPUT_REPORT: size = sc->sc_rdesc->osize; id = sc->sc_rdesc->oid; break; case HID_FEATURE_REPORT: size = sc->sc_rdesc->fsize; id = sc->sc_rdesc->fid; break; default: return (EINVAL); } size = MIN(hgd->hgd_maxlen, size); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = copyin(hgd->hgd_data, buf, size); if (error == 0) { if (id != 0) id = *(uint8_t *)buf; error = hid_set_report(sc->sc_dev, buf, size, hgd->hgd_report_type, id); } HIDRAW_LOCAL_FREE(local_buf, buf); return (error); case HIDRAW_GET_REPORT_ID: *(int *)addr = 0; /* XXX: we only support reportid 0? */ return (0); case HIDRAW_GET_DEVICEINFO: hdi = (struct hidraw_device_info *)addr; bzero(hdi, sizeof(struct hidraw_device_info)); hdi->hdi_product = sc->sc_hw->idProduct; hdi->hdi_vendor = sc->sc_hw->idVendor; hdi->hdi_version = sc->sc_hw->idVersion; hdi->hdi_bustype = sc->sc_hw->idBus; strlcpy(hdi->hdi_name, sc->sc_hw->name, sizeof(hdi->hdi_name)); strlcpy(hdi->hdi_phys, device_get_nameunit(sc->sc_dev), sizeof(hdi->hdi_phys)); strlcpy(hdi->hdi_uniq, sc->sc_hw->serial, sizeof(hdi->hdi_uniq)); snprintf(hdi->hdi_release, sizeof(hdi->hdi_release), "%x.%02x", sc->sc_hw->idVersion >> 8, sc->sc_hw->idVersion & 0xff); return(0); case HIDIOCGRDESCSIZE: *(int *)addr = sc->sc_hw->rdescsize; return (0); case HIDIOCGRDESC: hrd = *(struct hidraw_report_descriptor **)addr; error = copyin(&hrd->size, &size, sizeof(uint32_t)); if (error) return (error); /* * HID_MAX_DESCRIPTOR_SIZE-1 is a limit of report descriptor * size in current Linux implementation. */ if (size >= HID_MAX_DESCRIPTOR_SIZE) return (EINVAL); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_rdesc(sc->sc_dev, buf, size); if (error == 0) { size = MIN(size, sc->sc_rdesc->len); error = copyout(buf, hrd->value, size); } HIDRAW_LOCAL_FREE(local_buf, buf); return (error); case HIDIOCGRAWINFO: hd = (struct hidraw_devinfo *)addr; hd->bustype = sc->sc_hw->idBus; hd->vendor = sc->sc_hw->idVendor; hd->product = sc->sc_hw->idProduct; return (0); } /* variable-length ioctls handling */ len = IOCPARM_LEN(cmd); switch (IOCBASECMD(cmd)) { case HIDIOCGRAWNAME(0): strlcpy(addr, sc->sc_hw->name, len); td->td_retval[0] = min(strlen(sc->sc_hw->name) + 1, len); return (0); case HIDIOCGRAWPHYS(0): devname = device_get_nameunit(sc->sc_dev); strlcpy(addr, devname, len); td->td_retval[0] = min(strlen(devname) + 1, len); return (0); case HIDIOCSFEATURE(0): if (!(sc->sc_fflags & FWRITE)) return (EPERM); if (len < 2) return (EINVAL); id = *(uint8_t *)addr; if (id == 0) { addr = (uint8_t *)addr + 1; len--; } return (hid_set_report(sc->sc_dev, addr, len, HID_FEATURE_REPORT, id)); case HIDIOCGFEATURE(0): if (!(sc->sc_fflags & FREAD)) return (EPERM); if (len < 2) return (EINVAL); id = *(uint8_t *)addr; if (id == 0) { addr = (uint8_t *)addr + 1; len--; } return (hid_get_report(sc->sc_dev, addr, len, NULL, HID_FEATURE_REPORT, id)); case HIDIOCGRAWUNIQ(0): strlcpy(addr, sc->sc_hw->serial, len); td->td_retval[0] = min(strlen(sc->sc_hw->serial) + 1, len); return (0); } return (EINVAL); } static int hidraw_poll(struct cdev *dev, int events, struct thread *td) { struct hidraw_softc *sc; int revents = 0; sc = dev->si_drv1; if (sc == NULL) return (POLLHUP); if (events & (POLLOUT | POLLWRNORM) && (sc->sc_fflags & FWRITE)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLIN | POLLRDNORM) 
&& (sc->sc_fflags & FREAD)) { mtx_lock(&sc->sc_mtx); if (sc->sc_head != sc->sc_tail) revents |= events & (POLLIN | POLLRDNORM); else { sc->sc_state.sel = true; selrecord(td, &sc->sc_rsel); } mtx_unlock(&sc->sc_mtx); } return (revents); } static int hidraw_kqfilter(struct cdev *dev, struct knote *kn) { struct hidraw_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENXIO); switch(kn->kn_filter) { case EVFILT_READ: if (sc->sc_fflags & FREAD) { kn->kn_fop = &hidraw_filterops_read; break; } /* FALLTHROUGH */ default: return(EINVAL); } kn->kn_hook = sc; knlist_add(&sc->sc_rsel.si_note, kn, 0); return (0); } static int hidraw_kqread(struct knote *kn, long hint) { struct hidraw_softc *sc; int ret; sc = kn->kn_hook; mtx_assert(&sc->sc_mtx, MA_OWNED); if (sc->dev->si_drv1 == NULL) { kn->kn_flags |= EV_EOF; ret = 1; } else ret = (sc->sc_head != sc->sc_tail) ? 1 : 0; return (ret); } static void hidraw_kqdetach(struct knote *kn) { struct hidraw_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->sc_rsel.si_note, kn, 0); } static void hidraw_notify(struct hidraw_softc *sc) { mtx_assert(&sc->sc_mtx, MA_OWNED); if (sc->sc_state.aslp) { sc->sc_state.aslp = false; DPRINTFN(5, "waking %p\n", &sc->sc_q); wakeup(&sc->sc_q); } if (sc->sc_state.sel) { sc->sc_state.sel = false; selwakeuppri(&sc->sc_rsel, PZERO); } if (sc->sc_async != NULL) { DPRINTFN(3, "sending SIGIO %p\n", sc->sc_async); PROC_LOCK(sc->sc_async); kern_psignal(sc->sc_async, SIGIO); PROC_UNLOCK(sc->sc_async); } KNOTE_LOCKED(&sc->sc_rsel.si_note, 0); } static device_method_t hidraw_methods[] = { /* Device interface */ DEVMETHOD(device_identify, hidraw_identify), DEVMETHOD(device_probe, hidraw_probe), DEVMETHOD(device_attach, hidraw_attach), DEVMETHOD(device_detach, hidraw_detach), DEVMETHOD_END }; static driver_t hidraw_driver = { "hidraw", hidraw_methods, sizeof(struct hidraw_softc) }; DRIVER_MODULE(hidraw, hidbus, hidraw_driver, NULL, NULL); MODULE_DEPEND(hidraw, hidbus, 1, 1, 1); MODULE_DEPEND(hidraw, hid, 1, 1, 1); MODULE_VERSION(hidraw, 1); diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 215b1f7bd09e..6448fdc74160 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -1,1585 +1,1585 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE_ORDERED */ #include #include /* kern_ioctl() */ #include #include /* vtophys */ #include /* vtophys */ #include #include #include #include #include #include #include /* sockaddrs */ #include #include /* kthread_add() */ #include /* PROC_LOCK() */ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ #include /* taskqueue_enqueue(), taskqueue_create(), ... */ #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ #include #include #include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ static void nm_kqueue_notify(void *opaque, int pending) { struct nm_selinfo *si = opaque; /* We use a non-zero hint to distinguish this notification call * from the call done in kqueue_scan(), which uses hint=0. */ KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100); } int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) { int err; TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si); si->ntfytq = taskqueue_create(name, M_NOWAIT, taskqueue_thread_enqueue, &si->ntfytq); if (si->ntfytq == NULL) return -ENOMEM; err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name); if (err) { taskqueue_free(si->ntfytq); si->ntfytq = NULL; return err; } snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name); mtx_init(&si->m, si->mtxname, NULL, MTX_DEF); knlist_init_mtx(&si->si.si_note, &si->m); si->kqueue_users = 0; return (0); } void nm_os_selinfo_uninit(NM_SELINFO_T *si) { if (si->ntfytq == NULL) { return; /* si was not initialized */ } taskqueue_drain(si->ntfytq, &si->ntfytask); taskqueue_free(si->ntfytq); si->ntfytq = NULL; knlist_delete(&si->si.si_note, curthread, /*islocked=*/0); knlist_destroy(&si->si.si_note); /* now we don't need the mutex anymore */ mtx_destroy(&si->m); } void * nm_os_malloc(size_t size) { return malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); } void * nm_os_realloc(void *addr, size_t new_size, size_t old_size __unused) { return realloc(addr, new_size, M_DEVBUF, M_NOWAIT | M_ZERO); } void nm_os_free(void *addr) { free(addr, M_DEVBUF); } void nm_os_ifnet_lock(void) { IFNET_RLOCK(); } void nm_os_ifnet_unlock(void) { IFNET_RUNLOCK(); } static int netmap_use_count = 0; void nm_os_get_module(void) { netmap_use_count++; } void nm_os_put_module(void) { netmap_use_count--; } static void netmap_ifnet_arrival_handler(void *arg __unused, if_t ifp) { netmap_undo_zombie(ifp); } static void netmap_ifnet_departure_handler(void *arg __unused, if_t ifp) { netmap_make_zombie(ifp); } static eventhandler_tag nm_ifnet_ah_tag; static eventhandler_tag nm_ifnet_dh_tag; int nm_os_ifnet_init(void) { nm_ifnet_ah_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, netmap_ifnet_arrival_handler, NULL, 
EVENTHANDLER_PRI_ANY); nm_ifnet_dh_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, netmap_ifnet_departure_handler, NULL, EVENTHANDLER_PRI_ANY); return 0; } void nm_os_ifnet_fini(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nm_ifnet_ah_tag); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nm_ifnet_dh_tag); } unsigned nm_os_ifnet_mtu(if_t ifp) { return if_getmtu(ifp); } rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; int nw = len / 2; int i; for (i = 0; i < nw; i++) cur_sum += be16toh(words[i]); if (len & 1) cur_sum += (data[len-1] << 8); return cur_sum; } /* Fold a raw checksum: 'cur_sum' is in host byte order, while the * return value is in network byte order. */ uint16_t nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); return htobe16((~cur_sum) & 0xFFFF); } uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET uint16_t pseudolen = datalen + iph->protocol; /* Compute and insert the pseudo-header checksum. */ *check = in_pseudo(iph->saddr, iph->daddr, htobe16(pseudolen)); /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet4 segmentation not supported"); } #endif } void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet6 segmentation not supported"); } #endif } /* on FreeBSD we send up one packet at a time */ void * nm_os_send_up(if_t ifp, struct mbuf *m, struct mbuf *prev) { NA(ifp)->if_input(ifp, m); return NULL; } int nm_os_mbuf_has_csum_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } int nm_os_mbuf_has_seg_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & CSUM_TSO; } static void freebsd_generic_rx_handler(if_t ifp, struct mbuf *m) { int stolen; if (unlikely(!NM_NA_VALID(ifp))) { nm_prlim(1, "Warning: RX packet intercepted, but no" " emulated adapter"); return; } do { struct mbuf *n; n = m->m_nextpkt; m->m_nextpkt = NULL; stolen = generic_rx_handler(ifp, m); if (!stolen) { NA(ifp)->if_input(ifp, m); } m = n; } while (m != NULL); } /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; if_t ifp = na->ifp; int ret = 0; nm_os_ifnet_lock(); if (intercept) { if_setcapenablebit(ifp, IFCAP_NETMAP, 0); if_setinputfn(ifp, freebsd_generic_rx_handler); } else { if_setcapenablebit(ifp, 0, IFCAP_NETMAP); if_setinputfn(ifp, na->if_input); } nm_os_ifnet_unlock(); return ret; } /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. 
* Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; if_t ifp = netmap_generic_getifp(gna); nm_os_ifnet_lock(); if (intercept) { na->if_transmit = if_gettransmitfn(ifp); if_settransmitfn(ifp, netmap_transmit); } else { if_settransmitfn(ifp, na->if_transmit); } nm_os_ifnet_unlock(); return 0; } /* * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * * Zero-copy transmission is possible if netmap is attached directly to a * hardware interface: when cleaning we simply wait for the mbuf cluster * refcount to decrement to 1, indicating that the driver has completed * transmission and is done with the buffer. However, this approach can * lead to queue deadlocks when attaching to software interfaces (e.g., * if_bridge) since we cannot rely on member ports to promptly reclaim * transmitted mbufs. Since there is no easy way to distinguish these * cases, we currently always copy the buffer. * * On multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; u_int len = a->len; if_t ifp = a->ifp; struct mbuf *m = a->m; M_ASSERTPKTHDR(m); KASSERT((m->m_flags & M_EXT) != 0, ("%s: mbuf %p has no cluster", __func__, m)); if (MBUF_REFCNT(m) != 1) { nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } if (unlikely(m->m_ext.ext_size < len)) { nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } m_copyback(m, 0, len, a->addr); m->m_len = m->m_pkthdr.len = len; SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ CURVNET_SET(if_getvnet(ifp)); ret = NA(ifp)->if_transmit(ifp, m); CURVNET_RESTORE(); return ret ? -1 : 0; } struct netmap_adapter * netmap_getna(if_t ifp) { return (NA(ifp)); } /* * The following two functions are empty until we have a generic * way to extract the info from the ifp */ int nm_os_generic_find_num_desc(if_t ifp, unsigned int *tx, unsigned int *rx) { return 0; } void nm_os_generic_find_num_queues(if_t ifp, u_int *txq, u_int *rxq) { unsigned num_rings = netmap_generic_rings ? netmap_generic_rings : 1; *txq = num_rings; *rxq = num_rings; } void nm_os_generic_set_features(struct netmap_generic_adapter *gna) { gna->rxsg = 1; /* Supported through m_copydata. */ gna->txqdisc = 0; /* Not supported. */ } void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { mit->mit_pending = 0; mit->mit_ring_idx = idx; mit->mit_na = na; } void nm_os_mitigation_start(struct nm_generic_mit *mit) { } void nm_os_mitigation_restart(struct nm_generic_mit *mit) { } int nm_os_mitigation_active(struct nm_generic_mit *mit) { return 0; } void nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { } static int nm_vi_dummy(if_t ifp, u_long cmd, caddr_t addr) { return EINVAL; } static void nm_vi_start(if_t ifp) { panic("nm_vi_start() must not be called"); } /* * Index manager of persistent virtual interfaces. * It is used to decide the lowest byte of the MAC address. 
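/*
 * Editorial aside (illustrative only, not part of this patch): with the
 * generation scheme in nm_os_vi_persist() below, the station address starts
 * with 00:be, is followed by the in-memory image of the 32-bit 'ticks'
 * value, and then has its last octet overwritten by the persistent VI
 * index.  On a little-endian machine, for example:
 *
 *	unit == 3, ticks == 0x11223344  ->  00:be:44:33:22:03
 */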
* We use the same algorithm with management of bridge port index. */ #define NM_VI_MAX 255 static struct { uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */ uint8_t active; struct mtx lock; } nm_vi_indices; void nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) nm_vi_indices.index[i] = i; nm_vi_indices.active = 0; mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF); } /* return -1 if no index available */ static int nm_vi_get_index(void) { int ret; mtx_lock(&nm_vi_indices.lock); ret = nm_vi_indices.active == NM_VI_MAX ? -1 : nm_vi_indices.index[nm_vi_indices.active++]; mtx_unlock(&nm_vi_indices.lock); return ret; } static void nm_vi_free_index(uint8_t val) { int i, lim; mtx_lock(&nm_vi_indices.lock); lim = nm_vi_indices.active; for (i = 0; i < lim; i++) { if (nm_vi_indices.index[i] == val) { /* swap index[lim-1] and j */ int tmp = nm_vi_indices.index[lim-1]; nm_vi_indices.index[lim-1] = val; nm_vi_indices.index[i] = tmp; nm_vi_indices.active--; break; } } if (lim == nm_vi_indices.active) nm_prerr("Index %u not found", val); mtx_unlock(&nm_vi_indices.lock); } #undef NM_VI_MAX /* * Implementation of a netmap-capable virtual interface that * registered to the system. * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9. * * Note: Linux sets refcount to 0 on allocation of net_device, * then increments it on registration to the system. * FreeBSD sets refcount to 1 on if_alloc(), and does not * increment this refcount on if_attach(). */ int nm_os_vi_persist(const char *name, if_t *ret) { if_t ifp; u_short macaddr_hi; uint32_t macaddr_mid; u_char eaddr[6]; int unit = nm_vi_get_index(); /* just to decide MAC address */ if (unit < 0) return EBUSY; /* * We use the same MAC address generation method with tap * except for the highest octet is 00:be instead of 00:bd */ macaddr_hi = htons(0x00be); /* XXX tap + 1 */ macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (uint8_t)unit; ifp = if_alloc(IFT_ETHER); if_initname(ifp, name, IF_DUNIT_NONE); if_setflags(ifp, IFF_UP | IFF_SIMPLEX | IFF_MULTICAST); if_setinitfn(ifp, (void *)nm_vi_dummy); if_setioctlfn(ifp, nm_vi_dummy); if_setstartfn(ifp, nm_vi_start); if_setmtu(ifp, ETHERMTU); if_setsendqlen(ifp, ifqmaxlen); if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0); if_setcapenablebit(ifp, IFCAP_LINKSTATE, 0); ether_ifattach(ifp, eaddr); *ret = ifp; return 0; } /* unregister from the system and drop the final refcount */ void nm_os_vi_detach(if_t ifp) { nm_vi_free_index(((char *)if_getlladdr(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } #ifdef WITH_EXTMEM #include #include #include struct nm_os_extmem { vm_object_t obj; vm_offset_t kva; vm_offset_t size; uintptr_t scan; }; void nm_os_extmem_delete(struct nm_os_extmem *e) { nm_prinf("freeing %zx bytes", (size_t)e->size); vm_map_remove(kernel_map, e->kva, e->kva + e->size); nm_os_free(e); } char * nm_os_extmem_nextpage(struct nm_os_extmem *e) { char *rv = NULL; if (e->scan < e->kva + e->size) { rv = (char *)e->scan; e->scan += PAGE_SIZE; } return rv; } int nm_os_extmem_isequal(struct nm_os_extmem *e1, struct nm_os_extmem *e2) { return (e1->obj == e2->obj); } int nm_os_extmem_nr_pages(struct nm_os_extmem *e) { return e->size >> PAGE_SHIFT; } struct nm_os_extmem * nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; vm_prot_t prot; vm_pindex_t index; boolean_t wired; struct nm_os_extmem *e = NULL; int 
rv, error = 0; e = nm_os_malloc(sizeof(*e)); if (e == NULL) { error = ENOMEM; goto out; } map = &curthread->td_proc->p_vmspace->vm_map; rv = vm_map_lookup(&map, p, VM_PROT_RW, &entry, &obj, &index, &prot, &wired); if (rv != KERN_SUCCESS) { nm_prerr("address %lx not found", p); error = vm_mmap_to_errno(rv); goto out_free; } vm_object_reference(obj); /* check that we are given the whole vm_object ? */ vm_map_lookup_done(map, entry); e->obj = obj; /* Wire the memory and add the vm_object to the kernel map, * to make sure that it is not freed even if all the processes * that are mmap()ing should munmap() it. */ e->kva = vm_map_min(kernel_map); e->size = obj->size << PAGE_SHIFT; rv = vm_map_find(kernel_map, obj, 0, &e->kva, e->size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_find(%zx) failed", (size_t)e->size); error = vm_mmap_to_errno(rv); goto out_rel; } rv = vm_map_wire(kernel_map, e->kva, e->kva + e->size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_wire failed"); error = vm_mmap_to_errno(rv); goto out_rem; } e->scan = e->kva; return e; out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); out_rel: vm_object_deallocate(e->obj); e->obj = NULL; out_free: nm_os_free(e); out: if (perror) *perror = error; return NULL; } #endif /* WITH_EXTMEM */ /* ================== PTNETMAP GUEST SUPPORT ==================== */ #ifdef WITH_PTNETMAP #include #include #include /* bus_dmamap_* */ #include #include #include /* * ptnetmap memory device (memdev) for freebsd guest, * ssed to expose host netmap memory to the guest through a PCI BAR. */ /* * ptnetmap memdev private data structure */ struct ptnetmap_memdev { device_t dev; struct resource *pci_io; struct resource *pci_mem; struct netmap_mem_d *nm_mem; }; static int ptn_memdev_probe(device_t); static int ptn_memdev_attach(device_t); static int ptn_memdev_detach(device_t); static int ptn_memdev_shutdown(device_t); static device_method_t ptn_memdev_methods[] = { DEVMETHOD(device_probe, ptn_memdev_probe), DEVMETHOD(device_attach, ptn_memdev_attach), DEVMETHOD(device_detach, ptn_memdev_detach), DEVMETHOD(device_shutdown, ptn_memdev_shutdown), DEVMETHOD_END }; static driver_t ptn_memdev_driver = { PTNETMAP_MEMDEV_NAME, ptn_memdev_methods, sizeof(struct ptnetmap_memdev), }; /* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation * below. */ DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, NULL, NULL, SI_ORDER_MIDDLE + 1); /* * Map host netmap memory through PCI-BAR in the guest OS, * returning physical (nm_paddr) and virtual (nm_addr) addresses * of the netmap memory mapped in the guest. 
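/*
 * Editorial aside (illustrative only, not part of this patch): the two
 * 32-bit register reads in nm_os_pt_memdev_iomap() below assemble the
 * 64-bit region size as (MEMSIZE_HI << 32) | MEMSIZE_LO.  For example,
 * HI == 0x1 and LO == 0x80000000 yield a mem_size of 0x180000000 (6 GiB).
 */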
*/ int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr, uint64_t *mem_size) { int rid; nm_prinf("ptn_memdev_driver iomap"); rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI); *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO) | (*mem_size << 32); /* map memory allocator */ ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, &rid, 0, ~0, *mem_size, RF_ACTIVE); if (ptn_dev->pci_mem == NULL) { *nm_paddr = 0; *nm_addr = NULL; return ENOMEM; } *nm_paddr = rman_get_start(ptn_dev->pci_mem); *nm_addr = rman_get_virtual(ptn_dev->pci_mem); nm_prinf("=== BAR %d start %lx len %lx mem_size %lx ===", PTNETMAP_MEM_PCI_BAR, (unsigned long)(*nm_paddr), (unsigned long)rman_get_size(ptn_dev->pci_mem), (unsigned long)*mem_size); return (0); } uint32_t nm_os_pt_memdev_ioread(struct ptnetmap_memdev *ptn_dev, unsigned int reg) { return bus_read_4(ptn_dev->pci_io, reg); } /* Unmap host netmap memory. */ void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) { nm_prinf("ptn_memdev_driver iounmap"); if (ptn_dev->pci_mem) { bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } } /* Device identification routine, return BUS_PROBE_DEFAULT on success, * positive on failure */ static int ptn_memdev_probe(device_t dev) { if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) return (ENXIO); if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) return (ENXIO); device_set_descf(dev, "%s PCI adapter", PTNETMAP_MEMDEV_NAME); return (BUS_PROBE_DEFAULT); } /* Device initialization routine. */ static int ptn_memdev_attach(device_t dev) { struct ptnetmap_memdev *ptn_dev; int rid; uint16_t mem_id; ptn_dev = device_get_softc(dev); ptn_dev->dev = dev; pci_enable_busmaster(dev); rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (ptn_dev->pci_io == NULL) { device_printf(dev, "cannot map I/O space\n"); return (ENXIO); } mem_id = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMID); /* create guest allocator */ ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); if (ptn_dev->nm_mem == NULL) { ptn_memdev_detach(dev); return (ENOMEM); } netmap_mem_get(ptn_dev->nm_mem); nm_prinf("ptnetmap memdev attached, host memid: %u", mem_id); return (0); } /* Device removal routine. */ static int ptn_memdev_detach(device_t dev) { struct ptnetmap_memdev *ptn_dev; ptn_dev = device_get_softc(dev); if (ptn_dev->nm_mem) { nm_prinf("ptnetmap memdev detached, host memid %u", netmap_mem_get_id(ptn_dev->nm_mem)); netmap_mem_put(ptn_dev->nm_mem); ptn_dev->nm_mem = NULL; } if (ptn_dev->pci_mem) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } if (ptn_dev->pci_io) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); ptn_dev->pci_io = NULL; } return (0); } static int ptn_memdev_shutdown(device_t dev) { return bus_generic_shutdown(dev); } #endif /* WITH_PTNETMAP */ /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. 
*/ struct netmap_vm_handle_t { struct cdev *dev; struct netmap_priv_d *priv; }; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; if (netmap_verbose) nm_prinf("handle %p size %jd prot %d foff %jd", handle, (intmax_t)size, prot, (intmax_t)foff); if (color) *color = 0; dev_ref(vmh->dev); return 0; } static void netmap_dev_pager_dtor(void *handle) { struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; if (netmap_verbose) nm_prinf("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); } static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct netmap_vm_handle_t *vmh = object->handle; struct netmap_priv_d *priv = vmh->priv; struct netmap_adapter *na = priv->np_na; vm_paddr_t paddr; vm_page_t page; vm_memattr_t memattr; nm_prdis("object %p offset %jd prot %d mres %p", object, (intmax_t)offset, prot, mres); memattr = object->memattr; paddr = netmap_mem_ofstophys(na->nm_mem, offset); if (paddr == 0) return VM_PAGER_FAIL; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ VM_OBJECT_WUNLOCK(object); page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_replace(page, object, (*mres)->pindex, *mres); *mres = page; } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, .cdev_pg_fault = netmap_dev_pager_fault, }; static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { int error; struct netmap_vm_handle_t *vmh; struct netmap_priv_d *priv; vm_object_t obj; if (netmap_verbose) nm_prinf("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (vmh == NULL) return ENOMEM; vmh->dev = cdev; NMG_LOCK(); error = devfs_get_cdevpriv((void**)&priv); if (error) goto err_unlock; if (priv->np_nifp == NULL) { error = EINVAL; goto err_unlock; } vmh->priv = priv; priv->np_refs++; NMG_UNLOCK(); obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &netmap_cdev_pager_ops, objsize, prot, *foff, NULL); if (obj == NULL) { nm_prerr("cdev_pager_allocate failed"); error = EINVAL; goto err_deref; } *objp = obj; return 0; err_deref: NMG_LOCK(); priv->np_refs--; err_unlock: NMG_UNLOCK(); // err: free(vmh, M_DEVBUF); return error; } /* * On FreeBSD the close routine is only called on the last close on * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor * when the last fd pointing to the device is closed. * * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to * netmap_dtor() is called when the process has no open fds and no active * memory maps on /dev/netmap, as in linux. 
*/ static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { if (netmap_verbose) nm_prinf("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); return 0; } static int netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct netmap_priv_d *priv; int error; (void)dev; (void)oflags; (void)devtype; (void)td; NMG_LOCK(); priv = netmap_priv_new(); if (priv == NULL) { error = ENOMEM; goto out; } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { netmap_priv_delete(priv); } out: NMG_UNLOCK(); return error; } /******************** kthread wrapper ****************/ #include u_int nm_os_ncpus(void) { return mp_maxid + 1; } struct nm_kctx_ctx { /* Userspace thread (kthread creator). */ struct thread *user_td; /* worker function and parameter */ nm_kctx_worker_fn_t worker_fn; void *worker_private; struct nm_kctx *nmk; /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ long type; }; struct nm_kctx { struct thread *worker; struct mtx worker_lock; struct nm_kctx_ctx worker_ctx; int run; /* used to stop kthread */ int attach_user; /* kthread attached to user_process */ int affinity; }; static void nm_kctx_worker(void *data) { struct nm_kctx *nmk = data; struct nm_kctx_ctx *ctx = &nmk->worker_ctx; if (nmk->affinity >= 0) { thread_lock(curthread); sched_bind(curthread, nmk->affinity); thread_unlock(curthread); } while (nmk->run) { /* * check if the parent process dies * (when kthread is attached to user process) */ if (ctx->user_td) { PROC_LOCK(curproc); thread_suspend_check(0); PROC_UNLOCK(curproc); } else { kthread_suspend_check(); } /* Continuously execute worker process. */ ctx->worker_fn(ctx->worker_private); /* worker body */ } kthread_exit(); } void nm_os_kctx_worker_setaff(struct nm_kctx *nmk, int affinity) { nmk->affinity = affinity; } struct nm_kctx * nm_os_kctx_create(struct nm_kctx_cfg *cfg, void *opaque) { struct nm_kctx *nmk = NULL; nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); if (!nmk) return NULL; mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_DEF); nmk->worker_ctx.worker_fn = cfg->worker_fn; nmk->worker_ctx.worker_private = cfg->worker_private; nmk->worker_ctx.type = cfg->type; nmk->affinity = -1; /* attach kthread to user process (ptnetmap) */ nmk->attach_user = cfg->attach_user; return nmk; } int nm_os_kctx_worker_start(struct nm_kctx *nmk) { struct proc *p = NULL; int error = 0; /* Temporarily disable this function as it is currently broken * and causes kernel crashes. The failure can be triggered by * the "vale_polling_enable_disable" test in ctrl-api-test.c. 
*/ return EOPNOTSUPP; if (nmk->worker) return EBUSY; /* check if we want to attach kthread to user process */ if (nmk->attach_user) { nmk->worker_ctx.user_td = curthread; p = curthread->td_proc; } /* enable kthread main loop */ nmk->run = 1; /* create kthread */ if((error = kthread_add(nm_kctx_worker, nmk, p, &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", nmk->worker_ctx.type))) { goto err; } nm_prinf("nm_kthread started td %p", nmk->worker); return 0; err: nm_prerr("nm_kthread start failed err %d", error); nmk->worker = NULL; return error; } void nm_os_kctx_worker_stop(struct nm_kctx *nmk) { if (!nmk->worker) return; /* tell to kthread to exit from main loop */ nmk->run = 0; /* wake up kthread if it sleeps */ kthread_resume(nmk->worker); nmk->worker = NULL; } void nm_os_kctx_destroy(struct nm_kctx *nmk) { if (!nmk) return; if (nmk->worker) nm_os_kctx_worker_stop(nmk); free(nmk, M_DEVBUF); } /******************** kqueue support ****************/ /* * In addition to calling selwakeuppri(), nm_os_selwakeup() also * needs to call knote() to wake up kqueue listeners. * This operation is deferred to a taskqueue in order to avoid possible * lock order reversals; these may happen because knote() grabs a * private lock associated to the 'si' (see struct selinfo, * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup() * can be called while holding the lock associated to a different * 'si'. * When calling knote() we use a non-zero 'hint' argument to inform * the netmap_knrw() function that it is being called from * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is * called by the kevent subsystem (i.e. kevent_scan()) we also need to * call netmap_poll(). * * The netmap_kqfilter() function registers one or another f_event * depending on read or write mode. A pointer to the struct * 'netmap_priv_d' is stored into kn->kn_hook, so that it can later * be passed to netmap_poll(). We pass NULL as a third argument to * netmap_poll(), so that the latter only runs the txsync/rxsync * (if necessary), and skips the nm_os_selrecord() calls. */ void nm_os_selwakeup(struct nm_selinfo *si) { selwakeuppri(&si->si, PI_NET); if (si->kqueue_users > 0) { taskqueue_enqueue(si->ntfytq, &si->ntfytask); } } void nm_os_selrecord(struct thread *td, struct nm_selinfo *si) { selrecord(td, &si->si); } static void netmap_knrdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct nm_selinfo *si = priv->np_si[NR_RX]; knlist_remove(&si->si.si_note, kn, /*islocked=*/0); NMG_LOCK(); KASSERT(si->kqueue_users > 0, ("kqueue_user underflow on %s", si->mtxname)); si->kqueue_users--; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); } static void netmap_knwdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct nm_selinfo *si = priv->np_si[NR_TX]; knlist_remove(&si->si.si_note, kn, /*islocked=*/0); NMG_LOCK(); si->kqueue_users--; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); } /* * Callback triggered by netmap notifications (see netmap_notify()), * and by the application calling kevent(). In the former case we * just return 1 (events ready), since we are not able to do better. * In the latter case we use netmap_poll() to see which events are * ready. 
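/*
 * Editorial aside (illustrative only, not part of this patch): a minimal
 * userspace sketch of the kqueue path described in the comment above.
 * Obtaining and configuring the netmap descriptor (open of /dev/netmap,
 * NIOCREGIF, mmap of the rings) is elided; 'fd' is assumed to be such a
 * descriptor, and netmap_kqfilter() below accepts both EVFILT_READ and
 * EVFILT_WRITE on it.
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_for_rx(int fd)
{
	struct kevent kev, ev;
	int kq, n;

	kq = kqueue();
	if (kq < 0)
		return (-1);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) {
		close(kq);
		return (-1);
	}
	/* Readiness may be reported via netmap_notify() or kevent_scan(). */
	n = kevent(kq, NULL, 0, &ev, 1, NULL);
	close(kq);
	return (n);
}
#endif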
*/ static int netmap_knrw(struct knote *kn, long hint, int events) { struct netmap_priv_d *priv; int revents; if (hint != 0) { /* Called from netmap_notify(), typically from a * thread different from the one issuing kevent(). * Assume we are ready. */ return 1; } /* Called from kevent(). */ priv = kn->kn_hook; revents = netmap_poll(priv, events, /*thread=*/NULL); return (events & revents) ? 1 : 0; } static int netmap_knread(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLIN); } static int netmap_knwrite(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLOUT); } -static struct filterops netmap_rfiltops = { +static const struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, }; -static struct filterops netmap_wfiltops = { +static const struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, }; /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). * The 'priv' is the one associated to the open netmap device. */ static int netmap_kqfilter(struct cdev *dev, struct knote *kn) { struct netmap_priv_d *priv; int error; struct netmap_adapter *na; struct nm_selinfo *si; int ev = kn->kn_filter; if (ev != EVFILT_READ && ev != EVFILT_WRITE) { nm_prerr("bad filter request %d", ev); return 1; } error = devfs_get_cdevpriv((void**)&priv); if (error) { nm_prerr("device not yet setup"); return 1; } na = priv->np_na; if (na == NULL) { nm_prerr("no netmap adapter for this file descriptor"); return 1; } /* the si is indicated in the priv */ si = priv->np_si[(ev == EVFILT_WRITE) ? NR_TX : NR_RX]; kn->kn_fop = (ev == EVFILT_WRITE) ? &netmap_wfiltops : &netmap_rfiltops; kn->kn_hook = priv; NMG_LOCK(); si->kqueue_users++; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); knlist_add(&si->si.si_note, kn, /*islocked=*/0); return 0; } static int freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) { struct netmap_priv_d *priv; if (devfs_get_cdevpriv((void **)&priv)) { return POLLERR; } return netmap_poll(priv, events, td); } static int freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int ffla __unused, struct thread *td) { int error; struct netmap_priv_d *priv; CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); if (error) { /* XXX ENOENT should be impossible, since the priv * is now created in the open */ if (error == ENOENT) error = ENXIO; goto out; } error = netmap_ioctl(priv, cmd, data, td, /*nr_body_is_user=*/1); out: CURVNET_RESTORE(); return error; } void nm_os_onattach(if_t ifp) { if_setcapabilitiesbit(ifp, IFCAP_NETMAP, 0); } void nm_os_onenter(if_t ifp) { struct netmap_adapter *na = NA(ifp); na->if_transmit = if_gettransmitfn(ifp); if_settransmitfn(ifp, netmap_transmit); if_setcapenablebit(ifp, IFCAP_NETMAP, 0); } void nm_os_onexit(if_t ifp) { struct netmap_adapter *na = NA(ifp); if_settransmitfn(ifp, na->if_transmit); if_setcapenablebit(ifp, 0, IFCAP_NETMAP); } extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, .d_ioctl = freebsd_netmap_ioctl, .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; /*--- end of kqueue support ----*/ /* * Kernel entry point. * * Initialize/finalize the module and return. 
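/*
 * Editorial aside on the two hunks above (and the matching hunks in the
 * other files touched by this diff): the filterops tables are only
 * dereferenced for their callback pointers at runtime, so adding the const
 * qualifier moves them into read-only data with no functional change.  The
 * pattern applied throughout the patch is simply:
 *
 *	-static struct filterops foo_filtops = { ... };
 *	+static const struct filterops foo_filtops = { ... };
 */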
* * Return 0 on success, errno on failure. */ static int netmap_loader(__unused struct module *module, int event, __unused void *arg) { int error = 0; switch (event) { case MOD_LOAD: error = netmap_init(); break; case MOD_UNLOAD: /* * if some one is still using netmap, * then the module can not be unloaded. */ if (netmap_use_count) { nm_prerr("netmap module can not be unloaded - netmap_use_count: %d", netmap_use_count); error = EBUSY; break; } netmap_fini(); break; default: error = EOPNOTSUPP; break; } return (error); } #ifdef DEV_MODULE_ORDERED /* * The netmap module contains three drivers: (i) the netmap character device * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI * device driver. The attach() routines of both (ii) and (iii) need the * lock of the global allocator, and such lock is initialized in netmap_init(), * which is part of (i). * Therefore, we make sure that (i) is loaded before (ii) and (iii), using * the 'order' parameter of driver declaration macros. For (i), we specify * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED * macros for (ii) and (iii). */ DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); #else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); #endif /* DEV_MODULE_ORDERED */ MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); /* reduce conditional code */ // linux API, use for the knlist in FreeBSD /* use a private mutex for the knlist */ diff --git a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c index a70f25d57dcb..661d5bd0f14e 100644 --- a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c +++ b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c @@ -1,672 +1,672 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright(c) 2007-2022 Intel Corporation */ #include "qat_freebsd.h" #include "adf_cfg.h" #include "adf_common_drv.h" #include "adf_accel_devices.h" #include "icp_qat_uclo.h" #include "icp_qat_fw.h" #include "icp_qat_fw_init_admin.h" #include "adf_cfg_strings.h" #include "adf_uio_control.h" #include "adf_uio_cleanup.h" #include "adf_uio.h" #include "adf_transport_access_macros.h" #include "adf_transport_internal.h" #define ADF_DEV_PROCESSES_NAME "qat_dev_processes" #define ADF_DEV_STATE_NAME "qat_dev_state" #define ADF_STATE_CALLOUT_TIME 10 static const char *mtx_name = "state_mtx"; static const char *mtx_callout_name = "callout_mtx"; static d_open_t adf_processes_open; static void adf_processes_release(void *data); static d_read_t adf_processes_read; static d_write_t adf_processes_write; static d_open_t adf_state_open; static void adf_state_release(void *data); static d_read_t adf_state_read; static int adf_state_kqfilter(struct cdev *dev, struct knote *kn); static int adf_state_kqread_event(struct knote *kn, long hint); static void adf_state_kqread_detach(struct knote *kn); static struct callout callout; static struct mtx mtx; static struct mtx callout_mtx; static struct service_hndl adf_state_hndl; struct entry_proc_events { struct adf_state_priv_data *proc_events; SLIST_ENTRY(entry_proc_events) entries_proc_events; }; struct entry_state { struct adf_state state; STAILQ_ENTRY(entry_state) entries_state; }; SLIST_HEAD(proc_events_head, entry_proc_events); STAILQ_HEAD(state_head, entry_state); static struct proc_events_head proc_events_head; struct adf_processes_priv_data { char name[ADF_CFG_MAX_SECTION_LEN_IN_BYTES]; int read_flag; struct list_head list; }; struct adf_state_priv_data { struct cdev 
*cdev; struct selinfo rsel; struct state_head state_head; }; static struct cdevsw adf_processes_cdevsw = { .d_version = D_VERSION, .d_open = adf_processes_open, .d_read = adf_processes_read, .d_write = adf_processes_write, .d_name = ADF_DEV_PROCESSES_NAME, }; static struct cdevsw adf_state_cdevsw = { .d_version = D_VERSION, .d_open = adf_state_open, .d_read = adf_state_read, .d_kqfilter = adf_state_kqfilter, .d_name = ADF_DEV_STATE_NAME, }; -static struct filterops adf_state_read_filterops = { +static const struct filterops adf_state_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = adf_state_kqread_detach, .f_event = adf_state_kqread_event, }; static struct cdev *adf_processes_dev; static struct cdev *adf_state_dev; static LINUX_LIST_HEAD(processes_list); struct sx processes_list_sema; SX_SYSINIT(processes_list_sema, &processes_list_sema, "adf proc list"); static void adf_chr_drv_destroy(void) { destroy_dev(adf_processes_dev); } static int adf_chr_drv_create(void) { adf_processes_dev = make_dev(&adf_processes_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, ADF_DEV_PROCESSES_NAME); if (adf_processes_dev == NULL) { printf("QAT: failed to create device\n"); goto err_cdev_del; } return 0; err_cdev_del: return EFAULT; } static int adf_processes_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { int i = 0, devices = 0; struct adf_accel_dev *accel_dev = NULL; struct adf_processes_priv_data *prv_data = NULL; int error = 0; for (i = 0; i < ADF_MAX_DEVICES; i++) { accel_dev = adf_devmgr_get_dev_by_id(i); if (!accel_dev) continue; if (!adf_dev_started(accel_dev)) continue; devices++; } if (!devices) { printf("QAT: No active devices found.\n"); return ENXIO; } prv_data = malloc(sizeof(*prv_data), M_QAT, M_WAITOK | M_ZERO); INIT_LIST_HEAD(&prv_data->list); error = devfs_set_cdevpriv(prv_data, adf_processes_release); if (error) { free(prv_data, M_QAT); return error; } return 0; } static int adf_get_first_started_dev(void) { int i = 0; struct adf_accel_dev *accel_dev = NULL; for (i = 0; i < ADF_MAX_DEVICES; i++) { accel_dev = adf_devmgr_get_dev_by_id(i); if (!accel_dev) continue; if (adf_dev_started(accel_dev)) return i; } return -1; } static int adf_processes_write(struct cdev *dev, struct uio *uio, int ioflag) { struct adf_processes_priv_data *prv_data = NULL; struct adf_processes_priv_data *pdata = NULL; int dev_num = 0, pr_num = 0; struct list_head *lpos = NULL; char usr_name[ADF_CFG_MAX_SECTION_LEN_IN_BYTES] = { 0 }; struct adf_accel_dev *accel_dev = NULL; struct adf_cfg_section *section_ptr = NULL; bool pr_name_available = 1; uint32_t num_accel_devs = 0; int error = 0; ssize_t count; int dev_id; error = devfs_get_cdevpriv((void **)&prv_data); if (error) { printf("QAT: invalid file descriptor\n"); return error; } if (prv_data->read_flag == 1) { printf("QAT: can only write once\n"); return EBADF; } count = uio->uio_resid; if ((count <= 0) || (count > ADF_CFG_MAX_SECTION_LEN_IN_BYTES)) { printf("QAT: wrong size %d\n", (int)count); return EIO; } error = uiomove(usr_name, count, uio); if (error) { printf("QAT: can't copy data\n"); return error; } /* Lock other processes and try to find out the process name */ if (sx_xlock_sig(&processes_list_sema)) { printf("QAT: can't aquire process info lock\n"); return EBADF; } dev_id = adf_get_first_started_dev(); if (-1 == dev_id) { pr_err("QAT: could not find started device\n"); sx_xunlock(&processes_list_sema); return -EIO; } accel_dev = adf_devmgr_get_dev_by_id(dev_id); if (!accel_dev) { pr_err("QAT: could not find started device\n"); 
sx_xunlock(&processes_list_sema); return -EIO; } /* If there is nothing there then take the first name and return */ if (list_empty(&processes_list)) { snprintf(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES, "%s" ADF_INTERNAL_USERSPACE_SEC_SUFF "%d", usr_name, 0); list_add(&prv_data->list, &processes_list); sx_xunlock(&processes_list_sema); prv_data->read_flag = 1; return 0; } /* If there are processes running then search for a first free name */ adf_devmgr_get_num_dev(&num_accel_devs); for (dev_num = 0; dev_num < num_accel_devs; dev_num++) { accel_dev = adf_devmgr_get_dev_by_id(dev_num); if (!accel_dev) continue; if (!adf_dev_started(accel_dev)) continue; /* to next device */ for (pr_num = 0; pr_num < GET_MAX_PROCESSES(accel_dev); pr_num++) { snprintf(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES, "%s" ADF_INTERNAL_USERSPACE_SEC_SUFF "%d", usr_name, pr_num); pr_name_available = 1; /* Figure out if section exists in the config table */ section_ptr = adf_cfg_sec_find(accel_dev, prv_data->name); if (NULL == section_ptr) { /* This section name doesn't exist */ pr_name_available = 0; /* As process_num enumerates from 0, once we get * to one which doesn't exist no further ones * will exist. On to next device */ break; } /* Figure out if it's been taken already */ list_for_each(lpos, &processes_list) { pdata = list_entry(lpos, struct adf_processes_priv_data, list); if (!strncmp( pdata->name, prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES)) { pr_name_available = 0; break; } } if (pr_name_available) break; } if (pr_name_available) break; } /* * If we have a valid name that is not on * the list take it and add to the list */ if (pr_name_available) { list_add(&prv_data->list, &processes_list); sx_xunlock(&processes_list_sema); prv_data->read_flag = 1; return 0; } /* If not then the process needs to wait */ sx_xunlock(&processes_list_sema); explicit_bzero(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES); prv_data->read_flag = 0; return 1; } static int adf_processes_read(struct cdev *dev, struct uio *uio, int ioflag) { struct adf_processes_priv_data *prv_data = NULL; int error = 0; error = devfs_get_cdevpriv((void **)&prv_data); if (error) { printf("QAT: invalid file descriptor\n"); return error; } /* * If there is a name that the process can use then give it * to the proocess. 
*/ if (prv_data->read_flag) { error = uiomove(prv_data->name, strnlen(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES), uio); if (error) { printf("QAT: failed to copy data to user\n"); return error; } return 0; } return EIO; } static void adf_processes_release(void *data) { struct adf_processes_priv_data *prv_data = NULL; prv_data = (struct adf_processes_priv_data *)data; sx_xlock(&processes_list_sema); list_del(&prv_data->list); sx_xunlock(&processes_list_sema); free(prv_data, M_QAT); } int adf_processes_dev_register(void) { return adf_chr_drv_create(); } void adf_processes_dev_unregister(void) { adf_chr_drv_destroy(); } static void adf_state_callout_notify_ev(void *arg) { int notified = 0; struct adf_state_priv_data *priv = NULL; struct entry_proc_events *proc_events = NULL; SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { if (!STAILQ_EMPTY(&proc_events->proc_events->state_head)) { notified = 1; priv = proc_events->proc_events; wakeup(priv); selwakeup(&priv->rsel); KNOTE_UNLOCKED(&priv->rsel.si_note, 0); } } if (notified) callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); } static void adf_state_set(int dev, enum adf_event event) { struct adf_accel_dev *accel_dev = NULL; struct state_head *head = NULL; struct entry_proc_events *proc_events = NULL; struct entry_state *state = NULL; accel_dev = adf_devmgr_get_dev_by_id(dev); if (!accel_dev) return; mtx_lock(&mtx); SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { state = NULL; head = &proc_events->proc_events->state_head; state = malloc(sizeof(struct entry_state), M_QAT, M_NOWAIT | M_ZERO); if (!state) continue; state->state.dev_state = event; state->state.dev_id = dev; STAILQ_INSERT_TAIL(head, state, entries_state); if (event == ADF_EVENT_STOP) { state = NULL; state = malloc(sizeof(struct entry_state), M_QAT, M_NOWAIT | M_ZERO); if (!state) continue; state->state.dev_state = ADF_EVENT_SHUTDOWN; state->state.dev_id = dev; STAILQ_INSERT_TAIL(head, state, entries_state); } } mtx_unlock(&mtx); callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); } static int adf_state_event_handler(struct adf_accel_dev *accel_dev, enum adf_event event) { int ret = 0; #if defined(QAT_UIO) && defined(QAT_DBG) if (event > ADF_EVENT_DBG_SHUTDOWN) return -EINVAL; #else if (event > ADF_EVENT_ERROR) return -EINVAL; #endif /* defined(QAT_UIO) && defined(QAT_DBG) */ switch (event) { case ADF_EVENT_INIT: return ret; case ADF_EVENT_SHUTDOWN: return ret; case ADF_EVENT_RESTARTING: break; case ADF_EVENT_RESTARTED: break; case ADF_EVENT_START: return ret; case ADF_EVENT_STOP: break; case ADF_EVENT_ERROR: break; #if defined(QAT_UIO) && defined(QAT_DBG) case ADF_EVENT_PROC_CRASH: break; case ADF_EVENT_MANUAL_DUMP: break; case ADF_EVENT_SLICE_HANG: break; case ADF_EVENT_DBG_SHUTDOWN: break; #endif /* defined(QAT_UIO) && defined(QAT_DBG) */ default: return -1; } adf_state_set(accel_dev->accel_id, event); return 0; } static int adf_state_kqfilter(struct cdev *dev, struct knote *kn) { struct adf_state_priv_data *priv; mtx_lock(&mtx); priv = dev->si_drv1; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &adf_state_read_filterops; kn->kn_hook = priv; knlist_add(&priv->rsel.si_note, kn, 1); mtx_unlock(&mtx); return 0; default: mtx_unlock(&mtx); return -EINVAL; } } static int adf_state_kqread_event(struct knote *kn, long hint) { return 1; } static void adf_state_kqread_detach(struct knote *kn) { struct adf_state_priv_data *priv = NULL; mtx_lock(&mtx); if (!kn) { mtx_unlock(&mtx); return; } priv = kn->kn_hook; if (!priv) { 
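The substantive change in this file is the const qualifier on adf_state_read_filterops: a filterops table is never modified at run time, so declaring it const lets the compiler place it in read-only memory. adf_state_kqfilter() above shows the registration side; reduced to a minimal sketch with hypothetical example_* names it looks roughly like:

static int	example_kqread_event(struct knote *kn, long hint);
static void	example_kqread_detach(struct knote *kn);

static const struct filterops example_read_filtops = {
	.f_isfd = 1,
	.f_detach = example_kqread_detach,
	.f_event = example_kqread_event,
};

static int
example_kqfilter(struct cdev *dev, struct knote *kn)
{
	struct example_softc *sc = dev->si_drv1;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	kn->kn_fop = &example_read_filtops;	/* shared, read-only table */
	kn->kn_hook = sc;
	/* sc->rsel.si_note must have been set up with knlist_init_mtx() */
	knlist_add(&sc->rsel.si_note, kn, 0);
	return (0);
}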
mtx_unlock(&mtx); return; } knlist_remove(&priv->rsel.si_note, kn, 1); mtx_unlock(&mtx); } void adf_state_init(void) { adf_state_dev = make_dev(&adf_state_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "%s", ADF_DEV_STATE_NAME); SLIST_INIT(&proc_events_head); mtx_init(&mtx, mtx_name, NULL, MTX_DEF); mtx_init(&callout_mtx, mtx_callout_name, NULL, MTX_DEF); callout_init_mtx(&callout, &callout_mtx, 0); explicit_bzero(&adf_state_hndl, sizeof(adf_state_hndl)); adf_state_hndl.event_hld = adf_state_event_handler; adf_state_hndl.name = "adf_state_event_handler"; adf_service_register(&adf_state_hndl); callout_reset(&callout, ADF_STATE_CALLOUT_TIME, adf_state_callout_notify_ev, NULL); } void adf_state_destroy(void) { struct entry_proc_events *proc_events = NULL; adf_service_unregister(&adf_state_hndl); mtx_lock(&callout_mtx); callout_stop(&callout); mtx_unlock(&callout_mtx); mtx_destroy(&callout_mtx); mtx_lock(&mtx); while (!SLIST_EMPTY(&proc_events_head)) { proc_events = SLIST_FIRST(&proc_events_head); SLIST_REMOVE_HEAD(&proc_events_head, entries_proc_events); free(proc_events, M_QAT); } mtx_unlock(&mtx); mtx_destroy(&mtx); destroy_dev(adf_state_dev); } static int adf_state_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct adf_state_priv_data *prv_data = NULL; struct entry_proc_events *entry_proc_events = NULL; int ret = 0; prv_data = malloc(sizeof(*prv_data), M_QAT, M_WAITOK | M_ZERO); entry_proc_events = malloc(sizeof(struct entry_proc_events), M_QAT, M_WAITOK | M_ZERO); mtx_lock(&mtx); prv_data->cdev = dev; prv_data->cdev->si_drv1 = prv_data; knlist_init_mtx(&prv_data->rsel.si_note, &mtx); STAILQ_INIT(&prv_data->state_head); entry_proc_events->proc_events = prv_data; SLIST_INSERT_HEAD(&proc_events_head, entry_proc_events, entries_proc_events); mtx_unlock(&mtx); ret = devfs_set_cdevpriv(prv_data, adf_state_release); if (ret) { SLIST_REMOVE(&proc_events_head, entry_proc_events, entry_proc_events, entries_proc_events); free(entry_proc_events, M_QAT); free(prv_data, M_QAT); } callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); return ret; } static int adf_state_read(struct cdev *dev, struct uio *uio, int ioflag) { int ret = 0; struct adf_state_priv_data *prv_data = NULL; struct state_head *state_head = NULL; struct entry_state *entry_state = NULL; struct adf_state *state = NULL; struct entry_proc_events *proc_events = NULL; mtx_lock(&mtx); ret = devfs_get_cdevpriv((void **)&prv_data); if (ret) { mtx_unlock(&mtx); return 0; } state_head = &prv_data->state_head; if (STAILQ_EMPTY(state_head)) { mtx_unlock(&mtx); return 0; } entry_state = STAILQ_FIRST(state_head); state = &entry_state->state; ret = uiomove(state, sizeof(struct adf_state), uio); if (!ret && !STAILQ_EMPTY(state_head)) { STAILQ_REMOVE_HEAD(state_head, entries_state); free(entry_state, M_QAT); } SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { if (!STAILQ_EMPTY(&proc_events->proc_events->state_head)) { prv_data = proc_events->proc_events; wakeup(prv_data); selwakeup(&prv_data->rsel); KNOTE_UNLOCKED(&prv_data->rsel.si_note, 0); } } mtx_unlock(&mtx); callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); return ret; } static void adf_state_release(void *data) { struct adf_state_priv_data *prv_data = NULL; struct entry_state *entry_state = NULL; struct entry_proc_events *entry_proc_events = NULL; struct entry_proc_events *tmp = NULL; mtx_lock(&mtx); prv_data = (struct adf_state_priv_data *)data; knlist_delete(&prv_data->rsel.si_note, curthread, 1); knlist_destroy(&prv_data->rsel.si_note); 
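adf_state_init() and adf_state_destroy() above bracket a self-rearming callout that is bound to its own mutex, so the tick handler and callout_stop() serialize against each other. A minimal sketch of that arrangement, assuming hypothetical example_* names and a roughly one-second period:

static struct callout	example_callout;
static struct mtx	example_callout_mtx;

static void
example_tick(void *arg)
{
	/* periodic work goes here; the bound mutex is held on entry */
	callout_schedule(&example_callout, hz);	/* re-arm ~1 second out */
}

static void
example_callout_start(void)
{
	mtx_init(&example_callout_mtx, "example callout", NULL, MTX_DEF);
	callout_init_mtx(&example_callout, &example_callout_mtx, 0);
	callout_reset(&example_callout, hz, example_tick, NULL);
}

static void
example_callout_stop(void)
{
	mtx_lock(&example_callout_mtx);
	callout_stop(&example_callout);
	mtx_unlock(&example_callout_mtx);
	mtx_destroy(&example_callout_mtx);
}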
seldrain(&prv_data->rsel); while (!STAILQ_EMPTY(&prv_data->state_head)) { entry_state = STAILQ_FIRST(&prv_data->state_head); STAILQ_REMOVE_HEAD(&prv_data->state_head, entries_state); free(entry_state, M_QAT); } SLIST_FOREACH_SAFE (entry_proc_events, &proc_events_head, entries_proc_events, tmp) { if (entry_proc_events->proc_events == prv_data) { SLIST_REMOVE(&proc_events_head, entry_proc_events, entry_proc_events, entries_proc_events); free(entry_proc_events, M_QAT); } } free(prv_data, M_QAT); mtx_unlock(&mtx); } diff --git a/sys/dev/usb/usb_dev.c b/sys/dev/usb/usb_dev.c index c58c3b5f64d5..a736a12fc4f4 100644 --- a/sys/dev/usb/usb_dev.c +++ b/sys/dev/usb/usb_dev.c @@ -1,2471 +1,2471 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006-2023 Hans Petter Selasky * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * * usb_dev.c - An abstraction layer for creating devices under /dev/... 
*/ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR usb_fifo_debug #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ #if USB_HAVE_UGEN #ifdef USB_DEBUG static int usb_fifo_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "USB device"); SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RWTUN, &usb_fifo_debug, 0, "Debug Level"); #endif #define USB_UCRED struct ucred *ucred, /* prototypes */ static int usb_fifo_open(struct usb_cdev_privdata *, struct usb_fifo *, int); static void usb_fifo_close(struct usb_fifo *, int); static void usb_dev_init(void *); static void usb_dev_init_post(void *); static void usb_dev_uninit(void *); static int usb_fifo_uiomove(struct usb_fifo *, void *, int, struct uio *); static void usb_fifo_check_methods(struct usb_fifo_methods *); static struct usb_fifo *usb_fifo_alloc(struct mtx *); static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t, uint8_t); static void usb_loc_fill(struct usb_fs_privdata *, struct usb_cdev_privdata *); static void usb_close(void *); static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int); static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *); static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *); static d_open_t usb_open; static d_ioctl_t usb_ioctl; static d_read_t usb_read; static d_write_t usb_write; static d_poll_t usb_poll; static d_kqfilter_t usb_kqfilter; static d_ioctl_t usb_static_ioctl; static usb_fifo_open_t usb_fifo_dummy_open; static usb_fifo_close_t usb_fifo_dummy_close; static usb_fifo_ioctl_t usb_fifo_dummy_ioctl; static usb_fifo_cmd_t usb_fifo_dummy_cmd; /* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */ struct cdevsw usb_devsw = { .d_version = D_VERSION, .d_open = usb_open, .d_ioctl = usb_ioctl, .d_name = "usbdev", .d_flags = D_TRACKCLOSE, .d_read = usb_read, .d_write = usb_write, .d_poll = usb_poll, .d_kqfilter = usb_kqfilter, }; static struct cdev* usb_dev = NULL; /* character device structure used for /dev/usb */ static struct cdevsw usb_static_devsw = { .d_version = D_VERSION, .d_ioctl = usb_static_ioctl, .d_name = "usb" }; static TAILQ_HEAD(, usb_symlink) usb_sym_head; static struct sx usb_sym_lock; struct mtx usb_ref_lock; /*------------------------------------------------------------------------* * usb_loc_fill * * This is used to fill out a usb_cdev_privdata structure based on the * device's address as contained in usb_fs_privdata. *------------------------------------------------------------------------*/ static void usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd) { cpd->bus_index = pd->bus_index; cpd->dev_index = pd->dev_index; cpd->ep_addr = pd->ep_addr; cpd->fifo_index = pd->fifo_index; } /*------------------------------------------------------------------------* * usb_ref_device * * This function is used to atomically refer an USB device by its * device location. If this function returns success the USB device * will not disappear until the USB device is unreferenced. 
* * Return values: * 0: Success, refcount incremented on the given USB device. * Else: Failure. *------------------------------------------------------------------------*/ static usb_error_t usb_ref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd, int need_uref) { struct usb_fifo **ppf; struct usb_fifo *f; DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref); /* clear all refs */ memset(crd, 0, sizeof(*crd)); mtx_lock(&usb_ref_lock); cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index); if (cpd->bus == NULL) { DPRINTFN(2, "no bus at %u\n", cpd->bus_index); goto error; } cpd->udev = cpd->bus->devices[cpd->dev_index]; if (cpd->udev == NULL) { DPRINTFN(2, "no device at %u\n", cpd->dev_index); goto error; } if (cpd->udev->state == USB_STATE_DETACHED && (need_uref != 2)) { DPRINTFN(2, "device is detached\n"); goto error; } if (need_uref) { DPRINTFN(2, "ref udev - needed\n"); if (cpd->udev->refcount == USB_DEV_REF_MAX) { DPRINTFN(2, "no dev ref\n"); goto error; } cpd->udev->refcount++; mtx_unlock(&usb_ref_lock); /* * We need to grab the enumeration SX-lock before * grabbing the FIFO refs to avoid deadlock at detach! */ crd->do_unlock = usbd_enum_lock_sig(cpd->udev); mtx_lock(&usb_ref_lock); /* * Set "is_uref" after grabbing the default SX lock */ crd->is_uref = 1; /* check for signal */ if (crd->do_unlock > 1) { crd->do_unlock = 0; goto error; } } /* check if we are doing an open */ if (cpd->fflags == 0) { /* use zero defaults */ } else { /* check for write */ if (cpd->fflags & FWRITE) { ppf = cpd->udev->fifo; f = ppf[cpd->fifo_index + USB_FIFO_TX]; crd->txfifo = f; crd->is_write = 1; /* ref */ if (f == NULL || f->refcount == USB_FIFO_REF_MAX) goto error; if (f->curr_cpd != cpd) goto error; /* check if USB-FS is active */ if (f->fs_ep_max != 0) { crd->is_usbfs = 1; } } /* check for read */ if (cpd->fflags & FREAD) { ppf = cpd->udev->fifo; f = ppf[cpd->fifo_index + USB_FIFO_RX]; crd->rxfifo = f; crd->is_read = 1; /* ref */ if (f == NULL || f->refcount == USB_FIFO_REF_MAX) goto error; if (f->curr_cpd != cpd) goto error; /* check if USB-FS is active */ if (f->fs_ep_max != 0) { crd->is_usbfs = 1; } } } /* when everything is OK we increment the refcounts */ if (crd->is_write) { DPRINTFN(2, "ref write\n"); crd->txfifo->refcount++; } if (crd->is_read) { DPRINTFN(2, "ref read\n"); crd->rxfifo->refcount++; } mtx_unlock(&usb_ref_lock); return (0); error: if (crd->do_unlock) usbd_enum_unlock(cpd->udev); if (crd->is_uref) { if (--(cpd->udev->refcount) == 0) cv_broadcast(&cpd->udev->ref_cv); } mtx_unlock(&usb_ref_lock); DPRINTFN(2, "fail\n"); /* clear all refs */ memset(crd, 0, sizeof(*crd)); return (USB_ERR_INVAL); } /*------------------------------------------------------------------------* * usb_usb_ref_device * * This function is used to upgrade an USB reference to include the * USB device reference on a USB location. * * Return values: * 0: Success, refcount incremented on the given USB device. * Else: Failure. *------------------------------------------------------------------------*/ static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { /* * Check if we already got an USB reference on this location: */ if (crd->is_uref) return (0); /* success */ /* * To avoid deadlock at detach we need to drop the FIFO ref * and re-acquire a new ref! 
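usb_ref_device() above and usb_unref_device() just below keep the device and FIFO reference counts under usb_ref_lock and use a condition variable so the detach path can wait for the counts to drain. Stripped of the USB specifics, the pattern is roughly the following sketch, where obj, obj_lock, OBJ_REF_MAX and ref_cv are hypothetical names:

static int
obj_ref(struct obj *o)
{
	mtx_lock(&obj_lock);
	if (o->refcount == OBJ_REF_MAX) {
		mtx_unlock(&obj_lock);
		return (EBUSY);			/* too many concurrent users */
	}
	o->refcount++;
	mtx_unlock(&obj_lock);
	return (0);
}

static void
obj_unref(struct obj *o)
{
	mtx_lock(&obj_lock);
	if (--o->refcount == 0)
		cv_broadcast(&o->ref_cv);	/* unblock a waiting detach/drain */
	mtx_unlock(&obj_lock);
}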
*/ usb_unref_device(cpd, crd); return (usb_ref_device(cpd, crd, 1 /* need uref */)); } /*------------------------------------------------------------------------* * usb_unref_device * * This function will release the reference count by one unit for the * given USB device. *------------------------------------------------------------------------*/ static void usb_unref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref); if (crd->do_unlock) usbd_enum_unlock(cpd->udev); mtx_lock(&usb_ref_lock); if (crd->is_read) { if (--(crd->rxfifo->refcount) == 0) { cv_signal(&crd->rxfifo->cv_drain); } crd->is_read = 0; } if (crd->is_write) { if (--(crd->txfifo->refcount) == 0) { cv_signal(&crd->txfifo->cv_drain); } crd->is_write = 0; } if (crd->is_uref) { crd->is_uref = 0; if (--(cpd->udev->refcount) == 0) cv_broadcast(&cpd->udev->ref_cv); } mtx_unlock(&usb_ref_lock); } static struct usb_fifo * usb_fifo_alloc(struct mtx *mtx) { struct usb_fifo *f; f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO); cv_init(&f->cv_io, "FIFO-IO"); cv_init(&f->cv_drain, "FIFO-DRAIN"); sx_init(&f->fs_fastpath_lock, "FIFO-FP"); f->priv_mtx = mtx; f->refcount = 1; knlist_init_mtx(&f->selinfo.si_note, mtx); return (f); } /*------------------------------------------------------------------------* * usb_fifo_create *------------------------------------------------------------------------*/ static int usb_fifo_create(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { struct usb_device *udev = cpd->udev; struct usb_fifo *f; struct usb_endpoint *ep; uint8_t n; uint8_t is_tx; uint8_t is_rx; uint8_t no_null; uint8_t is_busy; int e = cpd->ep_addr; is_tx = (cpd->fflags & FWRITE) ? 1 : 0; is_rx = (cpd->fflags & FREAD) ? 1 : 0; no_null = 1; is_busy = 0; /* Preallocated FIFO */ if (e < 0) { DPRINTFN(5, "Preallocated FIFO\n"); if (is_tx) { f = udev->fifo[cpd->fifo_index + USB_FIFO_TX]; if (f == NULL) return (EINVAL); crd->txfifo = f; } if (is_rx) { f = udev->fifo[cpd->fifo_index + USB_FIFO_RX]; if (f == NULL) return (EINVAL); crd->rxfifo = f; } return (0); } KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e)); /* search for a free FIFO slot */ DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e); for (n = 0;; n += 2) { if (n == USB_FIFO_MAX) { if (no_null) { no_null = 0; n = 0; } else { /* end of FIFOs reached */ DPRINTFN(5, "out of FIFOs\n"); return (ENOMEM); } } /* Check for TX FIFO */ if (is_tx) { f = udev->fifo[n + USB_FIFO_TX]; if (f != NULL) { if (f->dev_ep_index != e) { /* wrong endpoint index */ continue; } if (f->curr_cpd != NULL) { /* FIFO is opened */ is_busy = 1; continue; } } else if (no_null) { continue; } } /* Check for RX FIFO */ if (is_rx) { f = udev->fifo[n + USB_FIFO_RX]; if (f != NULL) { if (f->dev_ep_index != e) { /* wrong endpoint index */ continue; } if (f->curr_cpd != NULL) { /* FIFO is opened */ is_busy = 1; continue; } } else if (no_null) { continue; } } break; } if (no_null == 0) { if (e >= (USB_EP_MAX / 2)) { /* we don't create any endpoints in this range */ DPRINTFN(5, "ep out of range\n"); return (is_busy ? EBUSY : EINVAL); } } if ((e != 0) && is_busy) { /* * Only the default control endpoint is allowed to be * opened multiple times! 
*/ DPRINTFN(5, "busy\n"); return (EBUSY); } /* Check TX FIFO */ if (is_tx && (udev->fifo[n + USB_FIFO_TX] == NULL)) { ep = usb_dev_get_ep(udev, e, USB_FIFO_TX); DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX); if (ep == NULL) { DPRINTFN(5, "dev_get_endpoint returned NULL\n"); return (EINVAL); } f = usb_fifo_alloc(&udev->device_mtx); if (f == NULL) { DPRINTFN(5, "could not alloc tx fifo\n"); return (ENOMEM); } /* update some fields */ f->fifo_index = n + USB_FIFO_TX; f->dev_ep_index = e; f->priv_sc0 = ep; f->methods = &usb_ugen_methods; f->iface_index = ep->iface_index; f->udev = udev; mtx_lock(&usb_ref_lock); udev->fifo[n + USB_FIFO_TX] = f; mtx_unlock(&usb_ref_lock); } /* Check RX FIFO */ if (is_rx && (udev->fifo[n + USB_FIFO_RX] == NULL)) { ep = usb_dev_get_ep(udev, e, USB_FIFO_RX); DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX); if (ep == NULL) { DPRINTFN(5, "dev_get_endpoint returned NULL\n"); return (EINVAL); } f = usb_fifo_alloc(&udev->device_mtx); if (f == NULL) { DPRINTFN(5, "could not alloc rx fifo\n"); return (ENOMEM); } /* update some fields */ f->fifo_index = n + USB_FIFO_RX; f->dev_ep_index = e; f->priv_sc0 = ep; f->methods = &usb_ugen_methods; f->iface_index = ep->iface_index; f->udev = udev; mtx_lock(&usb_ref_lock); udev->fifo[n + USB_FIFO_RX] = f; mtx_unlock(&usb_ref_lock); } if (is_tx) { crd->txfifo = udev->fifo[n + USB_FIFO_TX]; } if (is_rx) { crd->rxfifo = udev->fifo[n + USB_FIFO_RX]; } /* fill out fifo index */ DPRINTFN(5, "fifo index = %d\n", n); cpd->fifo_index = n; /* complete */ return (0); } void usb_fifo_free(struct usb_fifo *f) { uint8_t n; if (f == NULL) { /* be NULL safe */ return; } /* destroy symlink devices, if any */ for (n = 0; n != 2; n++) { if (f->symlink[n]) { usb_free_symlink(f->symlink[n]); f->symlink[n] = NULL; } } mtx_lock(&usb_ref_lock); /* delink ourselves to stop calls from userland */ if ((f->fifo_index < USB_FIFO_MAX) && (f->udev != NULL) && (f->udev->fifo[f->fifo_index] == f)) { f->udev->fifo[f->fifo_index] = NULL; } else { DPRINTFN(0, "USB FIFO %p has not been linked\n", f); } /* decrease refcount */ f->refcount--; /* need to wait until all callers have exited */ while (f->refcount != 0) { mtx_unlock(&usb_ref_lock); /* avoid LOR */ mtx_lock(f->priv_mtx); /* prevent write flush, if any */ f->flag_iserror = 1; /* get I/O thread out of any sleep state */ if (f->flag_sleeping) { f->flag_sleeping = 0; cv_broadcast(&f->cv_io); } mtx_unlock(f->priv_mtx); mtx_lock(&usb_ref_lock); /* * Check if the "f->refcount" variable reached zero * during the unlocked time before entering wait: */ if (f->refcount == 0) break; /* wait for sync */ cv_wait(&f->cv_drain, &usb_ref_lock); } mtx_unlock(&usb_ref_lock); /* take care of closing the device here, if any */ usb_fifo_close(f, 0); cv_destroy(&f->cv_io); cv_destroy(&f->cv_drain); sx_destroy(&f->fs_fastpath_lock); knlist_clear(&f->selinfo.si_note, 0); seldrain(&f->selinfo); knlist_destroy(&f->selinfo.si_note); free(f, M_USBDEV); } static struct usb_endpoint * usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir) { struct usb_endpoint *ep; uint8_t ep_dir; if (ep_index == 0) { ep = &udev->ctrl_ep; } else { if (dir == USB_FIFO_RX) { if (udev->flags.usb_mode == USB_MODE_HOST) { ep_dir = UE_DIR_IN; } else { ep_dir = UE_DIR_OUT; } } else { if (udev->flags.usb_mode == USB_MODE_HOST) { ep_dir = UE_DIR_OUT; } else { ep_dir = UE_DIR_IN; } } ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir); } if (ep == NULL) { /* if the endpoint does not exist then return */ return (NULL); } if 
(ep->edesc == NULL) { /* invalid endpoint */ return (NULL); } return (ep); /* success */ } /*------------------------------------------------------------------------* * usb_fifo_open * * Returns: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ static int usb_fifo_open(struct usb_cdev_privdata *cpd, struct usb_fifo *f, int fflags) { int err; if (f == NULL) { /* no FIFO there */ DPRINTFN(2, "no FIFO\n"); return (ENXIO); } /* remove FWRITE and FREAD flags */ fflags &= ~(FWRITE | FREAD); /* set correct file flags */ if ((f->fifo_index & 1) == USB_FIFO_TX) { fflags |= FWRITE; } else { fflags |= FREAD; } /* check if we are already opened */ /* we don't need any locks when checking this variable */ if (f->curr_cpd != NULL) { err = EBUSY; goto done; } /* reset short flag before open */ f->flag_short = 0; /* call open method */ err = (f->methods->f_open) (f, fflags); if (err) { goto done; } mtx_lock(f->priv_mtx); /* reset sleep flag */ f->flag_sleeping = 0; /* reset error flag */ f->flag_iserror = 0; /* reset complete flag */ f->flag_iscomplete = 0; /* reset select flag */ f->flag_isselect = 0; /* reset flushing flag */ f->flag_flushing = 0; /* reset ASYNC proc flag */ f->async_p = NULL; mtx_lock(&usb_ref_lock); /* flag the fifo as opened to prevent others */ f->curr_cpd = cpd; mtx_unlock(&usb_ref_lock); /* reset queue */ usb_fifo_reset(f); mtx_unlock(f->priv_mtx); done: return (err); } /*------------------------------------------------------------------------* * usb_fifo_reset *------------------------------------------------------------------------*/ void usb_fifo_reset(struct usb_fifo *f) { struct usb_mbuf *m; if (f == NULL) { return; } while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { USB_IF_ENQUEUE(&f->free_q, m); } else { break; } } /* reset have fragment flag */ f->flag_have_fragment = 0; } /*------------------------------------------------------------------------* * usb_fifo_close *------------------------------------------------------------------------*/ static void usb_fifo_close(struct usb_fifo *f, int fflags) { int err; /* check if we are not opened */ if (f->curr_cpd == NULL) { /* nothing to do - already closed */ return; } mtx_lock(f->priv_mtx); /* clear current cdev private data pointer */ mtx_lock(&usb_ref_lock); f->curr_cpd = NULL; mtx_unlock(&usb_ref_lock); /* check if we are watched by kevent */ KNOTE_LOCKED(&f->selinfo.si_note, 0); /* check if we are selected */ if (f->flag_isselect) { selwakeup(&f->selinfo); f->flag_isselect = 0; } /* check if a thread wants SIGIO */ if (f->async_p != NULL) { PROC_LOCK(f->async_p); kern_psignal(f->async_p, SIGIO); PROC_UNLOCK(f->async_p); f->async_p = NULL; } /* remove FWRITE and FREAD flags */ fflags &= ~(FWRITE | FREAD); /* flush written data, if any */ if ((f->fifo_index & 1) == USB_FIFO_TX) { if (!f->flag_iserror) { /* set flushing flag */ f->flag_flushing = 1; /* get the last packet in */ if (f->flag_have_fragment) { struct usb_mbuf *m; f->flag_have_fragment = 0; USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_IF_ENQUEUE(&f->used_q, m); } } /* start write transfer, if not already started */ (f->methods->f_start_write) (f); /* check if flushed already */ while (f->flag_flushing && (!f->flag_iserror)) { /* wait until all data has been written */ f->flag_sleeping = 1; err = cv_timedwait_sig(&f->cv_io, f->priv_mtx, USB_MS_TO_TICKS(USB_DEFAULT_TIMEOUT)); if (err) { DPRINTF("signal received\n"); break; } } } fflags |= FWRITE; /* stop write transfer, if not already stopped */ 
(f->methods->f_stop_write) (f); } else { fflags |= FREAD; /* stop write transfer, if not already stopped */ (f->methods->f_stop_read) (f); } /* check if we are sleeping */ if (f->flag_sleeping) { DPRINTFN(2, "Sleeping at close!\n"); } mtx_unlock(f->priv_mtx); /* call close method */ (f->methods->f_close) (f, fflags); DPRINTF("closed\n"); } /*------------------------------------------------------------------------* * usb_open - cdev callback *------------------------------------------------------------------------*/ static int usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1; struct usb_cdev_refdata refs; struct usb_cdev_privdata *cpd; int err; DPRINTFN(2, "%s fflags=0x%08x\n", devtoname(dev), fflags); KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags")); if (((fflags & FREAD) && !(pd->mode & FREAD)) || ((fflags & FWRITE) && !(pd->mode & FWRITE))) { DPRINTFN(2, "access mode not supported\n"); return (EPERM); } cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO); usb_loc_fill(pd, cpd); err = usb_ref_device(cpd, &refs, 1); if (err) { DPRINTFN(2, "cannot ref device\n"); free(cpd, M_USBDEV); return (ENXIO); } cpd->fflags = fflags; /* access mode for open lifetime */ /* create FIFOs, if any */ err = usb_fifo_create(cpd, &refs); /* check for error */ if (err) { DPRINTFN(2, "cannot create fifo\n"); usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } if (fflags & FREAD) { err = usb_fifo_open(cpd, refs.rxfifo, fflags); if (err) { DPRINTFN(2, "read open failed\n"); usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } } if (fflags & FWRITE) { err = usb_fifo_open(cpd, refs.txfifo, fflags); if (err) { DPRINTFN(2, "write open failed\n"); if (fflags & FREAD) { usb_fifo_close(refs.rxfifo, fflags); } usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } } usb_unref_device(cpd, &refs); devfs_set_cdevpriv(cpd, usb_close); return (0); } /*------------------------------------------------------------------------* * usb_close - cdev callback *------------------------------------------------------------------------*/ static void usb_close(void *arg) { struct usb_cdev_refdata refs; struct usb_cdev_privdata *cpd = arg; int err; DPRINTFN(2, "cpd=%p\n", cpd); err = usb_ref_device(cpd, &refs, 2 /* uref and allow detached state */); if (err) { DPRINTFN(2, "Cannot grab USB reference when " "closing USB file handle\n"); goto done; } if (cpd->fflags & FREAD) { usb_fifo_close(refs.rxfifo, cpd->fflags); } if (cpd->fflags & FWRITE) { usb_fifo_close(refs.txfifo, cpd->fflags); } usb_unref_device(cpd, &refs); done: free(cpd, M_USBDEV); } static void usb_dev_init(void *arg) { mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF); sx_init(&usb_sym_lock, "USB sym mutex"); TAILQ_INIT(&usb_sym_head); /* check the UGEN methods */ usb_fifo_check_methods(&usb_ugen_methods); } SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL); static void usb_dev_init_post(void *arg) { /* * Create /dev/usb - this is needed for usbconfig(8), which * needs a well-known device name to access. 
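usb_dev_init_post() here creates the well-known /dev/usb node once the scheduler-kick stage is reached. The same create/destroy pairing for a driver-global node looks roughly like the following sketch, where the example_* names, cdevsw and node name are placeholders:

static struct cdev *example_dev;

static void
example_dev_init_post(void *arg)
{
	example_dev = make_dev(&example_devsw, 0, UID_ROOT, GID_OPERATOR,
	    0644, "example");
	if (example_dev == NULL)
		printf("example: could not create device node\n");
}
SYSINIT(example_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST,
    example_dev_init_post, NULL);

static void
example_dev_uninit(void *arg)
{
	if (example_dev != NULL)
		destroy_dev(example_dev);
}
SYSUNINIT(example_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY,
    example_dev_uninit, NULL);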
*/ usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR, 0644, USB_DEVICE_NAME); if (usb_dev == NULL) { DPRINTFN(0, "Could not create usb bus device\n"); } } SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL); static void usb_dev_uninit(void *arg) { if (usb_dev != NULL) { destroy_dev(usb_dev); usb_dev = NULL; } mtx_destroy(&usb_ref_lock); sx_destroy(&usb_sym_lock); } SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL); static int usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr, struct thread *td) { int error = 0; switch (cmd) { case FIODTYPE: *(int *)addr = 0; /* character device */ break; case FIONBIO: /* handled by upper FS layer */ break; case FIOASYNC: if (*(int *)addr) { if (f->async_p != NULL) { error = EBUSY; break; } f->async_p = USB_TD_GET_PROC(td); } else { f->async_p = NULL; } break; /* XXX this is not the most general solution */ case TIOCSPGRP: if (f->async_p == NULL) { error = EINVAL; break; } if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) { error = EPERM; break; } break; default: return (ENOIOCTL); } DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error); return (error); } /*------------------------------------------------------------------------* * usb_ioctl - cdev callback *------------------------------------------------------------------------*/ static int usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; int fflags; int err; DPRINTFN(2, "cmd=0x%lx\n", cmd); err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); /* * Performance optimisation: We try to check for IOCTL's that * don't need the USB reference first. Then we grab the USB * reference if we need it! 
*/ err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); fflags = cpd->fflags; f = NULL; /* set default value */ err = ENOIOCTL; /* set default value */ if (fflags & FWRITE) { f = refs.txfifo; err = usb_ioctl_f_sub(f, cmd, addr, td); } if (fflags & FREAD) { f = refs.rxfifo; err = usb_ioctl_f_sub(f, cmd, addr, td); } KASSERT(f != NULL, ("fifo not found")); if (err != ENOIOCTL) goto done; err = (f->methods->f_ioctl) (f, cmd, addr, fflags); DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err); if (err != ENOIOCTL) goto done; if (usb_usb_ref_device(cpd, &refs)) { /* we lost the reference */ return (ENXIO); } err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags); DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err); if (err == ENOIOCTL) err = ENOTTY; if (err) goto done; /* Wait for re-enumeration, if any */ while (f->udev->re_enumerate_wait != USB_RE_ENUM_DONE) { usb_unref_device(cpd, &refs); usb_pause_mtx(NULL, hz / 128); while (usb_ref_device(cpd, &refs, 1 /* need uref */)) { if (usb_ref_device(cpd, &refs, 0)) { /* device no longer exists */ return (ENXIO); } usb_unref_device(cpd, &refs); usb_pause_mtx(NULL, hz / 128); } } done: usb_unref_device(cpd, &refs); return (err); } static void usb_filter_detach(struct knote *kn) { struct usb_fifo *f = kn->kn_hook; knlist_remove(&f->selinfo.si_note, kn, 0); } static int usb_filter_write(struct knote *kn, long hint) { struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; DPRINTFN(2, "\n"); f = kn->kn_hook; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); cpd = f->curr_cpd; if (cpd == NULL) { m = (void *)1; } else if (f->fs_ep_max == 0) { if (f->flag_iserror) { /* we got an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start write transfer, if not * already started */ (f->methods->f_start_write) (f); } /* check if any packets are available */ USB_IF_POLL(&f->free_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } return (m ? 1 : 0); } static int usb_filter_read(struct knote *kn, long hint) { struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; DPRINTFN(2, "\n"); f = kn->kn_hook; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); cpd = f->curr_cpd; if (cpd == NULL) { m = (void *)1; } else if (f->fs_ep_max == 0) { if (f->flag_iserror) { /* we have an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start read transfer, if not * already started */ (f->methods->f_start_read) (f); } /* check if any packets are available */ USB_IF_POLL(&f->used_q, m); /* start reading data, if any */ if (m == NULL) (f->methods->f_start_read) (f); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } return (m ? 
1 : 0); } -static struct filterops usb_filtops_write = { +static const struct filterops usb_filtops_write = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_write, }; -static struct filterops usb_filtops_read = { +static const struct filterops usb_filtops_read = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_read, }; /* ARGSUSED */ static int usb_kqfilter(struct cdev* dev, struct knote *kn) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; int fflags; int err = EINVAL; DPRINTFN(2, "\n"); if (devfs_get_cdevpriv((void **)&cpd) != 0 || usb_ref_device(cpd, &refs, 0) != 0) return (ENXIO); fflags = cpd->fflags; /* Figure out who needs service */ switch (kn->kn_filter) { case EVFILT_WRITE: if (fflags & FWRITE) { f = refs.txfifo; kn->kn_fop = &usb_filtops_write; err = 0; } break; case EVFILT_READ: if (fflags & FREAD) { f = refs.rxfifo; kn->kn_fop = &usb_filtops_read; err = 0; } break; default: err = EOPNOTSUPP; break; } if (err == 0) { kn->kn_hook = f; mtx_lock(f->priv_mtx); knlist_add(&f->selinfo.si_note, kn, 1); mtx_unlock(f->priv_mtx); } usb_unref_device(cpd, &refs); return (err); } /* ARGSUSED */ static int usb_poll(struct cdev* dev, int events, struct thread* td) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; int fflags, revents; if (devfs_get_cdevpriv((void **)&cpd) != 0 || usb_ref_device(cpd, &refs, 0) != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); fflags = cpd->fflags; /* Figure out who needs service */ revents = 0; if ((events & (POLLOUT | POLLWRNORM)) && (fflags & FWRITE)) { f = refs.txfifo; mtx_lock(f->priv_mtx); if (!refs.is_usbfs) { if (f->flag_iserror) { /* we got an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start write transfer, if not * already started */ (f->methods->f_start_write) (f); } /* check if any packets are available */ USB_IF_POLL(&f->free_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } if (m) { revents |= events & (POLLOUT | POLLWRNORM); } else { f->flag_isselect = 1; selrecord(td, &f->selinfo); } mtx_unlock(f->priv_mtx); } if ((events & (POLLIN | POLLRDNORM)) && (fflags & FREAD)) { f = refs.rxfifo; mtx_lock(f->priv_mtx); if (!refs.is_usbfs) { if (f->flag_iserror) { /* we have an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start read transfer, if not * already started */ (f->methods->f_start_read) (f); } /* check if any packets are available */ USB_IF_POLL(&f->used_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } if (m) { revents |= events & (POLLIN | POLLRDNORM); } else { f->flag_isselect = 1; selrecord(td, &f->selinfo); if (!refs.is_usbfs) { /* start reading data */ (f->methods->f_start_read) (f); } } mtx_unlock(f->priv_mtx); } usb_unref_device(cpd, &refs); return (revents); } static int usb_read(struct cdev *dev, struct uio *uio, int ioflag) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; int io_len; int err; uint8_t tr_data = 0; err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); f = refs.rxfifo; if (f == NULL) { /* should not happen */ usb_unref_device(cpd, &refs); return (EPERM); } mtx_lock(f->priv_mtx); /* check for permanent read error */ if (f->flag_iserror) { err = EIO; goto done; } /* check if USB-FS interface is active */ if (refs.is_usbfs) { /* * 
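usb_poll() above follows the usual cdev poll contract: return the readiness bits that are already true, otherwise selrecord() the polling thread so a later selwakeup()/KNOTE on the same selinfo wakes it up. For a read-only queue the skeleton is roughly the sketch below, with hypothetical example_* names (a softc holding a mutex, a have_data flag and a struct selinfo):

static int
example_poll(struct cdev *dev, int events, struct thread *td)
{
	struct example_softc *sc = dev->si_drv1;
	int revents = 0;

	mtx_lock(&sc->mtx);
	if ((events & (POLLIN | POLLRDNORM)) != 0) {
		if (sc->have_data != 0)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &sc->rsel);	/* wait for selwakeup() */
	}
	mtx_unlock(&sc->mtx);
	return (revents);
}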
The queue is used for events that should be * retrieved using the "USB_FS_COMPLETE" ioctl. */ err = EINVAL; goto done; } while (uio->uio_resid > 0) { USB_IF_DEQUEUE(&f->used_q, m); if (m == NULL) { /* start read transfer, if not already started */ (f->methods->f_start_read) (f); if (ioflag & IO_NDELAY) { if (tr_data) { /* return length before error */ break; } err = EWOULDBLOCK; break; } DPRINTF("sleeping\n"); err = usb_fifo_wait(f); if (err) { break; } continue; } if (f->methods->f_filter_read) { /* * Sometimes it is convenient to process data at the * expense of a userland process instead of a kernel * process. */ (f->methods->f_filter_read) (f, m); } tr_data = 1; io_len = MIN(m->cur_data_len, uio->uio_resid); DPRINTFN(2, "transfer %d bytes from %p\n", io_len, m->cur_data_ptr); err = usb_fifo_uiomove(f, m->cur_data_ptr, io_len, uio); m->cur_data_len -= io_len; m->cur_data_ptr += io_len; if (m->cur_data_len == 0) { uint8_t last_packet; last_packet = m->last_packet; USB_IF_ENQUEUE(&f->free_q, m); if (last_packet) { /* keep framing */ break; } } else { USB_IF_PREPEND(&f->used_q, m); } if (err) { break; } } done: mtx_unlock(f->priv_mtx); usb_unref_device(cpd, &refs); return (err); } static int usb_write(struct cdev *dev, struct uio *uio, int ioflag) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; uint8_t *pdata; int io_len; int err; uint8_t tr_data = 0; DPRINTFN(2, "\n"); err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); f = refs.txfifo; if (f == NULL) { /* should not happen */ usb_unref_device(cpd, &refs); return (EPERM); } mtx_lock(f->priv_mtx); /* check for permanent write error */ if (f->flag_iserror) { err = EIO; goto done; } /* check if USB-FS interface is active */ if (refs.is_usbfs) { /* * The queue is used for events that should be * retrieved using the "USB_FS_COMPLETE" ioctl. */ err = EINVAL; goto done; } if (f->queue_data == NULL) { /* start write transfer, if not already started */ (f->methods->f_start_write) (f); } /* we allow writing zero length data */ do { USB_IF_DEQUEUE(&f->free_q, m); if (m == NULL) { if (ioflag & IO_NDELAY) { if (tr_data) { /* return length before error */ break; } err = EWOULDBLOCK; break; } DPRINTF("sleeping\n"); err = usb_fifo_wait(f); if (err) { break; } continue; } tr_data = 1; if (f->flag_have_fragment == 0) { USB_MBUF_RESET(m); io_len = m->cur_data_len; pdata = m->cur_data_ptr; if (io_len > uio->uio_resid) io_len = uio->uio_resid; m->cur_data_len = io_len; } else { io_len = m->max_data_len - m->cur_data_len; pdata = m->cur_data_ptr + m->cur_data_len; if (io_len > uio->uio_resid) io_len = uio->uio_resid; m->cur_data_len += io_len; } DPRINTFN(2, "transfer %d bytes to %p\n", io_len, pdata); err = usb_fifo_uiomove(f, pdata, io_len, uio); if (err) { f->flag_have_fragment = 0; USB_IF_ENQUEUE(&f->free_q, m); break; } /* check if the buffer is ready to be transmitted */ if ((f->flag_write_defrag == 0) || (m->cur_data_len == m->max_data_len)) { f->flag_have_fragment = 0; /* * Check for write filter: * * Sometimes it is convenient to process data * at the expense of a userland process * instead of a kernel process. 
*/ if (f->methods->f_filter_write) { (f->methods->f_filter_write) (f, m); } /* Put USB mbuf in the used queue */ USB_IF_ENQUEUE(&f->used_q, m); /* Start writing data, if not already started */ (f->methods->f_start_write) (f); } else { /* Wait for more data or close */ f->flag_have_fragment = 1; USB_IF_PREPEND(&f->free_q, m); } } while (uio->uio_resid > 0); done: mtx_unlock(f->priv_mtx); usb_unref_device(cpd, &refs); return (err); } int usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { union { struct usb_read_dir *urd; #ifdef COMPAT_FREEBSD32 struct usb_read_dir32 *urd32; #endif void* data; } u; int err; u.data = data; switch (cmd) { case USB_READ_DIR: err = usb_read_symlink(u.urd->urd_data, u.urd->urd_startentry, u.urd->urd_maxlen); break; #ifdef COMPAT_FREEBSD32 case USB_READ_DIR32: err = usb_read_symlink(PTRIN(u.urd32->urd_data), u.urd32->urd_startentry, u.urd32->urd_maxlen); break; #endif case USB_DEV_QUIRK_GET: case USB_QUIRK_NAME_GET: case USB_DEV_QUIRK_ADD: case USB_DEV_QUIRK_REMOVE: err = usb_quirk_ioctl_p(cmd, data, fflag, td); break; case USB_GET_TEMPLATE: *(int *)data = usb_template; err = 0; break; case USB_SET_TEMPLATE: err = priv_check(curthread, PRIV_DRIVER); if (err) break; usb_template = *(int *)data; break; default: err = ENOTTY; break; } return (err); } static int usb_fifo_uiomove(struct usb_fifo *f, void *cp, int n, struct uio *uio) { int error; mtx_unlock(f->priv_mtx); /* * "uiomove()" can sleep so one needs to make a wrapper, * exiting the mutex and checking things: */ error = uiomove(cp, n, uio); mtx_lock(f->priv_mtx); return (error); } int usb_fifo_wait(struct usb_fifo *f) { int err; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); if (f->flag_iserror) { /* we are gone */ return (EIO); } f->flag_sleeping = 1; err = cv_wait_sig(&f->cv_io, f->priv_mtx); if (f->flag_iserror) { /* we are gone */ err = EIO; } return (err); } void usb_fifo_signal(struct usb_fifo *f) { if (f->flag_sleeping) { f->flag_sleeping = 0; cv_broadcast(&f->cv_io); } } void usb_fifo_wakeup(struct usb_fifo *f) { usb_fifo_signal(f); KNOTE_LOCKED(&f->selinfo.si_note, 0); if (f->flag_isselect) { selwakeup(&f->selinfo); f->flag_isselect = 0; } if (f->async_p != NULL) { PROC_LOCK(f->async_p); kern_psignal(f->async_p, SIGIO); PROC_UNLOCK(f->async_p); } } static int usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags) { return (0); } static void usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags) { return; } static int usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags) { return (ENOIOCTL); } static void usb_fifo_dummy_cmd(struct usb_fifo *fifo) { fifo->flag_flushing = 0; /* not flushing */ } static void usb_fifo_check_methods(struct usb_fifo_methods *pm) { /* check that all callback functions are OK */ if (pm->f_open == NULL) pm->f_open = &usb_fifo_dummy_open; if (pm->f_close == NULL) pm->f_close = &usb_fifo_dummy_close; if (pm->f_ioctl == NULL) pm->f_ioctl = &usb_fifo_dummy_ioctl; if (pm->f_ioctl_post == NULL) pm->f_ioctl_post = &usb_fifo_dummy_ioctl; if (pm->f_start_read == NULL) pm->f_start_read = &usb_fifo_dummy_cmd; if (pm->f_stop_read == NULL) pm->f_stop_read = &usb_fifo_dummy_cmd; if (pm->f_start_write == NULL) pm->f_start_write = &usb_fifo_dummy_cmd; if (pm->f_stop_write == NULL) pm->f_stop_write = &usb_fifo_dummy_cmd; } /*------------------------------------------------------------------------* * usb_fifo_attach * * The following function will create a duplex FIFO. * * Return values: * 0: Success. * Else: Failure. 
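usb_fifo_uiomove() above exists because uiomove(9) may sleep when it faults on user memory, which is not allowed while a regular mutex is held. The conventional fix is a small wrapper that drops the lock around the copy; the caller must then re-check any state it derived before unlocking. A minimal sketch:

static int
example_uiomove_unlocked(struct mtx *m, void *cp, int n, struct uio *uio)
{
	int error;

	mtx_unlock(m);			/* uiomove() may sleep */
	error = uiomove(cp, n, uio);
	mtx_lock(m);			/* caller re-validates its state here */
	return (error);
}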
*------------------------------------------------------------------------*/ int usb_fifo_attach(struct usb_device *udev, void *priv_sc, struct mtx *priv_mtx, struct usb_fifo_methods *pm, struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit, uint8_t iface_index, uid_t uid, gid_t gid, int mode) { struct usb_fifo *f_tx; struct usb_fifo *f_rx; char devname[32]; uint8_t n; f_sc->fp[USB_FIFO_TX] = NULL; f_sc->fp[USB_FIFO_RX] = NULL; if (pm == NULL) return (EINVAL); /* check the methods */ usb_fifo_check_methods(pm); if (priv_mtx == NULL) priv_mtx = &Giant; /* search for a free FIFO slot */ for (n = 0;; n += 2) { if (n == USB_FIFO_MAX) { /* end of FIFOs reached */ return (ENOMEM); } /* Check for TX FIFO */ if (udev->fifo[n + USB_FIFO_TX] != NULL) { continue; } /* Check for RX FIFO */ if (udev->fifo[n + USB_FIFO_RX] != NULL) { continue; } break; } f_tx = usb_fifo_alloc(priv_mtx); f_rx = usb_fifo_alloc(priv_mtx); if ((f_tx == NULL) || (f_rx == NULL)) { usb_fifo_free(f_tx); usb_fifo_free(f_rx); return (ENOMEM); } /* initialise FIFO structures */ f_tx->fifo_index = n + USB_FIFO_TX; f_tx->dev_ep_index = -1; f_tx->priv_sc0 = priv_sc; f_tx->methods = pm; f_tx->iface_index = iface_index; f_tx->udev = udev; f_rx->fifo_index = n + USB_FIFO_RX; f_rx->dev_ep_index = -1; f_rx->priv_sc0 = priv_sc; f_rx->methods = pm; f_rx->iface_index = iface_index; f_rx->udev = udev; f_sc->fp[USB_FIFO_TX] = f_tx; f_sc->fp[USB_FIFO_RX] = f_rx; mtx_lock(&usb_ref_lock); udev->fifo[f_tx->fifo_index] = f_tx; udev->fifo[f_rx->fifo_index] = f_rx; mtx_unlock(&usb_ref_lock); for (n = 0; n != 4; n++) { if (pm->basename[n] == NULL) { continue; } if (subunit < 0) { if (snprintf(devname, sizeof(devname), "%s%u%s", pm->basename[n], unit, pm->postfix[n] ? pm->postfix[n] : "")) { /* ignore */ } } else { if (snprintf(devname, sizeof(devname), "%s%u.%d%s", pm->basename[n], unit, subunit, pm->postfix[n] ? pm->postfix[n] : "")) { /* ignore */ } } /* * Distribute the symbolic links into two FIFO structures: */ if (n & 1) { f_rx->symlink[n / 2] = usb_alloc_symlink(devname); } else { f_tx->symlink[n / 2] = usb_alloc_symlink(devname); } /* Create the device */ f_sc->dev = usb_make_dev(udev, devname, -1, f_tx->fifo_index & f_rx->fifo_index, FREAD|FWRITE, uid, gid, mode); } DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx); return (0); } /*------------------------------------------------------------------------* * usb_fifo_alloc_buffer * * Return values: * 0: Success * Else failure *------------------------------------------------------------------------*/ int usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize, uint16_t nbuf) { struct usb_ifqueue temp_q = {}; void *queue_data; usb_fifo_free_buffer(f); temp_q.ifq_maxlen = nbuf; queue_data = usb_alloc_mbufs( M_USBDEV, &temp_q, bufsize, nbuf); if (queue_data == NULL && bufsize != 0 && nbuf != 0) return (ENOMEM); mtx_lock(f->priv_mtx); /* * Setup queues and sizes under lock to avoid early use by * concurrent FIFO access: */ f->free_q = temp_q; f->used_q.ifq_maxlen = nbuf; f->queue_data = queue_data; mtx_unlock(f->priv_mtx); return (0); /* success */ } /*------------------------------------------------------------------------* * usb_fifo_free_buffer * * This function will free the buffers associated with a FIFO. This * function can be called multiple times in a row. *------------------------------------------------------------------------*/ void usb_fifo_free_buffer(struct usb_fifo *f) { void *queue_data; mtx_lock(f->priv_mtx); /* Get and clear pointer to free, if any. 
*/ queue_data = f->queue_data; f->queue_data = NULL; /* * Reset queues under lock to avoid use of freed buffers by * concurrent FIFO activity: */ memset(&f->free_q, 0, sizeof(f->free_q)); memset(&f->used_q, 0, sizeof(f->used_q)); mtx_unlock(f->priv_mtx); /* Free old buffer, if any. */ free(queue_data, M_USBDEV); } void usb_fifo_detach(struct usb_fifo_sc *f_sc) { if (f_sc == NULL) { return; } usb_fifo_free(f_sc->fp[USB_FIFO_TX]); usb_fifo_free(f_sc->fp[USB_FIFO_RX]); f_sc->fp[USB_FIFO_TX] = NULL; f_sc->fp[USB_FIFO_RX] = NULL; usb_destroy_dev(f_sc->dev); f_sc->dev = NULL; DPRINTFN(2, "detached %p\n", f_sc); } usb_size_t usb_fifo_put_bytes_max(struct usb_fifo *f) { struct usb_mbuf *m; usb_size_t len; USB_IF_POLL(&f->free_q, m); if (m) { len = m->max_data_len; } else { len = 0; } return (len); } /*------------------------------------------------------------------------* * usb_fifo_put_data * * what: * 0 - normal operation * 1 - set last packet flag to enforce framing *------------------------------------------------------------------------*/ void usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc, usb_frlength_t offset, usb_frlength_t len, uint8_t what) { struct usb_mbuf *m; usb_frlength_t io_len; while (len || (what == 1)) { USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_MBUF_RESET(m); io_len = MIN(len, m->cur_data_len); usbd_copy_out(pc, offset, m->cur_data_ptr, io_len); m->cur_data_len = io_len; offset += io_len; len -= io_len; if ((len == 0) && (what == 1)) { m->last_packet = 1; } USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); if ((len == 0) || (what == 1)) { break; } } else { break; } } } void usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr, usb_size_t len, uint8_t what) { struct usb_mbuf *m; usb_size_t io_len; while (len || (what == 1)) { USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_MBUF_RESET(m); io_len = MIN(len, m->cur_data_len); memcpy(m->cur_data_ptr, ptr, io_len); m->cur_data_len = io_len; ptr = USB_ADD_BYTES(ptr, io_len); len -= io_len; if ((len == 0) && (what == 1)) { m->last_packet = 1; } USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); if ((len == 0) || (what == 1)) { break; } } else { break; } } } uint8_t usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len) { struct usb_mbuf *m; USB_IF_DEQUEUE(&f->free_q, m); if (m) { m->cur_data_len = len; m->cur_data_ptr = ptr; USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); return (1); } return (0); } void usb_fifo_put_data_error(struct usb_fifo *f) { f->flag_iserror = 1; usb_fifo_wakeup(f); } /*------------------------------------------------------------------------* * usb_fifo_get_data * * what: * 0 - normal operation * 1 - only get one "usb_mbuf" * * returns: * 0 - no more data * 1 - data in buffer *------------------------------------------------------------------------*/ uint8_t usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc, usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen, uint8_t what) { struct usb_mbuf *m; usb_frlength_t io_len; uint8_t tr_data = 0; actlen[0] = 0; while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { tr_data = 1; io_len = MIN(len, m->cur_data_len); usbd_copy_in(pc, offset, m->cur_data_ptr, io_len); len -= io_len; offset += io_len; actlen[0] += io_len; m->cur_data_ptr += io_len; m->cur_data_len -= io_len; if ((m->cur_data_len == 0) || (what == 1)) { USB_IF_ENQUEUE(&f->free_q, m); usb_fifo_wakeup(f); if (what == 1) { break; } } else { USB_IF_PREPEND(&f->used_q, m); } } else { if (tr_data) { /* wait for data to be written out */ break; } if 
(f->flag_flushing) { /* check if we should send a short packet */ if (f->flag_short != 0) { f->flag_short = 0; tr_data = 1; break; } /* flushing complete */ f->flag_flushing = 0; usb_fifo_wakeup(f); } break; } if (len == 0) { break; } } return (tr_data); } uint8_t usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr, usb_size_t len, usb_size_t *actlen, uint8_t what) { struct usb_mbuf *m; usb_size_t io_len; uint8_t tr_data = 0; actlen[0] = 0; while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { tr_data = 1; io_len = MIN(len, m->cur_data_len); memcpy(ptr, m->cur_data_ptr, io_len); len -= io_len; ptr = USB_ADD_BYTES(ptr, io_len); actlen[0] += io_len; m->cur_data_ptr += io_len; m->cur_data_len -= io_len; if ((m->cur_data_len == 0) || (what == 1)) { USB_IF_ENQUEUE(&f->free_q, m); usb_fifo_wakeup(f); if (what == 1) { break; } } else { USB_IF_PREPEND(&f->used_q, m); } } else { if (tr_data) { /* wait for data to be written out */ break; } if (f->flag_flushing) { /* check if we should send a short packet */ if (f->flag_short != 0) { f->flag_short = 0; tr_data = 1; break; } /* flushing complete */ f->flag_flushing = 0; usb_fifo_wakeup(f); } break; } if (len == 0) { break; } } return (tr_data); } uint8_t usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen) { struct usb_mbuf *m; USB_IF_POLL(&f->used_q, m); if (m) { *plen = m->cur_data_len; *pptr = m->cur_data_ptr; return (1); } return (0); } void usb_fifo_get_data_error(struct usb_fifo *f) { f->flag_iserror = 1; usb_fifo_wakeup(f); } /*------------------------------------------------------------------------* * usb_alloc_symlink * * Return values: * NULL: Failure * Else: Pointer to symlink entry *------------------------------------------------------------------------*/ struct usb_symlink * usb_alloc_symlink(const char *target) { struct usb_symlink *ps; ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK); /* XXX no longer needed */ strlcpy(ps->src_path, target, sizeof(ps->src_path)); ps->src_len = strlen(ps->src_path); strlcpy(ps->dst_path, target, sizeof(ps->dst_path)); ps->dst_len = strlen(ps->dst_path); sx_xlock(&usb_sym_lock); TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry); sx_unlock(&usb_sym_lock); return (ps); } /*------------------------------------------------------------------------* * usb_free_symlink *------------------------------------------------------------------------*/ void usb_free_symlink(struct usb_symlink *ps) { if (ps == NULL) { return; } sx_xlock(&usb_sym_lock); TAILQ_REMOVE(&usb_sym_head, ps, sym_entry); sx_unlock(&usb_sym_lock); free(ps, M_USBDEV); } /*------------------------------------------------------------------------* * usb_read_symlink * * Return value: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ int usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len) { struct usb_symlink *ps; uint32_t temp; uint32_t delta = 0; uint8_t len; int error = 0; sx_xlock(&usb_sym_lock); TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) { /* * Compute total length of source and destination symlink * strings pluss one length byte and two NUL bytes: */ temp = ps->src_len + ps->dst_len + 3; if (temp > 255) { /* * Skip entry because this length cannot fit * into one byte: */ continue; } if (startentry != 0) { /* decrement read offset */ startentry--; continue; } if (temp > user_len) { /* out of buffer space */ break; } len = temp; /* copy out total length */ error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; /* 
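The usb_read_symlink() loop here serializes every symlink entry as one length byte followed by the NUL-terminated source and destination strings (hence src_len + dst_len + 3), and ends the listing with a zero length byte. A hypothetical userland decoder for a buffer filled in through the USB_READ_DIR ioctl could look like this sketch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
example_parse_usb_dir(const uint8_t *buf, size_t buflen)
{
	size_t off = 0;

	while (off < buflen && buf[off] != 0) {
		uint8_t len = buf[off];	/* total record length, incl. this byte */
		const char *src = (const char *)buf + off + 1;
		const char *dst = src + strlen(src) + 1;

		printf("%s -> %s\n", src, dst);
		off += len;		/* the next record starts right after */
	}
	/* a zero length byte (or running out of buffer) ends the listing */
}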
copy out source string */ error = copyout(ps->src_path, USB_ADD_BYTES(user_ptr, delta), ps->src_len); if (error) { break; } len = 0; delta += ps->src_len; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; /* copy out destination string */ error = copyout(ps->dst_path, USB_ADD_BYTES(user_ptr, delta), ps->dst_len); if (error) { break; } len = 0; delta += ps->dst_len; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; user_len -= temp; } /* a zero length entry indicates the end */ if ((user_len != 0) && (error == 0)) { len = 0; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); } sx_unlock(&usb_sym_lock); return (error); } void usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff) { if (f == NULL) return; /* send a Zero Length Packet, ZLP, before close */ f->flag_short = onoff; } void usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff) { if (f == NULL) return; /* defrag written data */ f->flag_write_defrag = onoff; /* reset defrag state */ f->flag_have_fragment = 0; } void * usb_fifo_softc(struct usb_fifo *f) { return (f->priv_sc0); } #endif /* USB_HAVE_UGEN */ diff --git a/sys/fs/cuse/cuse.c b/sys/fs/cuse/cuse.c index 9ef234c35427..e32154654386 100644 --- a/sys/fs/cuse/cuse.c +++ b/sys/fs/cuse/cuse.c @@ -1,2042 +1,2042 @@ /*- * Copyright (c) 2010-2022 Hans Petter Selasky * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* set this define to zero to disable this feature */ #define CUSE_COPY_BUFFER_MAX \ CUSE_BUFFER_MAX #define CUSE_ALLOC_PAGES_MAX \ (CUSE_ALLOC_BYTES_MAX / PAGE_SIZE) #if (CUSE_ALLOC_PAGES_MAX == 0) #error "PAGE_SIZE is too big!" 
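/* This error fires only when CUSE_ALLOC_BYTES_MAX is smaller than PAGE_SIZE, i.e. when the integer division above yields zero pages per allocation. */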
#endif static int cuse_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: return (0); default: return (EOPNOTSUPP); } } static moduledata_t cuse_mod = { .name = "cuse", .evhand = &cuse_modevent, }; DECLARE_MODULE(cuse, cuse_mod, SI_SUB_DEVFS, SI_ORDER_FIRST); MODULE_VERSION(cuse, 1); /* * Prevent cuse4bsd.ko and cuse.ko from loading at the same time by * declaring support for the cuse4bsd interface in cuse.ko: */ MODULE_VERSION(cuse4bsd, 1); #ifdef FEATURE FEATURE(cuse, "Userspace character devices"); #endif struct cuse_command; struct cuse_server; struct cuse_client; struct cuse_client_command { TAILQ_ENTRY(cuse_client_command) entry; struct cuse_command sub; struct sx sx; struct cv cv; struct thread *entered; struct cuse_client *client; struct proc *proc_curr; int proc_refs; int got_signal; int error; int command; }; struct cuse_memory { TAILQ_ENTRY(cuse_memory) entry; vm_object_t object; uint32_t page_count; uint32_t alloc_nr; }; struct cuse_server_dev { TAILQ_ENTRY(cuse_server_dev) entry; struct cuse_server *server; struct cdev *kern_dev; struct cuse_dev *user_dev; }; struct cuse_server { TAILQ_ENTRY(cuse_server) entry; TAILQ_HEAD(, cuse_client_command) head; TAILQ_HEAD(, cuse_server_dev) hdev; TAILQ_HEAD(, cuse_client) hcli; TAILQ_HEAD(, cuse_memory) hmem; struct mtx mtx; struct cv cv; struct selinfo selinfo; pid_t pid; int is_closing; int refs; }; struct cuse_client { TAILQ_ENTRY(cuse_client) entry; TAILQ_ENTRY(cuse_client) entry_ref; struct cuse_client_command cmds[CUSE_CMD_MAX]; struct cuse_server *server; struct cuse_server_dev *server_dev; uintptr_t read_base; uintptr_t write_base; int read_length; int write_length; uint8_t read_buffer[CUSE_COPY_BUFFER_MAX] __aligned(4); uint8_t write_buffer[CUSE_COPY_BUFFER_MAX] __aligned(4); uint8_t ioctl_buffer[CUSE_BUFFER_MAX] __aligned(4); int fflags; /* file flags */ int cflags; /* client flags */ #define CUSE_CLI_IS_CLOSING 0x01 #define CUSE_CLI_KNOTE_NEED_READ 0x02 #define CUSE_CLI_KNOTE_NEED_WRITE 0x04 #define CUSE_CLI_KNOTE_HAS_READ 0x08 #define CUSE_CLI_KNOTE_HAS_WRITE 0x10 }; #define CUSE_CLIENT_CLOSING(pcc) \ ((pcc)->cflags & CUSE_CLI_IS_CLOSING) static MALLOC_DEFINE(M_CUSE, "cuse", "CUSE memory"); static TAILQ_HEAD(, cuse_server) cuse_server_head; static struct mtx cuse_global_mtx; static struct cdev *cuse_dev; static struct cuse_server *cuse_alloc_unit[CUSE_DEVICES_MAX]; static int cuse_alloc_unit_id[CUSE_DEVICES_MAX]; static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs); static void cuse_client_kqfilter_read_detach(struct knote *kn); static void cuse_client_kqfilter_write_detach(struct knote *kn); static int cuse_client_kqfilter_read_event(struct knote *kn, long hint); static int cuse_client_kqfilter_write_event(struct knote *kn, long hint); -static struct filterops cuse_client_kqfilter_read_ops = { +static const struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, }; -static struct filterops cuse_client_kqfilter_write_ops = { +static const struct filterops cuse_client_kqfilter_write_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, }; static d_open_t cuse_client_open; static d_close_t cuse_client_close; static d_ioctl_t cuse_client_ioctl; static d_read_t cuse_client_read; static d_write_t cuse_client_write; static d_poll_t cuse_client_poll; static d_mmap_single_t cuse_client_mmap_single; static 
d_kqfilter_t cuse_client_kqfilter; static struct cdevsw cuse_client_devsw = { .d_version = D_VERSION, .d_open = cuse_client_open, .d_close = cuse_client_close, .d_ioctl = cuse_client_ioctl, .d_name = "cuse_client", .d_flags = D_TRACKCLOSE, .d_read = cuse_client_read, .d_write = cuse_client_write, .d_poll = cuse_client_poll, .d_mmap_single = cuse_client_mmap_single, .d_kqfilter = cuse_client_kqfilter, }; static d_open_t cuse_server_open; static d_close_t cuse_server_close; static d_ioctl_t cuse_server_ioctl; static d_read_t cuse_server_read; static d_write_t cuse_server_write; static d_poll_t cuse_server_poll; static d_mmap_single_t cuse_server_mmap_single; static struct cdevsw cuse_server_devsw = { .d_version = D_VERSION, .d_open = cuse_server_open, .d_close = cuse_server_close, .d_ioctl = cuse_server_ioctl, .d_name = "cuse_server", .d_flags = D_TRACKCLOSE, .d_read = cuse_server_read, .d_write = cuse_server_write, .d_poll = cuse_server_poll, .d_mmap_single = cuse_server_mmap_single, }; static void cuse_client_is_closing(struct cuse_client *); static int cuse_free_unit_by_id_locked(struct cuse_server *, int); static void cuse_global_lock(void) { mtx_lock(&cuse_global_mtx); } static void cuse_global_unlock(void) { mtx_unlock(&cuse_global_mtx); } static void cuse_server_lock(struct cuse_server *pcs) { mtx_lock(&pcs->mtx); } static void cuse_server_unlock(struct cuse_server *pcs) { mtx_unlock(&pcs->mtx); } static bool cuse_server_is_locked(struct cuse_server *pcs) { return (mtx_owned(&pcs->mtx)); } static void cuse_cmd_lock(struct cuse_client_command *pccmd) { sx_xlock(&pccmd->sx); } static void cuse_cmd_unlock(struct cuse_client_command *pccmd) { sx_xunlock(&pccmd->sx); } static void cuse_kern_init(void *arg) { TAILQ_INIT(&cuse_server_head); mtx_init(&cuse_global_mtx, "cuse-global-mtx", NULL, MTX_DEF); cuse_dev = make_dev(&cuse_server_devsw, 0, UID_ROOT, GID_OPERATOR, 0600, "cuse"); printf("Cuse v%d.%d.%d @ /dev/cuse\n", (CUSE_VERSION >> 16) & 0xFF, (CUSE_VERSION >> 8) & 0xFF, (CUSE_VERSION >> 0) & 0xFF); } SYSINIT(cuse_kern_init, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_init, NULL); static void cuse_kern_uninit(void *arg) { void *ptr; while (1) { printf("Cuse: Please exit all /dev/cuse instances " "and processes which have used this device.\n"); pause("DRAIN", 2 * hz); cuse_global_lock(); ptr = TAILQ_FIRST(&cuse_server_head); cuse_global_unlock(); if (ptr == NULL) break; } if (cuse_dev != NULL) destroy_dev(cuse_dev); mtx_destroy(&cuse_global_mtx); } SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, 0); static int cuse_server_get(struct cuse_server **ppcs) { struct cuse_server *pcs; int error; error = devfs_get_cdevpriv((void **)&pcs); if (error != 0) { *ppcs = NULL; return (error); } if (pcs->is_closing) { *ppcs = NULL; return (EINVAL); } *ppcs = pcs; return (0); } static void cuse_server_is_closing(struct cuse_server *pcs) { struct cuse_client *pcc; if (pcs->is_closing) return; pcs->is_closing = 1; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { cuse_client_is_closing(pcc); } } static struct cuse_client_command * cuse_server_find_command(struct cuse_server *pcs, struct thread *td) { struct cuse_client *pcc; int n; if (pcs->is_closing) goto done; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (CUSE_CLIENT_CLOSING(pcc)) continue; for (n = 0; n != CUSE_CMD_MAX; n++) { if (pcc->cmds[n].entered == td) return (&pcc->cmds[n]); } } done: return (NULL); } static void cuse_str_filter(char *ptr) { int c; while (((c = *ptr) != 0)) { if ((c >= 'a') && (c <= 'z')) { ptr++; continue; } if ((c >= 
'A') && (c <= 'Z')) { ptr++; continue; } if ((c >= '0') && (c <= '9')) { ptr++; continue; } if ((c == '.') || (c == '_') || (c == '/')) { ptr++; continue; } *ptr = '_'; ptr++; } } static int cuse_convert_error(int error) { ; /* indent fix */ switch (error) { case CUSE_ERR_NONE: return (0); case CUSE_ERR_BUSY: return (EBUSY); case CUSE_ERR_WOULDBLOCK: return (EWOULDBLOCK); case CUSE_ERR_INVALID: return (EINVAL); case CUSE_ERR_NO_MEMORY: return (ENOMEM); case CUSE_ERR_FAULT: return (EFAULT); case CUSE_ERR_SIGNAL: return (EINTR); case CUSE_ERR_NO_DEVICE: return (ENODEV); default: return (ENXIO); } } static void cuse_vm_memory_free(struct cuse_memory *mem) { /* last user is gone - free */ vm_object_deallocate(mem->object); /* free CUSE memory */ free(mem, M_CUSE); } static int cuse_server_alloc_memory(struct cuse_server *pcs, uint32_t alloc_nr, uint32_t page_count) { struct cuse_memory *temp; struct cuse_memory *mem; vm_object_t object; int error; mem = malloc(sizeof(*mem), M_CUSE, M_WAITOK | M_ZERO); object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * page_count, VM_PROT_DEFAULT, 0, curthread->td_ucred); if (object == NULL) { error = ENOMEM; goto error_0; } cuse_server_lock(pcs); /* check if allocation number already exists */ TAILQ_FOREACH(temp, &pcs->hmem, entry) { if (temp->alloc_nr == alloc_nr) break; } if (temp != NULL) { cuse_server_unlock(pcs); error = EBUSY; goto error_1; } mem->object = object; mem->page_count = page_count; mem->alloc_nr = alloc_nr; TAILQ_INSERT_TAIL(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); return (0); error_1: vm_object_deallocate(object); error_0: free(mem, M_CUSE); return (error); } static int cuse_server_free_memory(struct cuse_server *pcs, uint32_t alloc_nr) { struct cuse_memory *mem; cuse_server_lock(pcs); TAILQ_FOREACH(mem, &pcs->hmem, entry) { if (mem->alloc_nr == alloc_nr) break; } if (mem == NULL) { cuse_server_unlock(pcs); return (EINVAL); } TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); return (0); } static int cuse_client_get(struct cuse_client **ppcc) { struct cuse_client *pcc; int error; /* try to get private data */ error = devfs_get_cdevpriv((void **)&pcc); if (error != 0) { *ppcc = NULL; return (error); } if (CUSE_CLIENT_CLOSING(pcc) || pcc->server->is_closing) { *ppcc = NULL; return (EINVAL); } *ppcc = pcc; return (0); } static void cuse_client_is_closing(struct cuse_client *pcc) { struct cuse_client_command *pccmd; uint32_t n; if (CUSE_CLIENT_CLOSING(pcc)) return; pcc->cflags |= CUSE_CLI_IS_CLOSING; pcc->server_dev = NULL; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; if (pccmd->entry.tqe_prev != NULL) { TAILQ_REMOVE(&pcc->server->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; } cv_broadcast(&pccmd->cv); } } static void cuse_client_send_command_locked(struct cuse_client_command *pccmd, uintptr_t data_ptr, unsigned long arg, int fflags, int ioflag) { unsigned long cuse_fflags = 0; struct cuse_server *pcs; if (fflags & FREAD) cuse_fflags |= CUSE_FFLAG_READ; if (fflags & FWRITE) cuse_fflags |= CUSE_FFLAG_WRITE; if (ioflag & IO_NDELAY) cuse_fflags |= CUSE_FFLAG_NONBLOCK; #if defined(__LP64__) if (SV_CURPROC_FLAG(SV_ILP32)) cuse_fflags |= CUSE_FFLAG_COMPAT32; #endif pccmd->sub.fflags = cuse_fflags; pccmd->sub.data_pointer = data_ptr; pccmd->sub.argument = arg; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static void 
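/* Queue a CUSE_CMD_SIGNAL command so the userspace server is told that a signal interrupted the waiting client: */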
cuse_client_got_signal(struct cuse_client_command *pccmd) { struct cuse_server *pcs; pccmd->got_signal = 1; pccmd = &pccmd->client->cmds[CUSE_CMD_SIGNAL]; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static int cuse_client_receive_command_locked(struct cuse_client_command *pccmd, uint8_t *arg_ptr, uint32_t arg_len) { struct cuse_server *pcs; int error; pcs = pccmd->client->server; error = 0; pccmd->proc_curr = curthread->td_proc; if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } while (pccmd->command == CUSE_CMD_NONE) { if (error != 0) { cv_wait(&pccmd->cv, &pcs->mtx); } else { error = cv_wait_sig(&pccmd->cv, &pcs->mtx); if (error != 0) cuse_client_got_signal(pccmd); } if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } } error = pccmd->error; pccmd->command = CUSE_CMD_NONE; cv_signal(&pccmd->cv); done: /* wait until all process references are gone */ pccmd->proc_curr = NULL; while (pccmd->proc_refs != 0) cv_wait(&pccmd->cv, &pcs->mtx); return (error); } /*------------------------------------------------------------------------* * CUSE SERVER PART *------------------------------------------------------------------------*/ static void cuse_server_free_dev(struct cuse_server_dev *pcsd) { struct cuse_server *pcs; struct cuse_client *pcc; /* get server pointer */ pcs = pcsd->server; /* prevent creation of more devices */ cuse_server_lock(pcs); if (pcsd->kern_dev != NULL) pcsd->kern_dev->si_drv1 = NULL; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (pcc->server_dev == pcsd) cuse_client_is_closing(pcc); } cuse_server_unlock(pcs); /* destroy device, if any */ if (pcsd->kern_dev != NULL) { /* destroy device synchronously */ destroy_dev(pcsd->kern_dev); } free(pcsd, M_CUSE); } static void cuse_server_unref(struct cuse_server *pcs) { struct cuse_server_dev *pcsd; struct cuse_memory *mem; cuse_server_lock(pcs); if (--(pcs->refs) != 0) { cuse_server_unlock(pcs); return; } cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); cuse_global_lock(); TAILQ_REMOVE(&cuse_server_head, pcs, entry); cuse_global_unlock(); while ((pcsd = TAILQ_FIRST(&pcs->hdev)) != NULL) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); } cuse_free_unit_by_id_locked(pcs, -1); while ((mem = TAILQ_FIRST(&pcs->hmem)) != NULL) { TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); cuse_server_lock(pcs); } knlist_clear(&pcs->selinfo.si_note, 1); knlist_destroy(&pcs->selinfo.si_note); cuse_server_unlock(pcs); seldrain(&pcs->selinfo); cv_destroy(&pcs->cv); mtx_destroy(&pcs->mtx); free(pcs, M_CUSE); } static int cuse_server_do_close(struct cuse_server *pcs) { int retval; cuse_server_lock(pcs); cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); knlist_clear(&pcs->selinfo.si_note, 1); retval = pcs->refs; cuse_server_unlock(pcs); return (retval); } static void cuse_server_free(void *arg) { struct cuse_server *pcs = arg; /* * The final server unref should be done by the server thread * to prevent deadlock in the client cdevpriv destructor, * which cannot destroy itself. 
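* The loop below therefore waits until cuse_server_do_close() reports that only one reference is left before dropping it via cuse_server_unref().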
*/ while (cuse_server_do_close(pcs) != 1) pause("W", hz); /* drop final refcount */ cuse_server_unref(pcs); } static int cuse_server_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct cuse_server *pcs; pcs = malloc(sizeof(*pcs), M_CUSE, M_WAITOK | M_ZERO); if (devfs_set_cdevpriv(pcs, &cuse_server_free)) { printf("Cuse: Cannot set cdevpriv.\n"); free(pcs, M_CUSE); return (ENOMEM); } /* store current process ID */ pcs->pid = curproc->p_pid; TAILQ_INIT(&pcs->head); TAILQ_INIT(&pcs->hdev); TAILQ_INIT(&pcs->hcli); TAILQ_INIT(&pcs->hmem); cv_init(&pcs->cv, "cuse-server-cv"); mtx_init(&pcs->mtx, "cuse-server-mtx", NULL, MTX_DEF); knlist_init_mtx(&pcs->selinfo.si_note, &pcs->mtx); cuse_global_lock(); pcs->refs++; TAILQ_INSERT_TAIL(&cuse_server_head, pcs, entry); cuse_global_unlock(); return (0); } static int cuse_server_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_server *pcs; if (cuse_server_get(&pcs) == 0) cuse_server_do_close(pcs); return (0); } static int cuse_server_read(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_write(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_ioctl_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { struct proc *p_proc; uint32_t offset; int error; offset = pchk->peer_ptr - CUSE_BUF_MIN_PTR; if (pchk->length > CUSE_BUFFER_MAX) return (EFAULT); if (offset >= CUSE_BUFFER_MAX) return (EFAULT); if ((offset + pchk->length) > CUSE_BUFFER_MAX) return (EFAULT); p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (!isread) { error = copyin( (void *)pchk->local_ptr, pccmd->client->ioctl_buffer + offset, pchk->length); } else { error = copyout( pccmd->client->ioctl_buffer + offset, (void *)pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_proc2proc_copy(struct proc *proc_s, vm_offset_t data_s, struct proc *proc_d, vm_offset_t data_d, size_t len) { struct thread *td; struct proc *proc_cur; int error; td = curthread; proc_cur = td->td_proc; if (proc_cur == proc_d) { struct iovec iov = { .iov_base = (caddr_t)data_d, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_s, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td, }; PHOLD(proc_s); error = proc_rwmem(proc_s, &uio); PRELE(proc_s); } else if (proc_cur == proc_s) { struct iovec iov = { .iov_base = (caddr_t)data_s, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_d, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_WRITE, .uio_td = td, }; PHOLD(proc_d); error = proc_rwmem(proc_d, &uio); PRELE(proc_d); } else { error = EINVAL; } return (error); } static int cuse_server_data_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { struct proc *p_proc; int error; p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (!isread) { error = cuse_proc2proc_copy( curthread->td_proc, pchk->local_ptr, p_proc, pchk->peer_ptr, pchk->length); } else { error = cuse_proc2proc_copy( p_proc, pchk->peer_ptr, curthread->td_proc, 
pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_server_data_copy_optimized_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { uintptr_t offset; int error; /* * Check if data is stored locally to avoid accessing * other process's data space: */ if (isread) { offset = pchk->peer_ptr - pccmd->client->write_base; if (offset < (uintptr_t)pccmd->client->write_length && pchk->length <= (unsigned long)pccmd->client->write_length && offset + pchk->length <= (uintptr_t)pccmd->client->write_length) { cuse_server_unlock(pcs); error = copyout(pccmd->client->write_buffer + offset, (void *)pchk->local_ptr, pchk->length); goto done; } } else { offset = pchk->peer_ptr - pccmd->client->read_base; if (offset < (uintptr_t)pccmd->client->read_length && pchk->length <= (unsigned long)pccmd->client->read_length && offset + pchk->length <= (uintptr_t)pccmd->client->read_length) { cuse_server_unlock(pcs); error = copyin((void *)pchk->local_ptr, pccmd->client->read_buffer + offset, pchk->length); goto done; } } /* use process to process copy function */ error = cuse_server_data_copy_locked(pcs, pccmd, pchk, isread); done: return (error); } static int cuse_alloc_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int x = 0; int match; do { for (match = n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] != NULL) { if ((cuse_alloc_unit_id[n] ^ id) & CUSE_ID_MASK) continue; if ((cuse_alloc_unit_id[n] & ~CUSE_ID_MASK) == x) { x++; match = 1; } } } } while (match); if (x < 256) { for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == NULL) { cuse_alloc_unit[n] = pcs; cuse_alloc_unit_id[n] = id | x; return (x); } } } return (-1); } static void cuse_server_wakeup_locked(struct cuse_server *pcs) { selwakeup(&pcs->selinfo); KNOTE_LOCKED(&pcs->selinfo.si_note, 0); } static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs) { struct cuse_client *pcc; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { pcc->cflags |= (CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); } cuse_server_wakeup_locked(pcs); } static int cuse_free_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int found = 0; for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == pcs) { if (cuse_alloc_unit_id[n] == id || id == -1) { cuse_alloc_unit[n] = NULL; cuse_alloc_unit_id[n] = 0; found = 1; } } } return (found ? 
0 : EINVAL); } static int cuse_server_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); switch (cmd) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_command *pcmd; struct cuse_alloc_info *pai; struct cuse_create_dev *pcd; struct cuse_server_dev *pcsd; struct cuse_data_chunk *pchk; int n; case CUSE_IOCTL_GET_COMMAND: pcmd = (void *)data; cuse_server_lock(pcs); while ((pccmd = TAILQ_FIRST(&pcs->head)) == NULL) { error = cv_wait_sig(&pcs->cv, &pcs->mtx); if (pcs->is_closing) error = ENXIO; if (error) { cuse_server_unlock(pcs); return (error); } } TAILQ_REMOVE(&pcs->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; pccmd->entered = curthread; *pcmd = pccmd->sub; cuse_server_unlock(pcs); break; case CUSE_IOCTL_SYNC_COMMAND: cuse_server_lock(pcs); while ((pccmd = cuse_server_find_command(pcs, curthread)) != NULL) { /* send sync command */ pccmd->entered = NULL; pccmd->error = *(int *)data; pccmd->command = CUSE_CMD_SYNC; /* signal peer, if any */ cv_signal(&pccmd->cv); } cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_UNIT: cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, CUSE_ID_DEFAULT(0)); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_ALLOC_UNIT_BY_ID: n = *(int *)data; n = (n & CUSE_ID_MASK); cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_FREE_UNIT: n = *(int *)data; n = CUSE_ID_DEFAULT(n); cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_FREE_UNIT_BY_ID: n = *(int *)data; cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } if (pai->page_count > CUSE_ALLOC_PAGES_MAX) { error = ENOMEM; break; } error = cuse_server_alloc_memory(pcs, pai->alloc_nr, pai->page_count); break; case CUSE_IOCTL_FREE_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } error = cuse_server_free_memory(pcs, pai->alloc_nr); break; case CUSE_IOCTL_GET_SIG: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { n = pccmd->got_signal; pccmd->got_signal = 0; } else { n = 0; } cuse_server_unlock(pcs); *(int *)data = n; break; case CUSE_IOCTL_SET_PFH: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { pcc = pccmd->client; for (n = 0; n != CUSE_CMD_MAX; n++) { pcc->cmds[n].sub.per_file_handle = *(uintptr_t *)data; } } else { error = ENXIO; } cuse_server_unlock(pcs); break; case CUSE_IOCTL_CREATE_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; pcd = (void *)data; /* filter input */ pcd->devname[sizeof(pcd->devname) - 1] = 0; if (pcd->devname[0] == 0) { error = EINVAL; break; } cuse_str_filter(pcd->devname); pcd->permissions &= 0777; /* try to allocate a character device */ pcsd = malloc(sizeof(*pcsd), M_CUSE, M_WAITOK | M_ZERO); pcsd->server = pcs; pcsd->user_dev = pcd->dev; pcsd->kern_dev = make_dev_credf(MAKEDEV_CHECKNAME, &cuse_client_devsw, 0, NULL, pcd->user_id, pcd->group_id, pcd->permissions, "%s", pcd->devname); if (pcsd->kern_dev == NULL) { free(pcsd, M_CUSE); error = ENOMEM; break; } 
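/* Link the new character device back to its server descriptor so that cuse_client_open() can find the owning server through si_drv1. */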
pcsd->kern_dev->si_drv1 = pcsd; cuse_server_lock(pcs); TAILQ_INSERT_TAIL(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); break; case CUSE_IOCTL_DESTROY_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; cuse_server_lock(pcs); error = EINVAL; pcsd = TAILQ_FIRST(&pcs->hdev); while (pcsd != NULL) { if (pcsd->user_dev == *(struct cuse_dev **)data) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); error = 0; pcsd = TAILQ_FIRST(&pcs->hdev); } else { pcsd = TAILQ_NEXT(pcsd, entry); } } cuse_server_unlock(pcs); break; case CUSE_IOCTL_WRITE_DATA: case CUSE_IOCTL_READ_DATA: cuse_server_lock(pcs); pchk = (struct cuse_data_chunk *)data; pccmd = cuse_server_find_command(pcs, curthread); if (pccmd == NULL) { error = ENXIO; /* invalid request */ } else if (pchk->peer_ptr < CUSE_BUF_MIN_PTR) { error = EFAULT; /* NULL pointer */ } else if (pchk->length == 0) { /* NOP */ } else if (pchk->peer_ptr < CUSE_BUF_MAX_PTR) { error = cuse_server_ioctl_copy_locked(pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } else { error = cuse_server_data_copy_optimized_locked( pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } /* * Sometimes the functions above drop the server lock * early as an optimization: */ if (cuse_server_is_locked(pcs)) cuse_server_unlock(pcs); break; case CUSE_IOCTL_SELWAKEUP: cuse_server_lock(pcs); /* * We don't know which direction caused the event. * Wakeup both! */ cuse_server_wakeup_all_client_locked(pcs); cuse_server_unlock(pcs); break; default: error = ENXIO; break; } return (error); } static int cuse_server_poll(struct cdev *dev, int events, struct thread *td) { return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_common_mmap_single(struct cuse_server *pcs, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object) { struct cuse_memory *mem; int error; /* verify size */ if ((size % PAGE_SIZE) != 0 || (size < PAGE_SIZE)) return (EINVAL); cuse_server_lock(pcs); error = ENOMEM; /* lookup memory structure, if any */ TAILQ_FOREACH(mem, &pcs->hmem, entry) { vm_ooffset_t min_off; vm_ooffset_t max_off; min_off = (mem->alloc_nr << CUSE_ALLOC_UNIT_SHIFT); max_off = min_off + (PAGE_SIZE * mem->page_count); if (*offset >= min_off && *offset < max_off) { /* range check size */ if (size > (max_off - *offset)) { error = EINVAL; } else { /* get new VM object offset to use */ *offset -= min_off; vm_object_reference(mem->object); *object = mem->object; error = 0; } break; } } cuse_server_unlock(pcs); return (error); } static int cuse_server_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); return (cuse_common_mmap_single(pcs, offset, size, object)); } /*------------------------------------------------------------------------* * CUSE CLIENT PART *------------------------------------------------------------------------*/ static void cuse_client_free(void *arg) { struct cuse_client *pcc = arg; struct cuse_client_command *pccmd; struct cuse_server *pcs; int n; pcs = pcc->server; cuse_server_lock(pcs); cuse_client_is_closing(pcc); TAILQ_REMOVE(&pcs->hcli, pcc, entry); cuse_server_unlock(pcs); for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; sx_destroy(&pccmd->sx); cv_destroy(&pccmd->cv); } free(pcc, M_CUSE); /* drop reference on server */ cuse_server_unref(pcs); } static int cuse_client_open(struct cdev *dev, int 
fflags, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_server_dev *pcsd; struct cuse_client *pcc; struct cuse_server *pcs; struct cuse_dev *pcd; int error; int n; pcsd = dev->si_drv1; if (pcsd != NULL) { pcs = pcsd->server; pcd = pcsd->user_dev; cuse_server_lock(pcs); /* * Check that the refcount didn't wrap and that the * same process is not both client and server. This * can easily lead to deadlocks when destroying the * CUSE character device nodes: */ pcs->refs++; if (pcs->refs < 0 || pcs->pid == curproc->p_pid) { /* overflow or wrong PID */ pcs->refs--; cuse_server_unlock(pcs); return (EINVAL); } cuse_server_unlock(pcs); } else { return (EINVAL); } pcc = malloc(sizeof(*pcc), M_CUSE, M_WAITOK | M_ZERO); if (devfs_set_cdevpriv(pcc, &cuse_client_free)) { printf("Cuse: Cannot set cdevpriv.\n"); /* drop reference on server */ cuse_server_unref(pcs); free(pcc, M_CUSE); return (ENOMEM); } pcc->fflags = fflags; pcc->server_dev = pcsd; pcc->server = pcs; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; pccmd->sub.dev = pcd; pccmd->sub.command = n; pccmd->client = pcc; sx_init(&pccmd->sx, "cuse-client-sx"); cv_init(&pccmd->cv, "cuse-client-cv"); } cuse_server_lock(pcs); /* cuse_client_free() assumes that the client is listed somewhere! */ /* always enqueue */ TAILQ_INSERT_TAIL(&pcs->hcli, pcc, entry); /* check if server is closing */ if ((pcs->is_closing != 0) || (dev->si_drv1 == NULL)) { error = EINVAL; } else { error = 0; } cuse_server_unlock(pcs); if (error) { devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } pccmd = &pcc->cmds[CUSE_CMD_OPEN]; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } cuse_cmd_unlock(pccmd); if (error) devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } static int cuse_client_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (0); pccmd = &pcc->cmds[CUSE_CMD_CLOSE]; pcs = pcc->server; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_cmd_unlock(pccmd); cuse_client_is_closing(pcc); cuse_server_unlock(pcs); return (0); } static void cuse_client_kqfilter_poll(struct cdev *dev, struct cuse_client *pcc) { struct cuse_server *pcs = pcc->server; int temp; cuse_server_lock(pcs); temp = (pcc->cflags & (CUSE_CLI_KNOTE_HAS_READ | CUSE_CLI_KNOTE_HAS_WRITE)); pcc->cflags &= ~(CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); cuse_server_unlock(pcs); if (temp != 0) { /* get the latest polling state from the server */ temp = cuse_client_poll(dev, POLLIN | POLLOUT, NULL); if (temp & (POLLIN | POLLOUT)) { cuse_server_lock(pcs); if (temp & POLLIN) pcc->cflags |= CUSE_CLI_KNOTE_NEED_READ; if (temp & POLLOUT) pcc->cflags |= CUSE_CLI_KNOTE_NEED_WRITE; /* make sure the "knote" gets woken up */ cuse_server_wakeup_locked(pcc->server); cuse_server_unlock(pcs); } } } static int cuse_client_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int temp; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = 
&pcc->cmds[CUSE_CMD_READ]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; cuse_server_lock(pcs); if (len <= CUSE_COPY_BUFFER_MAX) { /* set read buffer region for small reads */ pcc->read_base = (uintptr_t)uio->uio_iov->iov_base; pcc->read_length = len; } cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); /* * After finishing reading data, disable the read * region for the cuse_server_data_copy_optimized_locked() * function: */ pcc->read_base = 0; pcc->read_length = 0; cuse_server_unlock(pcs); /* * The return value indicates the read length, when * not negative. Range check it just in case to avoid * passing invalid length values to uiomove(). */ if (error > len) { error = ERANGE; break; } else if (error > 0 && len <= CUSE_COPY_BUFFER_MAX) { temp = copyout(pcc->read_buffer, uio->uio_iov->iov_base, error); if (temp != 0) { error = temp; break; } } if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */ if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = &pcc->cmds[CUSE_CMD_WRITE]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; if (len <= CUSE_COPY_BUFFER_MAX) { error = copyin(uio->uio_iov->iov_base, pcc->write_buffer, len); if (error != 0) break; } cuse_server_lock(pcs); if (len <= CUSE_COPY_BUFFER_MAX) { /* set write buffer region for small writes */ pcc->write_base = (uintptr_t)uio->uio_iov->iov_base; pcc->write_length = len; } cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); /* * After finishing writing data, disable the write * region for the cuse_server_data_copy_optimized_locked() * function: */ pcc->write_base = 0; pcc->write_length = 0; cuse_server_unlock(pcs); /* * The return value indicates the write length, when * not negative. Range check it just in case to avoid * passing invalid length values to uiomove(). 
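* A negative return value is a CUSE error code and is converted below; only a full-length transfer keeps the copy loop going.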
*/ if (error > len) { error = ERANGE; break; } else if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); /* restore segment flag */ uio->uio_segflg = UIO_USERSPACE; if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } int cuse_client_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); len = IOCPARM_LEN(cmd); if (len > CUSE_BUFFER_MAX) return (ENOMEM); pccmd = &pcc->cmds[CUSE_CMD_IOCTL]; pcs = pcc->server; cuse_cmd_lock(pccmd); if (cmd & (IOC_IN | IOC_VOID)) memcpy(pcc->ioctl_buffer, data, len); /* * When the ioctl-length is zero drivers can pass information * through the data pointer of the ioctl. Make sure this information * is forwarded to the driver. */ cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, (len == 0) ? *(long *)data : CUSE_BUF_MIN_PTR, (unsigned long)cmd, pcc->fflags, (fflag & O_NONBLOCK) ? IO_NDELAY : 0); error = cuse_client_receive_command_locked(pccmd, data, len); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } if (cmd & IOC_OUT) memcpy(data, pcc->ioctl_buffer, len); cuse_cmd_unlock(pccmd); if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_poll(struct cdev *dev, int events, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; unsigned long temp; int error; int revents; error = cuse_client_get(&pcc); if (error != 0) goto pollnval; temp = 0; pcs = pcc->server; if (events & (POLLPRI | POLLIN | POLLRDNORM)) temp |= CUSE_POLL_READ; if (events & (POLLOUT | POLLWRNORM)) temp |= CUSE_POLL_WRITE; if (events & POLLHUP) temp |= CUSE_POLL_ERROR; pccmd = &pcc->cmds[CUSE_CMD_POLL]; cuse_cmd_lock(pccmd); /* Need to selrecord() first to not loose any events. 
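* (register with the server's selinfo before asking the userspace driver, so a wakeup arriving in between is not missed)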
*/ if (temp != 0 && td != NULL) selrecord(td, &pcs->selinfo); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, temp, pcc->fflags, IO_NDELAY); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); cuse_cmd_unlock(pccmd); if (error < 0) { goto pollnval; } else { revents = 0; if (error & CUSE_POLL_READ) revents |= (events & (POLLPRI | POLLIN | POLLRDNORM)); if (error & CUSE_POLL_WRITE) revents |= (events & (POLLOUT | POLLWRNORM)); if (error & CUSE_POLL_ERROR) revents |= (events & POLLHUP); } return (revents); pollnval: /* XXX many clients don't understand POLLNVAL */ return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_client_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { struct cuse_client *pcc; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); return (cuse_common_mmap_single(pcc->server, offset, size, object)); } static void cuse_client_kqfilter_read_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static void cuse_client_kqfilter_write_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static int cuse_client_kqfilter_read_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_READ) ? 1 : 0); } static int cuse_client_kqfilter_write_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_WRITE) ? 1 : 0); } static int cuse_client_kqfilter(struct cdev *dev, struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); pcs = pcc->server; cuse_server_lock(pcs); switch (kn->kn_filter) { case EVFILT_READ: pcc->cflags |= CUSE_CLI_KNOTE_HAS_READ; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_read_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; case EVFILT_WRITE: pcc->cflags |= CUSE_CLI_KNOTE_HAS_WRITE; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_write_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; default: error = EINVAL; break; } cuse_server_unlock(pcs); if (error == 0) cuse_client_kqfilter_poll(dev, pcc); return (error); } diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 9dcf3b235feb..7d17362df05e 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,2143 +1,2143 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; -static struct fileops devfs_ops_f; +static const struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. 
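* With vfs.devfs.dotimes left at 0 the timestamp is only rewritten when the cached second changes, which keeps the common case cheap.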
*/ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*dswp == NULL || *devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } int devfs_foreach_cdevpriv(struct cdev *dev, int (*cb)(void *data, void *arg), void *arg) { struct cdev_priv *cdp; struct cdev_privdata *p; int error; cdp = cdev2priv(dev); error = 0; mtx_lock(&cdevpriv_mtx); LIST_FOREACH(p, &cdp->cdp_fdpriv, cdpd_list) { error = cb(p->cdpd_data, arg); if (error != 0) break; } mtx_unlock(&cdevpriv_mtx); return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } static void devfs_usecount_add(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); if (VN_IS_DOOMED(vp)) { goto out_unlock; } de = vp->v_data; dev = vp->v_rdev; MPASS(de != NULL); MPASS(dev != NULL); dev->si_usecount++; de->de_usecount++; out_unlock: VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static void devfs_usecount_subl(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); de = vp->v_data; dev = vp->v_rdev; if (de == NULL) return; if (dev == NULL) { MPASS(de->de_usecount == 0); return; } if (dev->si_usecount < de->de_usecount) panic("%s: si_usecount underflow for dev %p " "(has %ld, dirent has %d)\n", __func__, dev, dev->si_usecount, de->de_usecount); if (VN_IS_DOOMED(vp)) { dev->si_usecount -= de->de_usecount; de->de_usecount = 0; } else { if (de->de_usecount == 0) 
panic("%s: de_usecount underflow for dev %p\n", __func__, dev); dev->si_usecount--; de->de_usecount--; } } static void devfs_usecount_sub(struct vnode *vp) { mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static int devfs_usecountl(struct vnode *vp) { VNPASS(vp->v_type == VCHR, vp); mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); return (vp->v_rdev->si_usecount); } int devfs_usecount(struct vnode *vp) { int count; VNPASS(vp->v_type == VCHR, vp); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); count = devfs_usecountl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); return (count); } void devfs_ctty_ref(struct vnode *vp) { vrefact(vp); devfs_usecount_add(vp); } void devfs_ctty_unref(struct vnode *vp) { devfs_usecount_sub(vp); vrele(vp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); if (!devfs_populate_needed(dmp)) { sx_xlock(&dmp->dm_lock); goto out_nopopulate; } locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } out_nopopulate: if (VN_IS_DOOMED(vp)) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VDIR) { error = ENOENT; goto finished; } dd = vp->v_data; if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i = *buflen; i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. 
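* The name is assembled backwards from the end of the caller-supplied buffer; NULL is returned when it does not fit.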
*/ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; enum vgetstate vs; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vs = vget_prep(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget_finish(vp, lockmode | LK_RETRY, vs); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; VNPASS(vp->v_usecount == 1, vp); /* Special casing of ttys for deadfs. Probably redundant. 
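* (the check below merely sets VV_ISTTY when the driver advertises D_TTY)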
*/ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp); if (error != 0) { mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vgone(vp); vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); vn_set_state(vp, VSTATE_CONSTRUCTED); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; struct devfs_dirent *de = vp->v_data; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (de->de_usecount == 2 && td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) devfs_ctty_unref(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. 
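* The FLASTCLOSE and FREVOKE flags computed below tell d_close() whether this is the last, respectively a forced, close.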
*/ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 1) dflags |= FLASTCLOSE; devfs_usecount_subl(vp); mtx_unlock(&devfs_de_interlock); if (VN_IS_DOOMED(vp)) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if ((dflags & FLASTCLOSE) == 0) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdnz(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct 
vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct session *sess; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* * Do nothing if reassigning same control tty, or if the * control tty has already disappeared. If it disappeared, * it's because we were racing with TIOCNOTTY. TIOCNOTTY * already took care of releasing the old vnode and we have * nothing left to do. */ sx_slock(&proctree_lock); sess = td->td_proc->p_session; if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } devfs_ctty_ref(vp); SESS_LOCK(sess); vpold = sess->s_ttyvp; sess->s_ttyvp = vp; sess->s_ttydp = cdev2priv(dev); SESS_UNLOCK(sess); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) devfs_ctty_unref(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = 
devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. 
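 * Even for VCHR the only thing we can do is resurrect an existing
 * entry that was hidden with DE_WHITEOUT by an earlier remove; devfs
 * never creates new device nodes on behalf of userland.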
*/ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } if (vp->v_type == VCHR) devfs_usecount_add(vp); vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); if (error != 0 && vp->v_type == VCHR) devfs_usecount_sub(vp); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistent label storage. 
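 * (The label is kept in the in-memory dirent only, so it does not
 * survive unmount or reboot.)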
*/ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. 
*/ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static void devfs_reclaiml(struct vnode *vp) { struct devfs_dirent *de; mtx_assert(&devfs_de_interlock, MA_OWNED); de = vp->v_data; if (de != NULL) { MPASS(de->de_usecount == 0); de->de_vnode = NULL; vp->v_data = NULL; } } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp); if (dvp != vp) VOP_UNLOCK(dvp); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. 
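 * Revocation applies to every alias of the underlying cdev: the loop
 * below walks cdp_dirents[] and vgone()s each vnode still attached to
 * one of them.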
* */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; enum vgetstate vs; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); vs = vget_prep(vp2); mtx_unlock(&devfs_de_interlock); if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { KASSERT((cdp->cdp_flags & CDP_ON_ACTIVE_LIST) != 0, ("%s: cdp %p (%s) not on active list", __func__, cdp, dev->si_name)); cdp->cdp_flags &= ~CDP_ON_ACTIVE_LIST; TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if 
(vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred) { return (vnops.fo_stat(fp, sb, cred)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." 
*/ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. 
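 * (vm_mmap_cdev() rejects MAP_PRIVATE for ordinary devices, since
 * device memory is shared and cannot be made copy-on-write.)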
*/ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static int devfs_cmp_f(struct file *fp1, struct file *fp2, struct thread *td) { if (fp2->f_type != DTYPE_VNODE || fp2->f_ops != &devfs_ops_f) return (3); return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); } -static struct fileops devfs_ops_f = { +static const struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_cmp = devfs_cmp_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_vnodeops); /* Vops for VCHR vnodes in /dev. 
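 * Namespace and block-I/O operations that make no sense on a
 * character device node are wired to VOP_PANIC; they should be
 * unreachable for a VCHR vnode.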
*/ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_specops); /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index 88ebe702ec0a..5df9be59ce36 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -1,610 +1,610 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. 
* * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include #include SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); static struct cdev *fuse_dev; static d_kqfilter_t fuse_device_filter; static d_open_t fuse_device_open; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { .d_kqfilter = fuse_device_filter, .d_open = fuse_device_open, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, .d_write = fuse_device_write, .d_version = D_VERSION, }; static int fuse_device_filt_read(struct knote *kn, long hint); static int fuse_device_filt_write(struct knote *kn, long hint); static void fuse_device_filt_detach(struct knote *kn); -struct filterops fuse_device_rfiltops = { +static const struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, }; -struct filterops fuse_device_wfiltops = { +static const struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, }; /**************************** * * >>> Fuse device op defs * ****************************/ static void fdata_dtor(void *arg) { struct fuse_data *fdata; struct fuse_ticket *tick; fdata = arg; if (fdata == NULL) return; fdata_set_dead(fdata); FUSE_LOCK(); fuse_lck_mtx_lock(fdata->aw_mtx); /* wakup poll()ers */ selwakeuppri(&fdata->ks_rsel, PZERO + 1); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(fdata))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); fticket_set_answered(tick); tick->tk_aw_errno = ENOTCONN; wakeup(tick); fuse_lck_mtx_unlock(tick->tk_aw_mtx); FUSE_ASSERT_AW_DONE(tick); fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->aw_mtx); /* Cleanup unsent operations */ fuse_lck_mtx_lock(fdata->ms_mtx); while ((tick = fuse_ms_pop(fdata))) { fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->ms_mtx); FUSE_UNLOCK(); fdata_trydestroy(fdata); } static int fuse_device_filter(struct cdev *dev, struct knote *kn) { struct fuse_data *data; int error; error = devfs_get_cdevpriv((void **)&data); if (error == 0 && kn->kn_filter == EVFILT_READ) { kn->kn_fop = &fuse_device_rfiltops; kn->kn_hook = data; knlist_add(&data->ks_rsel.si_note, kn, 0); error = 0; } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &fuse_device_wfiltops; error = 0; } else if (error == 0) { error = EINVAL; kn->kn_data = error; } return (error); } static void fuse_device_filt_detach(struct knote *kn) { struct fuse_data *data; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); knlist_remove(&data->ks_rsel.si_note, kn, 0); kn->kn_hook = NULL; } static int fuse_device_filt_read(struct knote *kn, long hint) { struct fuse_data *data; int ready; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); mtx_assert(&data->ms_mtx, MA_OWNED); if (fdata_get_dead(data)) { kn->kn_flags |= EV_EOF; kn->kn_fflags = ENODEV; kn->kn_data = 1; ready = 1; } else if (STAILQ_FIRST(&data->ms_head)) { MPASS(data->ms_count >= 1); kn->kn_data = data->ms_count; ready = 1; } else { ready = 0; } return (ready); } static int fuse_device_filt_write(struct knote *kn, long hint) { kn->kn_data = 0; /* The device is always ready to write, so we return 1*/ return (1); } /* * Resources are set up on a per-open basis */ static int fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct fuse_data *fdata; int error; SDT_PROBE2(fusefs, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error 
= devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); return (error); } int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { struct fuse_data *data; int error, revents = 0; error = devfs_get_cdevpriv((void **)&data); if (error != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { fuse_lck_mtx_lock(data->ms_mtx); if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &data->ks_rsel); fuse_lck_mtx_unlock(data->ms_mtx); } if (events & (POLLOUT | POLLWRNORM)) { revents |= events & (POLLOUT | POLLWRNORM); } return (revents); } /* * fuse_device_read hangs on the queue of VFS messages. * When it's notified that there is a new one, it picks that and * passes up to the daemon */ int fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) { int err; struct fuse_data *data; struct fuse_ticket *tick; void *buf; int buflen; SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { SDT_PROBE2(fusefs, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); return (ENODEV); } if (!(tick = fuse_ms_pop(data))) { /* check if we may block */ if (ioflag & O_NONBLOCK) { /* get outa here soon */ fuse_lck_mtx_unlock(data->ms_mtx); return (EAGAIN); } else { err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); if (err != 0) { fuse_lck_mtx_unlock(data->ms_mtx); return (fdata_get_dead(data) ? ENODEV : err); } tick = fuse_ms_pop(data); } } if (!tick) { /* * We can get here if fuse daemon suddenly terminates, * eg, by being hit by a SIGKILL * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) */ SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); if (fdata_get_dead(data)) { /* * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ SDT_PROBE2(fusefs, , device, trace, 2, "reader is to be sacked"); if (tick) { SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); } return (ENODEV); /* This should make the daemon get off * of us */ } SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); buf = tick->tk_ms_fiov.base; buflen = tick->tk_ms_fiov.len; /* * Why not ban mercilessly stupid daemons who can't keep up * with us? (There is no much use of a partial read here...) */ /* * XXX note that in such cases Linux FUSE throws EIO at the * syscall invoker and stands back to the message queue. The * rationale should be made clear (and possibly adopt that * behaviour). Keeping the current scheme at least makes * fallacy as loud as possible... 
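 * (A well-behaved daemon always reads with a buffer at least as large
 * as the negotiated maximum request size, so the short-read case below
 * should not occur in practice.)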
*/ if (uio->uio_resid < buflen) { fdata_set_dead(data); SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; } else { err = uiomove(buf, buflen, uio); } FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); return (err); } static inline int fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: body size " "differs from size claimed by header"); return (EINVAL); } if (uio->uio_resid && ohead->unique != 0 && ohead->error) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } return (0); } SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, "struct fuse_out_header*"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, "uint64_t"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. * If the callback node contains a handler function, the uio is passed over * that. */ static int fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) { struct fuse_out_header ohead; int err = 0; struct fuse_data *data; struct mount *mp; struct fuse_ticket *tick, *itick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); } if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) return (err); if (data->linux_errnos != 0 && ohead.error != 0) { err = -ohead.error; if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) return (EINVAL); /* '-', because it will get flipped again below */ ohead.error = -linux_to_bsd_errtbl[err]; } /* * We check header information (which is redundant) and compare it * with what we see. If we see some inconsistency we discard the * whole answer and proceed on as if it had never existed. In * particular, no pretender will be woken up, regardless the * "unique" value in the header. */ if ((err = fuse_ohead_audit(&ohead, uio))) { fdata_set_dead(data); return (err); } /* Pass stuff over to callback if there is one installed */ /* Looking for ticket with the unique id of header */ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { if (tick->tk_unique == ohead.unique) { SDT_PROBE1(fusefs, , device, fuse_device_write_found, tick); found = 1; fuse_aw_remove(tick); break; } } if (found && tick->irq_unique > 0) { /* * Discard the FUSE_INTERRUPT ticket that tried to interrupt * this operation */ TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, x_tick) { if (itick->tk_unique == tick->irq_unique) { fuse_aw_remove(itick); fuse_ticket_drop(itick); break; } } tick->irq_unique = 0; } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { if (tick->tk_aw_handler) { /* * We found a callback with proper handler. In this * case the out header will be 0wnd by the callback, * so the fun of freeing that is left for her. * (Then, by all chance, she'll just get that's done * via ticket_drop(), so no manual mucking * around...) 
*/ SDT_PROBE2(fusefs, , device, trace, 1, "pass ticket to a callback"); /* Sanitize the linuxism of negative errnos */ ohead.error *= -1; if (ohead.error < 0 || ohead.error > ELAST) { /* Illegal error code */ ohead.error = EIO; memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); tick->tk_aw_handler(tick, uio); err = EINVAL; } else { memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } } else { /* pretender doesn't wanna do anything with answer */ SDT_PROBE2(fusefs, , device, trace, 1, "stuff devalidated, so we drop it"); } /* * As aw_mtx was not held during the callback execution the * ticket may have been inserted again. However, this is safe * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); } else if (ohead.unique == 0){ /* unique == 0 means asynchronous notification */ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); switch (ohead.error) { case FUSE_NOTIFY_INVAL_ENTRY: err = fuse_internal_invalidate_entry(mp, uio); break; case FUSE_NOTIFY_INVAL_INODE: err = fuse_internal_invalidate_inode(mp, uio); break; case FUSE_NOTIFY_RETRIEVE: case FUSE_NOTIFY_STORE: /* * Unimplemented. I don't know of any file systems * that use them, and the protocol isn't sound anyway, * since the notification messages don't include the * inode's generation number. Without that, it's * possible to manipulate the cache of the wrong vnode. * Finally, it's not defined what this message should * do for a file with dirty cache. */ case FUSE_NOTIFY_POLL: /* Unimplemented. See comments in fuse_vnops */ default: /* Not implemented */ err = ENOSYS; } } else { /* no callback at all! */ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, ohead.unique); if (ohead.error == -EAGAIN) { /* * This was probably a response to a FUSE_INTERRUPT * operation whose original operation is already * complete. We can't store FUSE_INTERRUPT tickets * indefinitely because their responses are optional. * So we delete them when the original operation * completes. And sadly the fuse_header_out doesn't * identify the opcode, so we have to guess. */ err = 0; } else { err = EINVAL; } } return (err); } int fuse_device_init(void) { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); } void fuse_device_destroy(void) { MPASS(fuse_dev != NULL); destroy_dev(fuse_dev); } diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index f12236264d19..0b5cfdf77149 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -1,911 +1,911 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; u_int sc_active; struct selinfo sc_selinfo; #define SC_A_DESTROY (1 << 31) #define SC_A_OPEN (1 << 30) #define SC_A_ACTIVE (SC_A_OPEN - 1) }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static d_kqfilter_t g_dev_kqfilter; static void gdev_filter_detach(struct knote *kn); static int gdev_filter_vnode(struct knote *kn, long hint); -static struct filterops gdev_filterops_vnode = { +static const struct filterops gdev_filterops_vnode = { .f_isfd = 1, .f_detach = gdev_filter_detach, .f_event = gdev_filter_vnode, }; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, .d_kqfilter = g_dev_kqfilter, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static g_resize_t g_dev_resize; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged, .resize = g_dev_resize }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. */ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. 
(0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct diocskerneldump_arg *kda) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; MPASS(dev != NULL && kda != NULL); MPASS(kda->kda_index != KDA_REMOVE); cp = dev->si_drv2; len = sizeof(kd); memset(&kd, 0, len); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error != 0) return (error); error = dumper_insert(&kd.di, devtoname(dev), kda); if (error == 0) dev->si_flags |= SI_DUMPDEV; return (error); } static int init_dumpdev(struct cdev *dev) { struct diocskerneldump_arg kda; struct g_consumer *cp; const char *devprefix = _PATH_DEV, *devname; int error; size_t len; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_APPEND; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, &kda); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify("GEOM", "DEV", "DESTROY", buf); knlist_clear(&sc->sc_selinfo.si_note, 0); knlist_destroy(&sc->sc_selinfo.si_note); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_set_physpath(struct g_consumer *cp) { struct g_dev_softc *sc; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; sc = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *dev, *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK | MAKEDEV_CHECKNAME, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } static void g_dev_set_media(struct g_consumer *cp) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf); devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf); devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf); } } static void g_dev_attrchanged(struct g_consumer *cp, const char 
*attr) { if (strcmp(attr, "GEOM::media") == 0) { g_dev_set_media(cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { g_dev_set_physpath(cp); return; } } static void g_dev_resize(struct g_consumer *cp) { struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; sc = cp->private; KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_ATTRIB); snprintf(buf, sizeof(buf), "cdev=%s", cp->provider->name); devctl_notify("GEOM", "DEV", "SIZECHANGE", buf); } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_geom_alias *gap; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev, *adev; char buf[SPECNAMELEN + 6]; struct make_dev_args args; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { printf("%s: g_dev_taste(%s) failed to g_attach, error=%d\n", __func__, pp->name, error); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &g_dev_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv1 = sc; args.mda_si_drv2 = cp; error = make_dev_s(&args, &sc->sc_dev, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev = sc->sc_dev; dev->si_flags |= SI_UNMAPPED; dev->si_iosize_max = maxphys; knlist_init_mtx(&sc->sc_selinfo.si_note, &sc->sc_mtx); error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify("GEOM", "DEV", "CREATE", buf); /* * Now add all the aliases for this drive */ LIST_FOREACH(gap, &pp->aliases, ga_next) { error = make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &adev, dev, "%s", gap->ga_alias); if (error) { printf("%s: make_dev_alias_p() failed (name=%s, error=%d)\n", __func__, gap->ga_alias, error); continue; } snprintf(buf, sizeof(buf), "cdev=%s", gap->ga_alias); devctl_notify("GEOM", "DEV", "CREATE", buf); } return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 1 : 0; #else e = 0; #endif /* * This happens on attempt to open a device node with O_EXEC. */ if (r + w + e == 0) return (EINVAL); if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. 
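 * ("Very secure" here means securelevel >= 2, as checked with
 * securelevel_ge() below.)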
*/ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); if (error == 0) { sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); if (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) wakeup(&sc->sc_active); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); KNOTE_LOCKED(&sc->sc_selinfo.si_note, NOTE_OPEN); mtx_unlock(&sc->sc_mtx); } return (error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif /* * The vgonel(9) - caused by eg. forced unmount of devfs - calls * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags, * which would result in zero deltas, which in turn would cause * panic in g_access(9). * * Note that we cannot zero the counters (ie. do "r = cp->acr" * etc) instead, because the consumer might be opened in another * devfs instance. */ if (r + w + e == 0) return (EINVAL); sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); while (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) msleep(&sc->sc_active, &sc->sc_mtx, 0, "g_dev_close", hz / 10); KNOTE_LOCKED(&sc->sc_selinfo.si_note, NOTE_CLOSE | (w ? NOTE_CLOSE_WRITE : 0)); mtx_unlock(&sc->sc_mtx); g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); return (error); } static int g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_provider *pp; off_t offset, length, chunk, odd; int i, error; cp = dev->si_drv2; pp = cp->provider; /* If consumer or provider is dying, don't disturb. 
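 * (An orphaned consumer yields ENXIO and a provider error is propagated
 * unchanged by the checks that follow.)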
*/ if (cp->flags & G_CF_ORPHAN) return (ENXIO); if (pp->error) return (pp->error); error = 0; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = pp->sectorsize; if (*(u_int *)data == 0) error = ENOENT; break; case DIOCGMEDIASIZE: *(off_t *)data = pp->mediasize; if (*(off_t *)data == 0) error = ENOENT; break; case DIOCGFWSECTORS: error = g_io_getattr("GEOM::fwsectors", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFWHEADS: error = g_io_getattr("GEOM::fwheads", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCSKERNELDUMP: { struct diocskerneldump_arg *kda; uint8_t *encryptedkey; kda = (struct diocskerneldump_arg *)data; if (kda->kda_index == KDA_REMOVE_ALL || kda->kda_index == KDA_REMOVE_DEV || kda->kda_index == KDA_REMOVE) { error = dumper_remove(devtoname(dev), kda); explicit_bzero(kda, sizeof(*kda)); break; } if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { if (kda->kda_encryptedkeysize == 0 || kda->kda_encryptedkeysize > KERNELDUMP_ENCKEY_MAX_SIZE) { explicit_bzero(kda, sizeof(*kda)); return (EINVAL); } encryptedkey = malloc(kda->kda_encryptedkeysize, M_TEMP, M_WAITOK); error = copyin(kda->kda_encryptedkey, encryptedkey, kda->kda_encryptedkeysize); } else { encryptedkey = NULL; } if (error == 0) { kda->kda_encryptedkey = encryptedkey; error = g_dev_setdumpdev(dev, kda); } zfree(encryptedkey, M_TEMP); explicit_bzero(kda, sizeof(*kda)); break; } case DIOCGFLUSH: error = g_io_flush(cp); break; case DIOCGDELETE: offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % pp->sectorsize) != 0 || (length % pp->sectorsize) != 0 || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } while (length > 0) { chunk = length; if (g_dev_del_max_sectors != 0 && chunk > g_dev_del_max_sectors * pp->sectorsize) { chunk = g_dev_del_max_sectors * pp->sectorsize; if (pp->stripesize > 0) { odd = (offset + chunk + pp->stripeoffset) % pp->stripesize; if (chunk > odd) chunk -= odd; } } error = g_delete_data(cp, offset, chunk); length -= chunk; offset += chunk; if (error) break; /* * Since the request size can be large, the service * time can likewise be large. We make this ioctl * interruptible by checking for signals for each bio.
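 * Illustrative numbers (hypothetical, not taken from this file): with a
 * 512-byte sector size and g_dev_del_max_sectors set to 262144, each pass
 * of the loop issues at most a 128 MB BIO_DELETE, trimmed further when
 * stripesize > 0 so the chunk ends on a stripe boundary.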
*/ if (SIGPENDING(td)) break; } break; case DIOCGIDENT: error = g_io_getattr("GEOM::ident", cp, &i, data); break; case DIOCGPROVIDERNAME: strlcpy(data, pp->name, i); break; case DIOCGSTRIPESIZE: *(off_t *)data = pp->stripesize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = pp->stripeoffset; break; case DIOCGPHYSPATH: error = g_io_getattr("GEOM::physpath", cp, &i, data); if (error == 0 && *(char *)data == '\0') error = ENOENT; break; case DIOCGATTR: { struct diocgattr_arg *arg = (struct diocgattr_arg *)data; if (arg->len > sizeof(arg->value)) { error = EINVAL; break; } error = g_io_getattr(arg->name, cp, &arg->len, &arg->value); break; } case DIOCZONECMD: { struct disk_zone_args *zone_args =(struct disk_zone_args *)data; struct disk_zone_rep_entry *new_entries, *old_entries; struct disk_zone_report *rep; size_t alloc_size; old_entries = NULL; new_entries = NULL; rep = NULL; alloc_size = 0; if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) { rep = &zone_args->zone_params.report; #define MAXENTRIES (maxphys / sizeof(struct disk_zone_rep_entry)) if (rep->entries_allocated > MAXENTRIES) rep->entries_allocated = MAXENTRIES; alloc_size = rep->entries_allocated * sizeof(struct disk_zone_rep_entry); if (alloc_size != 0) new_entries = g_malloc(alloc_size, M_WAITOK | M_ZERO); old_entries = rep->entries; rep->entries = new_entries; } error = g_io_zonecmd(zone_args, cp); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES && alloc_size != 0 && error == 0) error = copyout(new_entries, old_entries, alloc_size); if (old_entries != NULL && rep != NULL) rep->entries = old_entries; g_free(new_entries); break; } default: if (pp->geom->ioctl != NULL) { error = pp->geom->ioctl(pp, cmd, data, fflag, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct g_consumer *cp; struct g_dev_softc *sc; struct bio *bp; int active; cp = bp2->bio_from; sc = cp->private; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; bp->bio_completed = bp2->bio_completed; bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_cmd == BIO_ZONE) bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone)); if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { if (bp->bio_cmd == BIO_READ) KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_READ); if (bp->bio_cmd == BIO_WRITE) KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_WRITE); g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); active = atomic_fetchadd_int(&sc->sc_active, -1) - 1; if ((active & SC_A_ACTIVE) == 0) { if ((active & SC_A_OPEN) == 0) wakeup(&sc->sc_active); if (active & SC_A_DESTROY) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); } biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_ZONE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); biotrack(bp, __func__); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif sc = dev->si_drv1; KASSERT(sc->sc_open > 0, ("Closed 
device in g_dev_strategy")); atomic_add_int(&sc->sc_active, 1); for (;;) { /* * XXX: This is not an ideal solution, but I believe it to * XXX: deadlock safely, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int active; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); sc->sc_dev = NULL; sc->sc_alias = NULL; active = atomic_fetchadd_int(&sc->sc_active, SC_A_DESTROY); if ((active & SC_A_ACTIVE) == 0) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so * anything which comes in the interim will be returned immediately. */ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) { struct diocskerneldump_arg kda; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_REMOVE_DEV; (void)dumper_remove(devtoname(dev), &kda); } /* Destroy the struct cdev *so we get no more requests */ delist_dev(dev); destroy_dev_sched_cb(dev, g_dev_callback, cp); } static void gdev_filter_detach(struct knote *kn) { struct g_dev_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->sc_selinfo.si_note, kn, 0); } static int gdev_filter_vnode(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int g_dev_kqfilter(struct cdev *dev, struct knote *kn) { struct g_dev_softc *sc; sc = dev->si_drv1; if (kn->kn_filter != EVFILT_VNODE) return (EINVAL); #define SUPPORTED_EVENTS (NOTE_ATTRIB | NOTE_OPEN | NOTE_CLOSE | \ NOTE_CLOSE_WRITE | NOTE_READ | NOTE_WRITE) if (kn->kn_sfflags & ~SUPPORTED_EVENTS) return (EOPNOTSUPP); kn->kn_fop = &gdev_filterops_vnode; kn->kn_hook = sc; knlist_add(&sc->sc_selinfo.si_note, kn, 0); return (0); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 9036e3a25ab8..61cf7fc845a2 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,5434 +1,5435 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; __read_mostly uma_zone_t pwd_zone; VFS_SMR_DECLARE; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit); static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags); static int fd_first_free(struct filedesc *fdp, int low, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); static struct pwd *pwd_alloc(void); /* * 
Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) #define FILEDESC_FOREACH_FDE(fdp, _iterator, _fde) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL) #define FILEDESC_FOREACH_FP(fdp, _iterator, _fp) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ static int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the last used fd. * * Call this variant if fdp can't be modified by anyone else (e.g, during exec). * Otherwise use fdlastfile. 
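 * Illustrative example (assumes a 64-bit NDSLOTTYPE, i.e. NDENTRIES == 64):
 * with descriptors 0-2 and 70 in use, fd_map[1] == 0x40 and the backward
 * scan returns 1 * NDENTRIES + flsl(0x40) - 1 == 70.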
*/ int fdlastfile_single(struct filedesc *fdp) { NDSLOTTYPE *map = fdp->fd_map; int off, minoff; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } int fdlastfile(struct filedesc *fdp) { FILEDESC_LOCK_ASSERT(fdp); return (fdlastfile_single(fdp)); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd == fdp->fd_freefile) fdp->fd_freefile++; } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif fdefree_last(fde); fdunused(fdp, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = getmaxfd(td); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. 
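 * (struct __oflock has no l_sysid, so it is zeroed below; for F_OGETLK the
 * result is converted back and copied out once kern_fcntl() returns.)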
*/ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; struct mount *mp; struct kinfo_file *kif; int error, flg, kif_sz, seals, tmp, got_set, got_cleared; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; if (fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); got_set = tmp & ~flg; got_cleared = flg & ~tmp; tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) goto revert_f_setfl; tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); revert_f_setfl: do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= got_cleared; tmp &= ~got_set; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error != 0) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: flp = (struct flock *)arg; if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { error = EINVAL; break; } error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. 
* We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp2); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_ADD_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; error = fo_add_seals(fp, arg); fdrop(fp, td); break; case F_GET_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fo_get_seals(fp, &seals) == 0) td->td_retval[0] = seals; else error = EINVAL; fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; if (vp->v_type != VREG) { fdrop(fp, td); error = ENOTTY; break; } /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; arg = MIN(arg, INT_MAX - bsize + 1); fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, (arg + bsize - 1) / bsize); atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp); fdrop(fp, td); break; case F_ISUNIONSTACK: /* * Check if the vnode is part of a union stack (either the * "union" flag from mount(2) or unionfs). * * Prior to introduction of this op libc's readdir would call * fstatfs(2), in effect unnecessarily copying kilobytes of * data just to check fs name and a mount flag. * * Fixing the code to handle everything in the kernel instead * is a non-trivial endeavor and has low priority, thus this * horrible kludge facilitates the current behavior in a much * cheaper manner until someone(tm) sorts this out. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Since we don't prevent dooming the vnode even non-null mp * found can become immediately stale. 
This is tolerable since * mount points are type-stable (providing safe memory access) * and any vfs op on this vnode going forward will return an * error (meaning return value in this case is meaningless). */ mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { fdrop(fp, td); error = EBADF; break; } td->td_retval[0] = 0; if (mp->mnt_kern_flag & MNTK_UNIONFS || mp->mnt_flag & MNT_UNION) td->td_retval[0] = 1; fdrop(fp, td); break; case F_KINFO: #ifdef CAPABILITY_MODE if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SYSCALL, &cmd); if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; break; } #endif error = copyin((void *)arg, &kif_sz, sizeof(kif_sz)); if (error != 0) break; if (kif_sz != sizeof(*kif)) { error = EINVAL; break; } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO); FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL); if (error == 0 && fhold(fp)) { export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0); FILEDESC_SUNLOCK(fdp); fdrop(fp, td); if ((kif->kf_status & KF_ATTR_VALID) != 0) { kif->kf_structsize = sizeof(*kif); error = copyout(kif, (void *)arg, sizeof(*kif)); } else { error = EBADF; } } else { FILEDESC_SUNLOCK(fdp); if (error == 0) error = EBADF; } free(kif, M_TEMP); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp, *oldfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; oioctls = NULL; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_noref(fdp, old) == NULL) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } oldfde = &fdp->fd_ofiles[old]; oldfp = oldfde->fde_file; if (!fhold(oldfp)) goto unlock; /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. 
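 * Illustrative consequence: dup2(0, 1000) on a fresh table grows it to at
 * least 1001 slots via fdgrowtable_exp(), and with RACCT enabled it is that
 * table size, not the number of live descriptors, that is charged against
 * RACCT_NOFILE.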
*/ #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); if (error != 0) { error = EMFILE; fdrop(oldfp, td); goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); /* Refetch oldfde because the table may have grown and old one freed. */ oldfde = &fdp->fd_ofiles[old]; KASSERT(oldfp == oldfde->fde_file, ("fdt_ofiles shift from growth observed at fd %d", old)); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif oioctls = filecaps_free_prep(&newfde->fde_caps); fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, true, false); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } filecaps_free_finish(oioctls); return (error); } static void sigiofree(struct sigio *sigio) { crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } static struct sigio * funsetown_locked(struct sigio *sigio) { struct proc *p; struct pgrp *pg; SIGIO_ASSERT_LOCKED(); if (sigio == NULL) return (NULL); *sigio->sio_myref = NULL; if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { p = sigio->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } return (sigio); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; /* Racy check, consumers must provide synchronization. */ if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = funsetown_locked(*sigiop); SIGIO_UNLOCK(); if (sigio != NULL) sigiofree(sigio); } /* * Free a list of sigio structures. The caller must ensure that new sigio * structures cannot be added after this point. For process groups this is * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves * as an interlock. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio, *tmp; /* Racy check. */ sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; SIGIO_LOCK(); sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) { SIGIO_UNLOCK(); return; } /* * Every entry of the list should belong to a single proc or pgrp. 
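 * (That is why taking a single PGRP_LOCK() or PROC_LOCK() below is enough
 * to walk the entire list before the entries are freed.)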
*/ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK(pg); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK(p); KASSERT((p->p_flag & P_WEXIT) != 0, ("%s: process %p is not exiting", __func__, p)); } SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { *sigio->sio_myref = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); } } if (pg != NULL) PGRP_UNLOCK(pg); else PROC_UNLOCK(p); SIGIO_UNLOCK(); SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) sigiofree(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *osigio, *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; ret = 0; if (pgid > 0) { ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); if (ret == 0) { PROC_LOCK(proc); _PRELE(proc); if ((proc->p_flag & P_WEXIT) != 0) { ret = ESRCH; } else if (proc->p_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_proc = proc; SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); } PROC_UNLOCK(proc); } } else /* if (pgid < 0) */ { sx_slock(&proctree_lock); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; } else { if (pgrp->pg_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_pgrp = pgrp; SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); } PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); } if (ret == 0) *sigiop = sigio; SIGIO_UNLOCK(); if (osigio != NULL) sigiofree(osigio); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } static int closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. 
*/ if (__predict_false(fp->f_type == DTYPE_MQUEUE)) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); #ifdef AUDIT if (AUDITING_TD(td) && audit) audit_sysclose(td, fd, fp); #endif error = closef(fp, td); /* * All paths leading up to closefp() will have already removed or * replaced the fd in the filedesc table, so a restart would not * operate on the same file. */ if (error == ERESTART) error = EINTR; return (error); } static int closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = false; } } error = closefp_impl(fdp, fd, fp, td, audit); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { FILEDESC_XLOCK_ASSERT(fdp); if (__predict_false(td->td_proc->p_fdtol != NULL)) { return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); } else { return (closefp_impl(fdp, fd, fp, td, audit)); } } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, true, true)); } static int close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; struct fdescenttbl *fdt; struct filedescent *fde; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (; fd <= highfd; fd++) { fde = &fdt->fdt_ofiles[fd]; if (fde->fde_file != NULL) fde->fde_flags |= UF_EXCLOSE; } out_locked: FILEDESC_XUNLOCK(fdp); return (0); } static int close_range_impl(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (fp == NULL) { if (fd == highfd) goto out_locked; } else { fdfree(fdp, fd); (void) closefp(fdp, fd, fp, td, true, true); if (fd == highfd) goto out_unlocked; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); } fd++; } out_locked: FILEDESC_XUNLOCK(fdp); out_unlocked: return (0); } int kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd) { /* * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 * open should not be a usage error. From a close_range() perspective, * close_range(3, ~0U, 0) in the same scenario should also likely not * be a usage error as all fd above 3 are in-fact already closed. 
*/ if (highfd < lowfd) { return (EINVAL); } if ((flags & CLOSE_RANGE_CLOEXEC) != 0) return (close_range_cloexec(td, lowfd, highfd)); return (close_range_impl(td, lowfd, highfd)); } #ifndef _SYS_SYSPROTO_H_ struct close_range_args { u_int lowfd; u_int highfd; int flags; }; #endif int sys_close_range(struct thread *td, struct close_range_args *uap) { AUDIT_ARG_FD(uap->lowfd); AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0) return (EINVAL); return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd)); } #ifdef COMPAT_FREEBSD12 /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd12_closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) { u_int lowfd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ lowfd = MAX(0, uap->lowfd); return (kern_close_range(td, 0, lowfd, ~0U)); } #endif /* COMPAT_FREEBSD12 */ #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error != 0) return (error); error = freebsd11_cvtnstat(&ub, &nub); if (error != 0) error = copyout(&nub, uap->sb, sizeof(nub)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. 
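 * For descriptors without a vnode, only pipes and sockets are handled, and
 * only for _PC_PIPE_BUF (answered with PIPE_BUF); other names yield EINVAL
 * and other file types EOPNOTSUPP.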
*/ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (__predict_true(src->fc_ioctls == NULL)) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (__predict_true(src->fc_ioctls == NULL)) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ static void filecaps_free_ioctl(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); fcaps->fc_ioctls = NULL; } void filecaps_free(struct filecaps *fcaps) { filecaps_free_ioctl(fcaps); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. 
*/ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); /* * open calls without WANTIOCTLCAPS free caps but leave the counter */ #if 0 KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); #endif KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Free the old file table when not shared by other threads or processes. * The old file table is considered to be shared when either are true: * - The process has more than one thread. * - The file descriptor table has been shared via fdshare(). * * When shared, the old file table will be placed on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { /* * Note we may be called here from fdinit while allocating a * table for a new process in which case ->p_fd points * elsewhere. 
*/ if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { free(otable, M_FILEDESC); } else { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (__predict_false(fd >= maxfd)) return (EMFILE); if (__predict_false(fd >= fdp->fd_nfiles)) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; MPASS(resultfp != NULL); MPASS(resultfd != NULL); error = _falloc_noinstall(td, &fp, 2); if (__predict_false(error != 0)) { return (error); } error = finstall_refed(td, fp, &fd, flags, fcaps); if (__predict_false(error != 0)) { falloc_abort(td, fp); return (error); } *resultfp = fp; *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. 
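 * The caller supplies n, the initial f_count; falloc_caps() passes 2 so the
 * new file survives even if another thread closes the freshly installed
 * descriptor before the caller has looked at *resultfp.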
*/ int _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); MPASS(n > 0); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK); bzero(fp, sizeof(*fp)); refcount_init(&fp->f_count, n); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } void falloc_abort(struct thread *td, struct file *fp) { /* * For assertion purposes. */ refcount_init(&fp->f_count, 0); _fdrop(fp, td); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif } int finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); error = fdalloc(td, 0, fd); if (__predict_true(error == 0)) { _finstall(fdp, fp, *fd, flags, fcaps); } FILEDESC_XUNLOCK(fdp); return (error); } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { int error; MPASS(fd != NULL); if (!fhold(fp)) return (EBADF); error = finstall_refed(td, fp, fd, flags, fcaps); if (__predict_false(error != 0)) { fdrop(fp, td); } return (error); } /* * Build a new filedesc structure from another. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(void) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; return (newfdp); } /* * Build a pwddesc structure from another. * Copy the current, root, and jail root vnode references. * * If pdp is not NULL, return with it shared locked. */ struct pwddesc * pdinit(struct pwddesc *pdp, bool keeplock) { struct pwddesc *newpdp; struct pwd *newpwd; newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); PWDDESC_LOCK_INIT(newpdp); refcount_init(&newpdp->pd_refcount, 1); newpdp->pd_cmask = CMASK; if (pdp == NULL) { newpwd = pwd_alloc(); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); return (newpdp); } PWDDESC_XLOCK(pdp); newpwd = pwd_hold_pwddesc(pdp); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); if (!keeplock) PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Hold either filedesc or pwddesc of the passed process. 
* * The process lock is used to synchronize against the target exiting and * freeing the data. * * Clearing can be ilustrated in 3 steps: * 1. set the pointer to NULL. Either routine can race against it, hence * atomic_load_ptr. * 2. observe the process lock as not taken. Until then fdhold/pdhold can * race to either still see the pointer or find NULL. It is still safe to * grab a reference as clearing is stalled. * 3. after the lock is observed as not taken, any fdhold/pdhold calls are * guaranteed to see NULL, making it safe to finish clearing */ static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = atomic_load_ptr(&p->p_fd); if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static struct pwddesc * pdhold(struct proc *p) { struct pwddesc *pdp; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = atomic_load_ptr(&p->p_pd); if (pdp != NULL) refcount_acquire(&pdp->pd_refcount); return (pdp); } static void fddrop(struct filedesc *fdp) { if (refcount_load(&fdp->fd_holdcnt) > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } static void pddrop(struct pwddesc *pdp) { struct pwd *pwd; if (refcount_release_if_not_last(&pdp->pd_refcount)) return; PWDDESC_XLOCK(pdp); if (refcount_release(&pdp->pd_refcount) == 0) { PWDDESC_XUNLOCK(pdp); return; } pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_set(pdp, NULL); PWDDESC_XUNLOCK(pdp); pwd_drop(pwd); PWDDESC_LOCK_DESTROY(pdp); free(pdp, M_PWDDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Share a pwddesc structure. */ struct pwddesc * pdshare(struct pwddesc *pdp) { refcount_acquire(&pdp->pd_refcount); return (pdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Unshare a pwddesc structure. */ void pdunshare(struct thread *td) { struct pwddesc *pdp; struct proc *p; p = td->td_proc; /* Not shared. */ if (refcount_load(&p->p_pd->pd_refcount) == 1) return; pdp = pdcopy(p->p_pd); pdescfree(td); p->p_pd = pdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(); FILEDESC_SLOCK(fdp); for (;;) { lastfile = fdlastfile(fdp); if (lastfile < newfdp->fd_nfiles) break; FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, lastfile + 1); FILEDESC_SLOCK(fdp); } /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } MPASS(newfdp->fd_freefile != -1); FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copy a pwddesc structure. */ struct pwddesc * pdcopy(struct pwddesc *pdp) { struct pwddesc *newpdp; MPASS(pdp != NULL); newpdp = pdinit(pdp, true); newpdp->pd_cmask = pdp->pd_cmask; PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Clear POSIX style locks. 
This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type != DTYPE_VNODE || !fhold(fp)) continue; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; KASSERT(refcount_load(&fdp->fd_refcnt) == 0, ("%s: fd table %p carries references", __func__, fdp)); /* * Serialize with threads iterating over the table, if any. */ if (refcount_load(&fdp->fd_holdcnt) > 1) { FILEDESC_XLOCK(fdp); FILEDESC_XUNLOCK(fdp); } FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; fdefree_last(fde); (void) closef(fp, td); } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (RACCT_ENABLED()) racct_set_unlocked(p, RACCT_NOFILE, 0); #endif if (p->p_fdtol != NULL) fdclearlocks(td); /* * Check fdhold for an explanation. */ atomic_store_ptr(&p->p_fd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; fdescfree_fds(td, fdp); } void pdescfree(struct thread *td) { struct proc *p; struct pwddesc *pdp; p = td->td_proc; pdp = p->p_pd; MPASS(pdp != NULL); /* * Check pdhold for an explanation. */ atomic_store_ptr(&p->p_pd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); pddrop(pdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. 
We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; if (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE)) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; MPASS(td != NULL); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. 
This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop_close(fp, td)); } /* * Hack for file descriptor passing code. */ void closef_nothread(struct file *fp) { fdrop(fp, NULL); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. 
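 *
 * The change below also constifies the ops argument, so a file type can
 * keep its method table in read-only memory.  A hypothetical consumer
 * (the my_* names are placeholders, not real symbols):
 *
 *   static fo_rdwr_t   my_read;
 *   static fo_close_t  my_close;
 *
 *   static const struct fileops my_fileops = {
 *       .fo_read  = my_read,
 *       .fo_close = my_close,
 *       .fo_flags = DFLAG_PASSABLE,
 *       // remaining methods omitted for brevity; a real file type fills
 *       // them in or points them at the invfo_* stubs
 *   };
 *
 *   finit(fp, FREAD, DTYPE_DEV, my_softc, &my_fileops);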
*/ void -finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +finit(struct file *fp, u_int flag, short type, void *data, + const struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } void -finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) +finit_vnode(struct file *fp, u_int flag, void *data, const struct fileops *ops) { fp->f_seqcount[UIO_READ] = 1; fp->f_seqcount[UIO_WRITE] = 1; finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, data, ops); } int fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); *fpp = NULL; fde = fdeget_noref(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde_inline(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } #ifdef CAPABILITIES int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; struct file *fp; seqc_t seq; *fpp = NULL; for (;;) { error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0 && !fhold(*fpp)) error = EBADF; FILEDESC_SUNLOCK(fdp); return (error); } #else int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { int error; error = fget_unlocked(td, fd, needrightsp, fpp); if (havecapsp != NULL && error == 0) filecaps_fill(havecapsp); return (error); } #endif int fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp) { struct filedesc *fdp; struct file *fp; int error; if (p == td->td_proc) /* curproc */ return (fget_unlocked(td, fd, &cap_no_rights, fpp)); PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) != 0) { fp = fget_noref(fdp, fd); if (fp != NULL && fhold(fp)) { *fpp = fp; error = 0; } else { error = EBADF; } } else { error = ENOENT; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); return (error); } #ifdef CAPABILITIES int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct filedescent *fde; const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; const cap_rights_t *haverights; cap_rights_t rights; seqc_t seq; int fd; VFS_SMR_ASSERT_ENTERED(); fd = ndp->ni_dirfd; rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); if (__predict_false(cap_check_inline_transient(haverights, &rights))) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == 
NULL)) { return (EAGAIN); } if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) return (EAGAIN); /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. * * Not yet supported by fast path. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { #ifdef notyet ndp->ni_lcf |= NI_LCF_STRICTREL; #else return (EAGAIN); #endif } *vpp = vp; return (0); } #else int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; int fd; VFS_SMR_ASSERT_ENTERED(); fd = ndp->ni_dirfd; fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) return (EAGAIN); filecaps_fill(&ndp->ni_filecaps); *vpp = vp; return (0); } #endif int fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp) { struct thread *td; struct file *fp; struct vnode *vp; struct componentname *cnp; cap_rights_t rights; int error; td = curthread; rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); cnp = &ndp->ni_cnd; error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { error = EBADF; goto out_free; } vp = fp->f_vnode; if (__predict_false(vp == NULL)) { error = ENOTDIR; goto out_free; } vrefact(vp); /* * XXX does not check for VDIR, handled by namei_setup */ if ((fp->f_flag & FSEARCH) != 0) cnp->cn_flags |= NOEXECCHECK; fdrop(fp, td); #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_lcf |= NI_LCF_STRICTREL; ndp->ni_resflags |= NIRES_STRICTREL; } #endif /* * TODO: avoid copying ioctl caps if it can be helped to begin with */ if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) filecaps_free_ioctl(&ndp->ni_filecaps); *vpp = vp; return (0); out_free: filecaps_free(&ndp->ni_filecaps); fdrop(fp, td); return (error); } /* * Fetch the descriptor locklessly. * * We avoid fdrop() races by never raising a refcount above 0. To accomplish * this we have to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain that the * identity is still correct and we did not lose a race due to preemption. * * Force a reload of fdt when looping. Another thread could reallocate * the table before this fd was closed, so it is possible that there is * a stale fp pointer in cached version. 
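 *
 * Stripped of the capability checks, the retry skeleton shared by both
 * variants below boils down to (sketch):
 *
 *   for (;;) {
 *       fp = fdt->fdt_ofiles[fd].fde_file;
 *       if (fp == NULL)
 *           return (EBADF);
 *       if (!refcount_acquire_if_not_zero(&fp->f_count)) {
 *           fdt = atomic_load_ptr(&fdp->fd_files);  // table may have been
 *           continue;                               // reallocated, reload
 *       }
 *       atomic_thread_fence_acq();
 *       fdt = fdp->fd_files;                        // re-read for check
 *       if (fp == fdt->fdt_ofiles[fd].fde_file)     // still the same file?
 *           break;
 *       fdrop(fp, td);                              // lost a race, retry
 *   }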
*/ #ifdef CAPABILITIES static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp) { struct filedesc *fdp; const struct filedescent *fde; const struct fdescenttbl *fdt; struct file *fp; seqc_t seq; cap_rights_t haverights; int error; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (EBADF); fdt = atomic_load_ptr(&fdp->fd_files); continue; } error = cap_check_inline(&haverights, needrightsp); if (__predict_false(error != 0)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (error); fdt = atomic_load_ptr(&fdp->fd_files); continue; } if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) break; fdrop(fp, td); } *fpp = fp; if (seqp != NULL) { *seqp = seq; } return (0); } #else static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp __unused) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) break; fdrop(fp, td); } *fpp = fp; return (0); } #endif /* * See the comments in fget_unlocked_seq for an explanation of how this works. * * This is a simplified variant which bails out to the aforementioned routine * if anything goes wrong. In practice this only happens when userspace is * racing with itself. */ int fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct filedesc *fdp; #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; const cap_rights_t *haverights; #endif fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { *fpp = NULL; return (EBADF); } #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (__predict_false(fp == NULL)) goto out_fallback; #ifdef CAPABILITIES if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) goto out_fallback; #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) goto out_fallback; /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. 
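 *
 * The check below pairs with the seqc_write_begin()/seqc_write_end()
 * bracketing in _finstall() and dupfdopen(): if the slot was modified
 * between the seqc_read_notmodify() snapshot taken at the top of the loop
 * and this point, the reference is dropped and the lookup retried, e.g.
 * (outline only):
 *
 *   seq = seqc_read_notmodify(fd_seqc(fdt, fd));    // snapshot
 *   // ... read fde_caps and fde_file ...
 *   if (!seqc_consistent(fd_seqc(fdt, fd), seq)) {  // writer interfered?
 *       fdrop(fp, td);
 *       continue;                                   // go around again
 *   }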
*/ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) #else if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) #endif goto out_fdrop; *fpp = fp; return (0); out_fdrop: fdrop(fp, td); out_fallback: *fpp = NULL; return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL)); } /* * Translate fd -> file when the caller guarantees the file descriptor table * can't be changed by others. * * Note this does not mean the file object itself is only visible to the caller, * merely that it wont disappear without having to be referenced. * * Must be paired with fput_only_user. */ #ifdef CAPABILITIES int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { const struct filedescent *fde; const struct fdescenttbl *fdt; const cap_rights_t *haverights; struct file *fp; int error; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fdt = fdp->fd_files; fde = &fdt->fdt_ofiles[fd]; fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); haverights = cap_rights_fde_inline(fde); error = cap_check_inline(haverights, needrightsp); if (__predict_false(error != 0)) return (error); *fpp = fp; return (0); } #else int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct file *fp; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fp = fdp->fd_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); *fpp = fp; return (0); } #endif /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp) { struct file *fp; int error; *fpp = NULL; error = fget_unlocked(td, fd, needrightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. 
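 *
 * The userspace-visible consequence (illustrative only):
 *
 *   int fd = open("/tmp/f", O_WRONLY | O_CREAT, 0600);
 *   char c;
 *   if (read(fd, &c, 1) == -1)
 *       assert(errno == EBADF);   // FREAD was never set on this file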
*/ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if (fp->f_ops != &path_fileops && ((fp->f_flag & (FREAD | FEXEC)) == 0 || (fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; return (error); #else cap_rights_t fdrights; struct filedesc *fdp; struct file *fp; seqc_t seq; *fpp = NULL; fdp = td->td_proc->p_fd; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } if (maxprotp != NULL) fdrights = *cap_rights(fdp, fd); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&fdrights); *fpp = fp; return (0); #endif } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { #ifndef CAPABILITIES return (fget_unlocked(td, fd, rightsp, fpp)); #else struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; int error; seqc_t seq; *fpp = NULL; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? 
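 *
 * A typical caller therefore ends up owning a vnode reference but no file
 * reference, e.g. (sketch):
 *
 *   struct vnode *vp;
 *   int error;
 *
 *   error = fgetvp_read(td, fd, &cap_read_rights, &vp);
 *   if (error != 0)
 *       return (error);
 *   // ... use vp; it is referenced but not locked ...
 *   vrele(vp);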
*/ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filecaps caps; struct file *fp; int error; error = fget_cap(td, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); fdrop(fp, td); return (0); out: filecaps_free(&caps); fdrop(fp, td); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. * * Without the noinline attribute clang keeps inlining the func thorough this * file when fdrop is used. */ int __noinline _fdrop(struct file *fp, struct thread *td) { int error; #ifdef INVARIANTS int count; count = refcount_load(&fp->f_count); if (count != 0) panic("fdrop: fp %p count %d", fp, count); #endif error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); error = EOPNOTSUPP; if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { goto done; } if (fp->f_ops == &path_fileops) { goto done; } error = 0; vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. 
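 *
 * This is reached from the open(2) path when a lower layer asks for an
 * already-open descriptor to be duplicated (ENODEV) or handed over
 * (ENXIO); fdescfs' /dev/fd/N entries are the usual trigger.  For the dup
 * case the requested access must be a subset of what the descriptor
 * already has; the test in the body,
 *
 *   ((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag
 *
 * is just a compact form of (equivalent sketch):
 *
 *   if (((mode & (FREAD | FWRITE)) & ~fp->f_flag) != 0)
 *       return (EACCES);   // asking for access the original fd lacks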
*/ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } if (!fhold(fp)) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EBADF); } newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seqc_write_begin(&oldfde->fde_seqc); seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); seqc_write_end(&oldfde->fde_seqc); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. 
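 *
 * The knob above is exported as "kern.chroot_allow_open_directories";
 * e.g. reading or tightening it from userspace (illustrative):
 *
 *   #include <sys/types.h>
 *   #include <sys/sysctl.h>
 *
 *   int val = 0;   // 0: never allow chroot(2) with directories open
 *   sysctlbyname("kern.chroot_allow_open_directories",
 *       NULL, NULL, &val, sizeof(val));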
*/ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int i; FILEDESC_LOCK_ASSERT(fdp); FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } static void pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) { if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { vrefact(oldpwd->pwd_cdir); newpwd->pwd_cdir = oldpwd->pwd_cdir; } if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { vrefact(oldpwd->pwd_rdir); newpwd->pwd_rdir = oldpwd->pwd_rdir; } if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { vrefact(oldpwd->pwd_jdir); newpwd->pwd_jdir = oldpwd->pwd_jdir; } if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) { vrefact(oldpwd->pwd_adir); newpwd->pwd_adir = oldpwd->pwd_adir; } } struct pwd * pwd_hold_pwddesc(struct pwddesc *pdp) { struct pwd *pwd; PWDDESC_ASSERT_XLOCKED(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (pwd != NULL) refcount_acquire(&pwd->pwd_refcount); return (pwd); } bool pwd_hold_smr(struct pwd *pwd) { MPASS(pwd != NULL); if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { return (true); } return (false); } struct pwd * pwd_hold(struct thread *td) { struct pwddesc *pdp; struct pwd *pwd; pdp = td->td_proc->p_pd; vfs_smr_enter(); pwd = vfs_smr_entered_load(&pdp->pd_pwd); if (pwd_hold_smr(pwd)) { vfs_smr_exit(); return (pwd); } vfs_smr_exit(); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); return (pwd); } struct pwd * pwd_hold_proc(struct proc *p) { struct pwddesc *pdp; struct pwd *pwd; PROC_ASSERT_HELD(p); PROC_LOCK(p); pdp = pdhold(p); MPASS(pdp != NULL); PROC_UNLOCK(p); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); pddrop(pdp); return (pwd); } static struct pwd * pwd_alloc(void) { struct pwd *pwd; pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); bzero(pwd, sizeof(*pwd)); refcount_init(&pwd->pwd_refcount, 1); return (pwd); } void pwd_drop(struct pwd *pwd) { if (!refcount_release(&pwd->pwd_refcount)) return; if (pwd->pwd_cdir != NULL) vrele(pwd->pwd_cdir); if (pwd->pwd_rdir != NULL) vrele(pwd->pwd_rdir); if (pwd->pwd_jdir != NULL) vrele(pwd->pwd_jdir); if (pwd->pwd_adir != NULL) vrele(pwd->pwd_adir); uma_zfree_smr(pwd_zone, pwd); } /* * The caller is responsible for invoking priv_check() and * mac_vnode_check_chroot() to authorize this operation. 
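 *
 * I.e. a caller is expected to have done roughly the following before
 * getting here (sketch only; see sys_chroot() for the real sequence and
 * exact prototypes):
 *
 *   error = priv_check(td, PRIV_VFS_CHROOT);
 *   if (error == 0)
 *       error = mac_vnode_check_chroot(td->td_ucred, vp);  // MAC kernels
 *   if (error == 0)
 *       error = pwd_chroot(td, vp);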
*/ int pwd_chroot(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && oldpwd->pwd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } } else { FILEDESC_SUNLOCK(fdp); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_adir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; VNPASS(vp->v_usecount > 0, vp); newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); newpwd->pwd_cdir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Process is transitioning to/from a non-native ABI. */ void pwd_altroot(struct thread *td, struct vnode *altroot_vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (altroot_vp != NULL) { /* * Native process to a non-native ABI. */ vrefact(altroot_vp); newpwd->pwd_adir = altroot_vp; } else { /* * Non-native process to the native ABI. */ vrefact(oldpwd->pwd_rdir); newpwd->pwd_adir = oldpwd->pwd_rdir; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * jail_attach(2) changes both root and working directories. 
*/ int pwd_chroot_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_cdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } vrefact(vp); newpwd->pwd_adir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_ensure_dirs(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL && oldpwd->pwd_adir != NULL) { PWDDESC_XUNLOCK(pdp); return; } PWDDESC_XUNLOCK(pdp); newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_fill(oldpwd, newpwd); if (newpwd->pwd_cdir == NULL) { vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; } if (newpwd->pwd_rdir == NULL) { vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; } if (newpwd->pwd_adir == NULL) { vrefact(rootvnode); newpwd->pwd_adir = rootvnode; } pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } void pwd_set_rootvnode(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; vrefact(rootvnode); newpwd->pwd_adir = rootvnode; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. 
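 *
 * Like pwd_chdir() and pwd_chroot() above, the update is done by building
 * a fresh struct pwd and swapping it in rather than mutating the shared
 * one, so lockless readers (pwd_hold()) always see a consistent snapshot.
 * The per-process step is essentially (sketch):
 *
 *   newpwd = pwd_alloc();
 *   PWDDESC_XLOCK(pdp);
 *   oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 *   // ... point the affected directories at newdp ...
 *   pwd_fill(oldpwd, newpwd);    // copy the untouched ones
 *   pwd_set(pdp, newpwd);
 *   PWDDESC_XUNLOCK(pdp);
 *   pwd_drop(oldpwd);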
*/ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; newpwd = pwd_alloc(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) continue; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd == NULL || (oldpwd->pwd_cdir != olddp && oldpwd->pwd_rdir != olddp && oldpwd->pwd_jdir != olddp && oldpwd->pwd_adir != olddp)) { PWDDESC_XUNLOCK(pdp); pddrop(pdp); continue; } if (oldpwd->pwd_cdir == olddp) { vrefact(newdp); newpwd->pwd_cdir = newdp; } if (oldpwd->pwd_rdir == olddp) { vrefact(newdp); newpwd->pwd_rdir = newdp; } if (oldpwd->pwd_jdir == olddp) { vrefact(newdp); newpwd->pwd_jdir = newdp; } if (oldpwd->pwd_adir == olddp) { vrefact(newdp); newpwd->pwd_adir = newdp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); pddrop(pdp); newpwd = pwd_alloc(); } sx_sunlock(&allproc_lock); pwd_drop(newpwd); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } int descrip_check_write_mp(struct filedesc *fdp, struct mount *mp) { struct file *fp; struct vnode *vp; int error, i; error = 0; FILEDESC_SLOCK(fdp); FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type != DTYPE_VNODE || (atomic_load_int(&fp->f_flag) & FWRITE) == 0) continue; vp = fp->f_vnode; if (vp->v_mount == mp) { error = EDEADLK; break; } } FILEDESC_SUNLOCK(fdp); return (error); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } struct filedesc_to_leader * filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp) { FILEDESC_XLOCK(fdp); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(fdp); return (fdtol); } static int filedesc_nfiles(struct filedesc *fdp) { NDSLOTTYPE *map; int count, off, minoff; if (fdp == NULL) return (0); count = 0; FILEDESC_SLOCK(fdp); map = fdp->fd_map; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) count += bitcountl(map[off]); FILEDESC_SUNLOCK(fdp); return (count); } int proc_nfiles(struct proc *p) { struct filedesc *fdp; int res; PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); res = filedesc_nfiles(fdp); fddrop(fdp); return (res); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { u_int namelen; int count; namelen = arg2; if (namelen != 1) return (EINVAL); if (*(int *)arg1 != 0) return (EINVAL); count = filedesc_nfiles(curproc->p_fd); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, 
CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ n += fdp->fd_nfiles; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto nextproc; FILEDESC_FOREACH_FP(fdp, n, fp) { xf.xf_fd = n; xf.xf_file = (uintptr_t)fp; xf.xf_data = (uintptr_t)fp->f_data; xf.xf_vnode = (uintptr_t)fp->f_vnode; xf.xf_type = (uintptr_t)fp->f_type; xf.xf_count = refcount_load(&fp->f_count); xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); /* * There is no need to re-check the fdtable refcount * here since the filedesc lock is not dropped in the * loop body. */ if (error != 0) break; } nextproc: FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. 
*/ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = refcount_load(&fp->f_count); kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct pwddesc *pdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) return (ENOMEM); efbuf->remainder -= kif->kf_structsize; } if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) return (sbuf_error(efbuf->sb)); return (0); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); if (efbuf->pdp != NULL) PWDDESC_XUNLOCK(efbuf->pdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->pdp != NULL) PWDDESC_XLOCK(efbuf->pdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct pwddesc *pdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; struct pwd *pwd; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = ktr_get_tracevp(p, true); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. 
*/ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); pdp = pdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; error = 0; if (tracevp != NULL) error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (error == 0 && textvp != NULL) error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (error == 0 && cttyvp != NULL) error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); if (error != 0 || pdp == NULL || fdp == NULL) goto fail; efbuf->fdp = fdp; efbuf->pdp = pdp; PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { /* working directory */ if (pwd->pwd_cdir != NULL) { vrefact(pwd->pwd_cdir); error = export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (error == 0 && pwd->pwd_rdir != NULL) { vrefact(pwd->pwd_rdir); error = export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (error == 0 && pwd->pwd_jdir != NULL) { vrefact(pwd->pwd_jdir); error = export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } } PWDDESC_XUNLOCK(pdp); if (error != 0) goto fail; if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fail: if (fdp != NULL) fddrop(fdp); if (pdp != NULL) pddrop(pdp); free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? 
error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) { int error; vrefact(vp); PWDDESC_XUNLOCK(pdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); PWDDESC_XLOCK(pdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; struct pwddesc *pdp; struct pwd *pwd; u_int namelen; int error, i, *name; struct file *fp; struct proc *p; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); if (fdp != NULL) pdp = pdhold(p); PROC_UNLOCK(p); if (fdp == NULL || pdp == NULL) { if (fdp != NULL) fddrop(fdp); return (ENOENT); } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { if (pwd->pwd_cdir != NULL) export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, okif, pdp, req); if (pwd->pwd_rdir != NULL) export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, okif, pdp, req); if (pwd->pwd_jdir != NULL) export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, okif, pdp, req); } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fddrop(fdp); pddrop(pdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, 
KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct pwddesc *pdp; struct pwd *pwd; struct export_fd_buf *efbuf; struct vnode *cdir; int error; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = pdp; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = 0; PWDDESC_XLOCK(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); cdir = pwd->pwd_cdir; if (cdir == NULL) { error = EINVAL; } else { vrefact(cdir); error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } PWDDESC_XUNLOCK(pdp); pddrop(pdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_EVENTFD: return ("eventfd"); case DTYPE_TIMERFD: return ("timerfd"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. 
*/ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); /* * XXXMJG this is a temporary hack due to boot ordering issues against * the vnode zone. 
*/ vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } -struct fileops badfileops = { +const struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; static int path_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (POLLNVAL); } static int path_close(struct file *fp, struct thread *td) { MPASS(fp->f_type == DTYPE_VNODE); fp->f_ops = &badfileops; vrele(fp->f_vnode); return (0); } -struct fileops path_fileops = { +const struct fileops path_fileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = path_poll, .fo_kqfilter = vn_kqfilter_opath, .fo_stat = vn_statfile, .fo_close = path_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = vn_fill_kinfo, .fo_cmp = vn_cmp, .fo_flags = DFLAG_PASSABLE, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio 
*trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c index 602b82105525..d83bc380c2fe 100644 --- a/sys/kern/kern_devctl.c +++ b/sys/kern/kern_devctl.c @@ -1,610 +1,610 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2020 M. Warner Losh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include STAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; uma_zone_t zone; } devsoftc; /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. * * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. */ #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static bool nomatch_enabled = true; SYSCTL_BOOL(_hw_bus, OID_AUTO, devctl_nomatch_enabled, CTLFLAG_RWTUN, &nomatch_enabled, 0, "enable nomatch events"); static void devctl_attach_handler(void *arg __unused, device_t dev); static void devctl_detach_handler(void *arg __unused, device_t dev, enum evhdev_detach state); static void devctl_nomatch_handler(void *arg __unused, device_t dev); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; #define DEVCTL_BUFFER (1024 - sizeof(void *)) struct dev_event_info { STAILQ_ENTRY(dev_event_info) dei_link; char dei_data[DEVCTL_BUFFER]; }; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); -static struct filterops devctl_rfiltops = { +static const struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devaddq(const char *type, const char *what, device_t dev); static struct devctlbridge { send_event_f *send_f; } devctl_notify_hook = { .send_f = NULL }; static void devctl_init(void) { int reserve; uma_zone_t z; devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); STAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); if (devctl_queue_length > 0) { /* * Allocate a zone for the messages. Preallocate 2% of these for * a reserve. Allow only devctl_queue_length slabs to cap memory * usage. The reserve usually allows coverage of surges of * events during memory shortages. Normally we won't have to * re-use events from the queue, but will in extreme shortages. 
*/ z = devsoftc.zone = uma_zcreate("DEVCTL", sizeof(struct dev_event_info), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); reserve = max(devctl_queue_length / 50, 100); /* 2% reserve */ uma_zone_set_max(z, devctl_queue_length); uma_zone_set_maxcache(z, 0); uma_zone_reserve(z, reserve); uma_prealloc(z, reserve); } EVENTHANDLER_REGISTER(device_attach, devctl_attach_handler, NULL, EVENTHANDLER_PRI_LAST); EVENTHANDLER_REGISTER(device_detach, devctl_detach_handler, NULL, EVENTHANDLER_PRI_LAST); EVENTHANDLER_REGISTER(device_nomatch, devctl_nomatch_handler, NULL, EVENTHANDLER_PRI_LAST); } SYSINIT(devctl_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, devctl_init, NULL); /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devctl_attach_handler(void *arg __unused, device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. We are called just before this * happens. */ static void devctl_detach_handler(void *arg __unused, device_t dev, enum evhdev_detach state) { if (state == EVHDEV_DETACH_COMPLETE) devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. */ static void devctl_nomatch_handler(void *arg __unused, device_t dev) { if (nomatch_enabled) devaddq("?", "", dev); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. */ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (STAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? 
-- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = STAILQ_FIRST(&devsoftc.devq); STAILQ_REMOVE_HEAD(&devsoftc.devq, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); uma_zfree(devsoftc.zone, n1); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!STAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ bool devctl_process_running(void) { return (devsoftc.inuse == 1); } static struct dev_event_info * devctl_alloc_dei(void) { struct dev_event_info *dei = NULL; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) goto out; dei = uma_zalloc(devsoftc.zone, M_NOWAIT); if (dei == NULL) dei = uma_zalloc(devsoftc.zone, M_NOWAIT | M_USE_RESERVE); if (dei == NULL) { /* * Guard against no items in the queue. Normally, this won't * happen, but if lots of events happen all at once and there's * a chance we're out of allocated space but none have yet been * queued when we get here, leaving nothing to steal. This can * also happen with error injection. Fail safe by returning * NULL in that case.. 
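	 * Otherwise recycle the oldest queued message: it is removed
	 * from the queue unread and its buffer is reused for the new
	 * event.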
*/ if (devsoftc.queued == 0) goto out; dei = STAILQ_FIRST(&devsoftc.devq); STAILQ_REMOVE_HEAD(&devsoftc.devq, dei_link); devsoftc.queued--; } MPASS(dei != NULL); *dei->dei_data = '\0'; out: mtx_unlock(&devsoftc.mtx); return (dei); } static struct dev_event_info * devctl_alloc_dei_sb(struct sbuf *sb) { struct dev_event_info *dei; dei = devctl_alloc_dei(); if (dei != NULL) sbuf_new(sb, dei->dei_data, sizeof(dei->dei_data), SBUF_FIXEDLEN); return (dei); } static void devctl_free_dei(struct dev_event_info *dei) { uma_zfree(devsoftc.zone, dei); } static void devctl_queue(struct dev_event_info *dei) { mtx_lock(&devsoftc.mtx); STAILQ_INSERT_TAIL(&devsoftc.devq, dei, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { struct dev_event_info *dei; struct sbuf sb; if (system == NULL || subsystem == NULL || type == NULL) return; if (devctl_notify_hook.send_f != NULL) devctl_notify_hook.send_f(system, subsystem, type, data); dei = devctl_alloc_dei_sb(&sb); if (dei == NULL) return; sbuf_cpy(&sb, "!system="); sbuf_cat(&sb, system); sbuf_cat(&sb, " subsystem="); sbuf_cat(&sb, subsystem); sbuf_cat(&sb, " type="); sbuf_cat(&sb, type); if (data != NULL) { sbuf_putc(&sb, ' '); sbuf_cat(&sb, data); } sbuf_putc(&sb, '\n'); if (sbuf_finish(&sb) != 0) devctl_free_dei(dei); /* overflow -> drop it */ else devctl_queue(dei); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_notify() interface instead. * * Output: * ${type}${what} at $(location dev) $(pnp-info dev) on $(parent dev) */ static void devaddq(const char *type, const char *what, device_t dev) { struct dev_event_info *dei; const char *parstr; struct sbuf sb; size_t beginlen; dei = devctl_alloc_dei_sb(&sb); if (dei == NULL) return; sbuf_cpy(&sb, type); sbuf_cat(&sb, what); sbuf_cat(&sb, " at "); beginlen = sbuf_len(&sb); /* Add in the location */ bus_child_location(dev, &sb); sbuf_putc(&sb, ' '); /* Add in pnpinfo */ bus_child_pnpinfo(dev, &sb); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? 
*/ else parstr = device_get_nameunit(device_get_parent(dev)); sbuf_cat(&sb, " on "); sbuf_cat(&sb, parstr); sbuf_putc(&sb, '\n'); if (sbuf_finish(&sb) != 0) goto bad; if (devctl_notify_hook.send_f != NULL) { const char *t; switch (*type) { case '+': t = "ATTACH"; break; case '-': t = "DETACH"; break; default: t = "NOMATCH"; break; } devctl_notify_hook.send_f("device", what, t, sbuf_data(&sb) + beginlen); } devctl_queue(dei); return; bad: devctl_free_dei(dei); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); /* * When set as a tunable, we've not yet initialized the mutex. * It is safe to just assign to devctl_queue_length and return * as we're racing no one. We'll use whatever value set in * devinit. */ if (!mtx_initialized(&devsoftc.mtx)) { devctl_queue_length = q; return (0); } /* * XXX It's hard to grow or shrink the UMA zone. Only allow * disabling the queue size for the moment until underlying * UMA issues can be sorted out. */ if (q != 0) return (EINVAL); if (q == devctl_queue_length) return (0); mtx_lock(&devsoftc.mtx); devctl_queue_length = 0; uma_zdestroy(devsoftc.zone); devsoftc.zone = 0; mtx_unlock(&devsoftc.mtx); return (0); } /** * @brief safely quotes strings that might have double quotes in them. * * The devctl protocol relies on quoted strings having matching quotes. * This routine quotes any internal quotes so the resulting string * is safe to pass to snprintf to construct, for example pnp info strings. * * @param sb sbuf to place the characters into * @param src Original buffer. */ void devctl_safe_quote_sb(struct sbuf *sb, const char *src) { while (*src != '\0') { if (*src == '"' || *src == '\\') sbuf_putc(sb, '\\'); sbuf_putc(sb, *src++); } } void devctl_set_notify_hook(send_event_f *hook) { devctl_notify_hook.send_f = hook; } void devctl_unset_notify_hook(void) { devctl_notify_hook.send_f = NULL; } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index fa96fbad20ce..dcb2c10ee1f5 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1,2831 +1,2831 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static const struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; -static struct fileops kqueueops = { +static const struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = file_kcmp_generic, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct 
knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); -static struct filterops file_filtops = { +static const struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; -static struct filterops kqread_filtops = { +static const struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ -static struct filterops proc_filtops = { +static const struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; -static struct filterops timer_filtops = { +static const struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; -static struct filterops user_filtops = { +static const struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int __exclusive_cache_line kq_ncallouts; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? 
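 *
 * KNOTE_ACTIVATE() marks a knote KN_ACTIVE and, unless it is already
 * queued or disabled, enqueues it on its kqueue; islock says whether
 * the caller already holds the kq lock.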
*/ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while (0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while (0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; -struct filterops null_filtops = { +static const struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ -extern struct filterops sig_filtops; -extern struct filterops fs_filtops; +extern const struct filterops sig_filtops; +extern const struct filterops fs_filtops; /* * Table for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { const struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { [~EVFILT_READ] = { &file_filtops, 1 }, [~EVFILT_WRITE] = { &file_filtops, 1 }, [~EVFILT_AIO] = { &null_filtops }, [~EVFILT_VNODE] = { &file_filtops, 1 }, [~EVFILT_PROC] = { &proc_filtops, 1 }, [~EVFILT_SIGNAL] = { &sig_filtops, 1 }, [~EVFILT_TIMER] = { &timer_filtops, 1 }, [~EVFILT_PROCDESC] = { &file_filtops, 1 }, [~EVFILT_FS] = { &fs_filtops, 1 }, [~EVFILT_LIO] = { &null_filtops }, [~EVFILT_USER] = { &user_filtops, 1 }, [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. 
*/ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. 
If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(int64_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. 
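 *
 * The fraction of a second in an sbintime_t is kept in units of
 * 2^-32 s, so e.g. nanoseconds scale by 2^32 / 10^9.  Writing the
 * factor as ((1 << 63) / (10^9 / 2)) evaluates 2^64 / 10^9 without
 * needing a 2^64 constant; the trailing >> 32 then converts the
 * 2^-64 product back to the 2^-32 fixed-point fraction.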
*/ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | NS_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; struct proc *p; struct knote *kn; int cpuid; int flags; TAILQ_ENTRY(kq_timer_cb_data) link; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; #define KQ_TIMER_CB_ENQUEUED 0x01 static void kqtimer_sched_callout(struct kq_timer_cb_data *kc) { callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn, kc->cpuid, C_ABSOLUTE); } void kqtimer_proc_continue(struct proc *p) { struct kq_timer_cb_data *kc, *kc1; struct bintime bt; sbintime_t now; PROC_LOCK_ASSERT(p, MA_OWNED); getboottimebin(&bt); now = bttosbt(bt); TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) { TAILQ_REMOVE(&p->p_kqtim_stop, kc, link); kc->flags &= ~KQ_TIMER_CB_ENQUEUED; if (kc->next <= now) filt_timerexpire_l(kc->kn, true); else kqtimer_sched_callout(kc); } } static void filt_timerexpire_l(struct knote *kn, bool proc_locked) { struct kq_timer_cb_data *kc; struct proc *p; uint64_t delta; sbintime_t now; kc = kn->kn_ptr.p_v; if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) { kn->kn_data++; KNOTE_ACTIVATE(kn, 0); return; } now = sbinuptime(); if (now >= kc->next) { delta = (now - kc->next) / kc->to; if (delta == 0) delta = 1; kn->kn_data += delta; kc->next += delta * kc->to; if (now >= kc->next) /* overflow */ kc->next = now + kc->to; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ } /* * Initial check for stopped kc->p is racy. It is fine to * miss the set of the stop flags, at worst we would schedule * one more callout. On the other hand, it is not fine to not * schedule when we we missed clearing of the flags, we * recheck them under the lock and observe consistent state. 
*/ p = kc->p; if (P_SHOULDSTOP(p) || P_KILLED(p)) { if (!proc_locked) PROC_LOCK(p); if (P_SHOULDSTOP(p) || P_KILLED(p)) { if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) { kc->flags |= KQ_TIMER_CB_ENQUEUED; TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link); } if (!proc_locked) PROC_UNLOCK(p); return; } if (!proc_locked) PROC_UNLOCK(p); } kqtimer_sched_callout(kc); } static void filt_timerexpire(void *knx) { filt_timerexpire_l(knx, false); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if (*to < 0) return (EINVAL); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to = MAX(0, *to - sbt); } return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; int error; to = -1; error = filt_timervalidate(kn, &to); if (error != 0) return (error); KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 || (kn->kn_sfflags & NOTE_ABSTIME) != 0, ("%s: periodic timer has a calculated zero timeout", __func__)); KASSERT(to >= 0, ("%s: timer has a calculated negative timeout", __func__)); if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { atomic_subtract_int(&kq_ncallouts, 1); return (ENOMEM); } if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); kc->kn = kn; kc->p = curproc; kc->cpuid = PCPU_GET(cpuid); kc->flags = 0; callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } kqtimer_sched_callout(kc); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; bool pending; kc = kn->kn_ptr.p_v; do { callout_drain(&kc->c); /* * kqtimer_proc_continue() might have rescheduled this callout. * Double-check, using the process mutex as an interlock. */ PROC_LOCK(kc->p); if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) { kc->flags &= ~KQ_TIMER_CB_ENQUEUED; TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link); } pending = callout_pending(&kc->c); PROC_UNLOCK(kc->p); } while (pending); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. 
This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. * * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } int sys_kqueuex(struct thread *td, struct kqueuex_args *uap) { int flags; if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((uap->flags & KQUEUE_CLOEXEC) != 0) flags |= O_CLOEXEC; return (kern_kqueue(td, flags, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). 
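	 * It is dropped with fdrop() once finit() has installed the new
	 * kqueue as the file's private data.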
*/ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; const void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct freebsd11_kevent kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. 
*/ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct freebsd11_kevent kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct freebsd11_kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init_zero(&rights); if (nchanges > 0) cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set_one(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; if (nchanges < 0) return (EINVAL); nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). 
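 *
 * The kqueue is a stack-allocated temporary that is never associated
 * with a file descriptor; it is drained and destroyed again before
 * this function returns.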
*/ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, const struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static const struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { const struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* Reject an invalid flag pair early */ if (kev->flags & EV_KEEPUDATA) { tkn = NULL; error = EINVAL; goto done; } /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. 
* We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); if ((kev->flags & EV_KEEPUDATA) == 0) kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. 
This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void ast_kqueue(struct thread *td, int tda __unused) { taskqueue_quiesce(taskqueue_kqueue_ctx); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; ast_sched(curthread, TDA_KQUEUE); } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? 
HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (maxevents < 0) { error = EINVAL; goto done_nl; } rsbt = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. 
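			 * The event is copied out before the knote is
			 * dropped, because EV_ONESHOT knotes are deleted
			 * once they have been delivered.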
*/ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. 
* If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct kqueue *kq = fp->f_data; kif->kf_type = KF_TYPE_KQUEUE; kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq; kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count; kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. 
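 * Called via the KNOTE(), KNOTE_LOCKED() and KNOTE_UNLOCKED() macros whenever
 * an event source fires.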
 *
 * There is a possibility to optimize in the case of one kq watching
 * another.  Instead of scheduling a task to wake it up, you could pass
 * enough state down the chain to make up the parent kqueue.  Make this
 * code functional first.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn, *tkn;
	int error;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and enter influx), we can
	 * eliminate the kqueue scheduling, but this will introduce
	 * four lock/unlock's for each knote to test.  Also, marker
	 * would be needed to keep iteration position, since filters
	 * or other threads could remove events.
	 */
	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
			/*
			 * Do not process the influx notes, except for
			 * the influx coming from the kq unlock in the
			 * kqueue_scan().  In the latter case, we do
			 * not interfere with the scan, since the code
			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we have finished.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			kn_leave_flux(kn);
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p was not detached", kn));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}

static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
    int kqislocked)
{
	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ?
MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_lock(void *arg, int what) { if (what == LA_LOCKED) mtx_assert((struct mtx *)arg, MA_OWNED); else mtx_assert((struct mtx *)arg, MA_NOTOWNED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_lock)(void *, int)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_lock == NULL) knl->kl_assert_lock = knlist_mtx_assert_lock; else knl->kl_assert_lock = kl_assert_lock; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". 
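 * If killkn is true the knotes are dropped outright; otherwise they are left
 * attached but marked EV_EOF | EV_ONESHOT so that they are reaped on the
 * next scan.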
*/ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = 
kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 802231767762..6661f4cd6187 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1,4618 +1,4618 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static void sigfastblock_setpend(struct thread *td, bool resched); static uma_zone_t ksiginfo_zone = NULL; -struct filterops sig_filtops = { +const struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); static int kern_signosys = 1; SYSCTL_INT(_kern, OID_AUTO, signosys, CTLFLAG_RWTUN, &kern_signosys, 0, "Send SIGSYS on return from invalid syscall"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); static bool kern_sig_discard_ign = true; SYSCTL_BOOL(_kern, OID_AUTO, sig_discard_ign, CTLFLAG_RWTUN, &kern_sig_discard_ign, 0, "Discard ignored signals on delivery, otherwise queue them 
to " "the target queue"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ static const int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; #define _SIG_FOREACH_ADVANCE(i, set) ({ \ int __found; \ for (;;) { \ if (__bits != 0) { \ int __sig = ffs(__bits); \ __bits &= ~(1u << (__sig - 1)); \ sig = __i * sizeof((set)->__bits[0]) * NBBY + __sig; \ __found = 1; \ break; \ } \ if (++__i == _SIG_WORDS) { \ __found = 0; \ break; \ } \ __bits = (set)->__bits[__i]; \ } \ __found != 0; \ }) #define SIG_FOREACH(i, set) \ for (int32_t __i = -1, __bits = 0; \ _SIG_FOREACH_ADVANCE(i, set); ) \ static sigset_t fastblock_mask; static void ast_sig(struct thread *td, int tda) { struct proc *p; int old_boundary, sig; bool resched_sigs; p = td->td_proc; #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (tda & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDA_SIG should be re-read from * td_ast, 
since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. */ if ((p->p_flag & P_PPWAIT) == 0 && (td->td_pflags & TDP_SIGFASTBLOCK) == 0) { if (SIGPENDING(td) && ((tda | td->td_ast) & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { thread_unlock(td); /* fix dumps */ panic( "failed2 to set signal flags for ast p %p " "td %p tda %#x td_ast %#x fl %#x", p, td, tda, td->td_ast, td->td_flags); } } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if ((tda & TDAI(TDA_SIG)) != 0 || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { sigfastblock_fetch(td); PROC_LOCK(p); old_boundary = ~TDB_BOUNDARY | (td->td_dbgflags & TDB_BOUNDARY); td->td_dbgflags |= TDB_BOUNDARY; mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); td->td_dbgflags &= old_boundary; PROC_UNLOCK(p); resched_sigs = true; } else { resched_sigs = false; } /* * Handle deferred update of the fast sigblock value, after * the postsig() loop was performed. */ sigfastblock_setpend(td, resched_sigs); } static void ast_sigsuspend(struct thread *td, int tda __unused) { MPASS((td->td_pflags & TDP_OLDMASK) != 0); td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); ast_register(TDA_SIG, ASTR_UNCOND, 0, ast_sig); ast_register(TDA_SIGSUSPEND, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_OLDMASK, ast_sigsuspend); } ksiginfo_t * ksiginfo_alloc(int mwait) { MPASS(mwait == M_WAITOK || mwait == M_NOWAIT); if (ksiginfo_zone == NULL) return (NULL); return (uma_zalloc(ksiginfo_zone, mwait | M_ZERO)); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline bool ksiginfo_tryfree(ksiginfo_t *ksi) { if ((ksi->ksi_flags & KSI_EXT) == 0) { uma_zfree(ksiginfo_zone, ksi); return (true); } return (false); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. 
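 * A queued ksiginfo for signo, if any, is dequeued and copied into *si;
 * signals posted without siginfo (the sq_kill and sq_ptrace bitmaps) only
 * report the signal number.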
* Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(M_NOWAIT)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; 
ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) ast_sched(td, TDA_SIG); } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. 
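 * In that case the SS_ONSTACK flag alone decides the result.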
*/ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. 
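					 * (init is the reaper of last
					 * resort and must not opt out of
					 * reaping.)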
*/ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. 
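		 * reschedule_signals() below does this; it is skipped for
		 * single-threaded processes, where there is no other thread
		 * to hand the signal to.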
*/ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { /* * sigwait() function shall not return EINTR, but * the syscall does. Non-ancient libc provides the * wrapper which hides EINTR. Otherwise, EINTR return * is used by libthr to handle required cancellation * point in the sigwait(). */ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) return (ERESTART); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timevalid = 0; sbintime_t sbt, precision, tsbt; struct timespec ts; bool traced; p = td->td_proc; error = 0; traced = false; /* Ensure the sigfastblock value is up to date. 
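 * (sigfastblock_fetch() re-reads the userspace fast-block word, so the
 * pending-signal checks below see a current td_sigblock_val).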
*/ sigfastblock_fetch(td); if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; ts = *timeout; if (ts.tv_sec < INT32_MAX / 2) { tsbt = tstosbt(ts); precision = tsbt; precision >>= tc_precexp; if (TIMESEL(&sbt, tsbt)) sbt += tc_tick_sbt; sbt += tsbt; } else precision = sbt = 0; } } else precision = sbt = 0; ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); if ((p->p_sysent->sv_flags & SV_SIG_DISCIGN) != 0 || !kern_sig_discard_ign) { thread_lock(td); td->td_flags |= TDF_SIGWAIT; thread_unlock(td); } for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL && !timevalid) { error = EINVAL; break; } if (traced) { error = EINTR; break; } error = msleep_sbt(&p->p_sigacts, &p->p_mtx, PPAUSE | PCATCH, "sigwait", sbt, precision, C_ABSOLUTE); /* The syscalls can not be restarted. */ if (error == ERESTART) error = EINTR; /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } thread_lock(td); td->td_flags &= ~TDF_SIGWAIT; thread_unlock(td); new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. 
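 * The old struct sigvec is translated to and from struct sigaction:
 * sv_handler <-> sa_handler, sv_mask <-> sa_mask via OSIG2SIG()/SIG2OSIG(),
 * and sv_flags <-> sa_flags with SV_INTERRUPT inverted into SA_RESTART.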
*/ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; ast_sched(td, TDA_SIGSUSPEND); /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. 
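 * The osigset_t argument is widened to a full sigset_t with OSIG2SIG()
 * before being handed to kern_sigsuspend().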
*/ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig_locked(struct proc *p, struct killpg1_ctx *arg) { int err; err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); killpg1_sendsig_locked(p, arg); PROC_UNLOCK(p); } static void kill_processes_prison_cb(struct proc *p, void *arg) { struct killpg1_ctx *ctx = arg; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (p == ctx->td->td_proc) || p->p_state == PRS_NEW) return; killpg1_sendsig_locked(p, ctx); } /* * Common code for kill process group/broadcast kill. * td is the calling thread, as usual. 
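 * A pgid of 0 selects the caller's own process group; a non-zero "all"
 * requests a broadcast restricted to the caller's prison via
 * prison_proc_iterate().  System processes and pid 1 are always skipped,
 * as is the caller itself in the broadcast case.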
*/ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ prison_proc_iterate(td->td_ucred->cr_prison, kill_processes_prison_cb, &arg); } else { again: sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); if (!sx_try_xlock(&pgrp->pg_killsx)) { PGRP_UNLOCK(pgrp); sx_xlock(&pgrp->pg_killsx); sx_xunlock(&pgrp->pg_killsx); goto again; } LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); sx_xunlock(&pgrp->pg_killsx); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (pid != td->td_proc->p_pid) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SIGNAL, &signum); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); } AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; 
sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signumf, union sigval *value) { ksiginfo_t ksi; struct proc *p; struct thread *td2; u_int signum; int error; signum = signumf & ~__SIGQUEUE_TID; if (signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((signumf & __SIGQUEUE_TID) == 0) { if ((p = pfind_any(pid)) == NULL) return (ESRCH); td2 = NULL; } else { p = td->td_proc; td2 = tdfind((lwpid_t)pid, p->p_pid); if (td2 == NULL) return (ESRCH); } error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = tdsendsignal(p, td2, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int sig; p = td->td_proc; sig = ksi->ksi_signo; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, ksi->ksi_code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. 
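 * With kern_forcesigexit enabled, the signal is unblocked, reset to
 * SIG_DFL and removed from the catch/ignore sets (and any fast-block
 * state is cleared) before being re-sent, so the trap-generated signal
 * terminates the process instead of recurring.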
*/ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); /* Find a non-stopped thread that does not mask the signal. */ signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0) && (td->td_flags & TDF_BOUNDARY) == 0) { signal_td = td; break; } } /* Select random (first) thread if no better match was found. */ if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. 
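 * For SIGEV_THREAD_ID the sigev_notify_thread_id is resolved with
 * tdfind(), failing with ESRCH if no such thread exists in this process;
 * for all other notify types *ttd is set to NULL and the process is
 * locked here.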
*/ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } static int sig_sleepq_abort(struct thread *td, int intrval) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (intrval == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { thread_unlock(td); return (0); } return (sleepq_abort(td, intrval)); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, then we forget about it * immediately, except when the target process executes * sigwait(). (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { if (kern_sig_discard_ign && (p->p_sysent->sv_flags & SV_SIG_DISCIGN) == 0) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } else { action = SIG_CATCH; intrval = 0; } } else { if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; } mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) != 0 && (p->p_pgrp->pg_flags & PGRP_ORPHANED) != 0 && action == SIG_DFL) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. 
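 * (A held SIGCONT-class signal still falls through, so that a process
 * stopped by job control is resumed below.)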
*/ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out_cont; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out_cont; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sig_sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. 
*/ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out_cont: itimer_proc_continue(p); kqtimer_proc_continue(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sig_sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. 
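 * On SMP, forward_signal() interrupts the CPU that is currently running
 * the target thread so it notices the pending signal without waiting for
 * its next trip through the kernel.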
*/ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static void ptrace_coredumpreq(struct thread *td, struct proc *p, struct thr_coredump_req *tcq) { void *rl_cookie; if (p->p_sysent->sv_coredump == NULL) { tcq->tc_error = ENOSYS; return; } rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); } static void ptrace_syscallreq(struct thread *td, struct proc *p, struct thr_syscall_req *tsr) { struct sysentvec *sv; struct sysent *se; register_t rv_saved[2]; int error, nerror; int sc; bool audited, sy_thr_static; sv = p->p_sysent; if (sv->sv_table == NULL || sv->sv_size < tsr->ts_sa.code) { tsr->ts_ret.sr_error = ENOSYS; return; } sc = tsr->ts_sa.code; if (sc == SYS_syscall || sc == SYS___syscall) { sc = tsr->ts_sa.args[0]; memmove(&tsr->ts_sa.args[0], &tsr->ts_sa.args[1], sizeof(register_t) * (tsr->ts_nargs - 1)); } tsr->ts_sa.callp = se = &sv->sv_table[sc]; VM_CNT_INC(v_syscall); td->td_pticks = 0; if (__predict_false(td->td_cowgen != atomic_load_int( &td->td_proc->p_cowgen))) thread_cow_update(td); td->td_sa = tsr->ts_sa; #ifdef CAPABILITY_MODE if ((se->sy_flags & SYF_CAPENABLED) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SYSCALL, NULL); if (IN_CAPABILITY_MODE(td)) { tsr->ts_ret.sr_error = ECAPMODE; return; } } #endif sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; audited = AUDIT_SYSCALL_ENTER(sc, td) != 0; if (!sy_thr_static) { error = syscall_thread_enter(td, &se); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (error != 0) { tsr->ts_ret.sr_error = error; return; } } rv_saved[0] = td->td_retval[0]; rv_saved[1] = td->td_retval[1]; nerror = td->td_errno; td->td_retval[0] = 0; td->td_retval[1] = 0; #ifdef KDTRACE_HOOKS if (se->sy_entry != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_ENTRY, 0); #endif tsr->ts_ret.sr_error = se->sy_call(td, tsr->ts_sa.args); #ifdef KDTRACE_HOOKS if (se->sy_return != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_RETURN, tsr->ts_ret.sr_error != 0 ? 
-1 : td->td_retval[0]); #endif tsr->ts_ret.sr_retval[0] = td->td_retval[0]; tsr->ts_ret.sr_retval[1] = td->td_retval[1]; td->td_retval[0] = rv_saved[0]; td->td_retval[1] = rv_saved[1]; td->td_errno = nerror; if (audited) AUDIT_SYSCALL_EXIT(error, td); if (!sy_thr_static) syscall_thread_exit(td, se); } static void ptrace_remotereq(struct thread *td, int flag) { struct proc *p; MPASS(td == curthread); p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_dbgflags & flag) == 0) return; KASSERT((p->p_flag & P_STOPPED_TRACE) != 0, ("not stopped")); KASSERT(td->td_remotereq != NULL, ("td_remotereq is NULL")); PROC_UNLOCK(p); switch (flag) { case TDB_COREDUMPREQ: ptrace_coredumpreq(td, p, td->td_remotereq); break; case TDB_SCREMOTEREQ: ptrace_syscallreq(td, p, td->td_remotereq); break; default: __unreachable(); } PROC_LOCK(p); MPASS((td->td_dbgflags & flag) != 0); td->td_dbgflags &= ~flag; td->td_remotereq = NULL; wakeup(p); } static int sig_suspend_threads(struct thread *td, struct proc *p) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); ast_sched_locked(td2, TDA_SUSPEND); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. 
* Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: td->td_dbgflags |= TDB_SSWITCH; thread_suspend_switch(td, p); td->td_dbgflags &= ~TDB_SSWITCH; if ((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != 0) { MPASS((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SUNLOCK(p); ptrace_remotereq(td, td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SLOCK(p); goto stopme; } if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; SIG_FOREACH(sig, &block) { td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. 
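 * The set passed to reschedule_signals() is exactly the set of signals
 * the exiting thread used to accept, i.e. the complement of its old
 * signal mask.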
*/ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } enum sigstatus { SIGSTATUS_HANDLE, SIGSTATUS_HANDLED, SIGSTATUS_IGNORE, SIGSTATUS_SBDRY_STOP, }; /* * The thread has signal "sig" pending. Figure out what to do with it: * * _HANDLE -> the caller should handle the signal * _HANDLED -> handled internally, reload pending signal set * _IGNORE -> ignored, remove from the set of pending signals and try the * next pending signal * _SBDRY_STOP -> the signal should stop the thread but this is not * permitted in the current context */ static enum sigstatus sigprocess(struct thread *td, int sig) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; ksiginfo_t ksi; int prop; KASSERT(_SIG_VALID(sig), ("%s: invalid signal %d", __func__, sig)); p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); /* * We should allow pending but ignored signals below * if there is sigwait() active, or P_TRACED was * on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { return (SIGSTATUS_IGNORE); } /* * If the process is going to single-thread mode to prepare * for exit, there is no sense in delivering any signal * to usermode. Another important consequence is that * msleep(..., PCATCH, ...) now is only interruptible by a * suspend request. */ if ((p->p_flag2 & P2_WEXIT) != 0) return (SIGSTATUS_IGNORE); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. 
* Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) return (SIGSTATUS_HANDLED); /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); return (SIGSTATUS_HANDLED); } /* * If the traced bit got turned off, requeue the signal and * reload the set of pending signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { if ((ksi.ksi_flags & KSI_PTRACE) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); } return (SIGSTATUS_HANDLED); } } /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif return (SIGSTATUS_IGNORE); } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ prop = sigprop(sig); if (prop & SIGPROP_STOP) { mtx_unlock(&ps->ps_mtx); if ((p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT)) != 0 || ((p->p_pgrp-> pg_flags & PGRP_ORPHANED) != 0 && (prop & SIGPROP_TTYSTOP) != 0)) { mtx_lock(&ps->ps_mtx); return (SIGSTATUS_IGNORE); } if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_SBDRY_STOP); } WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_HANDLED); } else if ((prop & SIGPROP_IGNORE) != 0 && (td->td_flags & TDF_SIGWAIT) == 0) { /* * Default action is to ignore; drop it if * not in kern_sigtimedwait(). */ return (SIGSTATUS_IGNORE); } else { return (SIGSTATUS_HANDLE); } case (intptr_t)SIG_IGN: if ((td->td_flags & TDF_SIGWAIT) == 0) return (SIGSTATUS_IGNORE); else return (SIGSTATUS_HANDLE); default: /* * This signal has an action, let postsig() process it. */ return (SIGSTATUS_HANDLE); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling * issignal by checking the pending signal masks in cursig.) 
The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; sigset_t sigpending; int sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ td->td_dbgflags |= TDB_FSTP; SIGEMPTYSET(sigpending); SIGADDSET(sigpending, SIGSTOP); } SIG_FOREACH(sig, &sigpending) { switch (sigprocess(td, sig)) { case SIGSTATUS_HANDLE: return (sig); case SIGSTATUS_HANDLED: goto next; case SIGSTATUS_IGNORE: sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); break; case SIGSTATUS_SBDRY_STOP: return (-1); } } next:; } } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. 
Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } int sig_ast_checksusp(struct thread *td) { struct proc *p __diagused; int ret; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SUSPEND)) return (0); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); return (ret); } int sig_ast_needsigchk(struct thread *td) { struct proc *p; struct sigacts *ps; int ret, sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SIG)) return (0); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); ret = 0; } /* * Do not go into sleep if this thread was the ptrace(2) * attach leader. cursig() consumed SIGSTOP from PT_ATTACH, * but we usually act on the signal by interrupting sleep, and * should do that here as well. */ if ((td->td_dbgflags & TDB_FSTP) != 0) { if (ret == 0) ret = EINTR; td->td_dbgflags &= ~TDB_FSTP; } return (ret); } int sig_intr(void) { struct thread *td; struct proc *p; int ret; td = curthread; if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); p = td->td_proc; PROC_LOCK(p); ret = sig_ast_checksusp(td); if (ret == 0) ret = sig_ast_needsigchk(td); PROC_UNLOCK(p); return (ret); } bool curproc_sigkilled(void) { struct thread *td; struct proc *p; struct sigacts *ps; bool res; td = curthread; if (!td_ast_pending(td, TDA_SIG)) return (false); p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); res = SIGISMEMBER(td->td_sigqueue.sq_signals, SIGKILL) || SIGISMEMBER(p->p_sigqueue.sq_signals, SIGKILL); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (res); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. 
* If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; const char *coreinfo; int rv; PROC_LOCK_ASSERT(p, MA_OWNED); proc_set_p2_wexit(p); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ rv = coredump(td); switch (rv) { case 0: sig |= WCOREFLAG; coreinfo = " (core dumped)"; break; case EFAULT: coreinfo = " (no core dump - bad address)"; break; case EINVAL: coreinfo = " (no core dump - invalid argument)"; break; case EFBIG: coreinfo = " (no core dump - too large)"; break; default: coreinfo = " (no core dump - other error)"; break; } if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, coreinfo); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). 
*/ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? 
VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE_PNBUF(&nd); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
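 *
 * As a purely illustrative example (hypothetical values): with kern.corefile
 * set to "/var/coredumps/%U/%N.%P.core", a crash of pid 1234 of a process
 * named "mydaemon" running as uid 1001 would be written to
 * "/var/coredumps/1001/mydaemon.1234.core".  The expansion code below also
 * accepts %H (hostname), %I (auto-incrementing index, bounded by the
 * debug.ncores sysctl), %S (signal number) and %% (a literal '%').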
*/ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE_PNBUF(&nd); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. 
*/ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front of it.
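 *
 * A sketch of the resulting notification (paths are hypothetical): it is
 * posted on the devctl system "kernel", subsystem "signal", type "coredump",
 * with data resembling
 *	comm="/usr/local/bin/mydaemon" core="/var/coredumps/mydaemon.1234.core"
 * which a devd(8) rule can match to kick off automated post-mortem handling.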
*/ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; if (SV_PROC_FLAG(p, SV_SIGSYS) != 0 && kern_signosys) { PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); } if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3 || (p->p_pid == 1 && (kern_lognosys & 3) == 0)) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
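 *
 * For reference, a minimal userland sketch of a consumer of this filter
 * (hypothetical example; error handling and the <sys/event.h>/<signal.h>
 * includes are omitted) that counts SIGUSR1 deliveries through kqueue:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	signal(SIGUSR1, SIG_IGN);
 *	EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &kev, 1, NULL);
 *
 * After the second kevent() returns, kev.data holds the number of times the
 * signal was delivered since the last retrieval, matching the kn_data
 * accounting done in filt_signal() below.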
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); SIG_FOREACH(sig, &ps->ps_sigcatch) { sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } ast_sched(td, TDA_SIG); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. 
*/ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTPENDING) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } static void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c index bb78e4a35451..5a6ebacb780c 100644 --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -1,309 +1,309 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 */ /* * Error log buffer for kernel printf's. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOG_RDPRI (PZERO + 1) #define LOG_ASYNC 0x04 static d_open_t logopen; static d_close_t logclose; static d_read_t logread; static d_ioctl_t logioctl; static d_poll_t logpoll; static d_kqfilter_t logkqfilter; static void logtimeout(void *arg); static struct cdevsw log_cdevsw = { .d_version = D_VERSION, .d_open = logopen, .d_close = logclose, .d_read = logread, .d_ioctl = logioctl, .d_poll = logpoll, .d_kqfilter = logkqfilter, .d_name = "log", }; static int logkqread(struct knote *note, long hint); static void logkqdetach(struct knote *note); -static struct filterops log_read_filterops = { +static const struct filterops log_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = logkqdetach, .f_event = logkqread, }; static struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* process waiting on select call */ struct sigio *sc_sigio; /* information for async I/O */ struct callout sc_callout; /* callout to wakeup syslog */ } logsoftc; int log_open; /* also used in log() */ static struct cv log_wakeup; struct mtx msgbuf_lock; MTX_SYSINIT(msgbuf_lock, &msgbuf_lock, "msgbuf lock", MTX_DEF); static int log_wakeups_per_second = 5; SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW, &log_wakeups_per_second, 0, "How often (times per second) to check for /dev/log waiters."); /*ARGSUSED*/ static int logopen(struct cdev *dev, int flags, int mode, struct thread *td) { if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. 
Adjusting to 1.\n"); log_wakeups_per_second = 1; } mtx_lock(&msgbuf_lock); if (log_open) { mtx_unlock(&msgbuf_lock); return (EBUSY); } log_open = 1; callout_reset_sbt(&logsoftc.sc_callout, SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1)); mtx_unlock(&msgbuf_lock); fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ return (0); } /*ARGSUSED*/ static int logclose(struct cdev *dev, int flag, int mode, struct thread *td) { funsetown(&logsoftc.sc_sigio); mtx_lock(&msgbuf_lock); callout_stop(&logsoftc.sc_callout); logsoftc.sc_state = 0; log_open = 0; mtx_unlock(&msgbuf_lock); return (0); } /*ARGSUSED*/ static int logread(struct cdev *dev, struct uio *uio, int flag) { char buf[128]; struct msgbuf *mbp = msgbufp; int error = 0, l; mtx_lock(&msgbuf_lock); while (msgbuf_getcount(mbp) == 0) { if (flag & IO_NDELAY) { mtx_unlock(&msgbuf_lock); return (EWOULDBLOCK); } if ((error = cv_wait_sig(&log_wakeup, &msgbuf_lock)) != 0) { mtx_unlock(&msgbuf_lock); return (error); } } while (uio->uio_resid > 0) { l = imin(sizeof(buf), uio->uio_resid); l = msgbuf_getbytes(mbp, buf, l); if (l == 0) break; mtx_unlock(&msgbuf_lock); error = uiomove(buf, l, uio); if (error || uio->uio_resid == 0) return (error); mtx_lock(&msgbuf_lock); } mtx_unlock(&msgbuf_lock); return (error); } /*ARGSUSED*/ static int logpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; if (events & (POLLIN | POLLRDNORM)) { mtx_lock(&msgbuf_lock); if (msgbuf_getcount(msgbufp) > 0) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &logsoftc.sc_selp); mtx_unlock(&msgbuf_lock); } return (revents); } static int logkqfilter(struct cdev *dev, struct knote *kn) { if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &log_read_filterops; kn->kn_hook = NULL; mtx_lock(&msgbuf_lock); knlist_add(&logsoftc.sc_selp.si_note, kn, 1); mtx_unlock(&msgbuf_lock); return (0); } static int logkqread(struct knote *kn, long hint) { mtx_assert(&msgbuf_lock, MA_OWNED); kn->kn_data = msgbuf_getcount(msgbufp); return (kn->kn_data != 0); } static void logkqdetach(struct knote *kn) { mtx_lock(&msgbuf_lock); knlist_remove(&logsoftc.sc_selp.si_note, kn, 1); mtx_unlock(&msgbuf_lock); } static void logtimeout(void *arg) { if (!log_open) return; if (msgbuftrigger == 0) goto done; msgbuftrigger = 0; selwakeuppri(&logsoftc.sc_selp, LOG_RDPRI); KNOTE_LOCKED(&logsoftc.sc_selp.si_note, 0); if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) pgsigio(&logsoftc.sc_sigio, SIGIO, 0); cv_broadcastpri(&log_wakeup, LOG_RDPRI); done: if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; } callout_reset_sbt(&logsoftc.sc_callout, SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1)); } /*ARGSUSED*/ static int logioctl(struct cdev *dev, u_long com, caddr_t data, int flag, struct thread *td) { switch (com) { /* return number of characters immediately available */ case FIONREAD: *(int *)data = msgbuf_getcount(msgbufp); break; case FIONBIO: break; case FIOASYNC: mtx_lock(&msgbuf_lock); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; mtx_unlock(&msgbuf_lock); break; case FIOSETOWN: return (fsetown(*(int *)data, &logsoftc.sc_sigio)); case FIOGETOWN: *(int *)data = fgetown(&logsoftc.sc_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. 
*/ case TIOCSPGRP: return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); /* This is deprecated, FIOGETOWN should be used instead */ case TIOCGPGRP: *(int *)data = -fgetown(&logsoftc.sc_sigio); break; default: return (ENOTTY); } return (0); } static void log_drvinit(void *unused) { cv_init(&log_wakeup, "klog"); callout_init_mtx(&logsoftc.sc_callout, &msgbuf_lock, 0); knlist_init_mtx(&logsoftc.sc_selp.si_note, &msgbuf_lock); make_dev_credf(MAKEDEV_ETERNAL, &log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "klog"); } SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,log_drvinit,NULL); diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index 739dbf75b01e..20e73f9c6b1b 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -1,346 +1,346 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC"); _Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK"); MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures"); static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; -static struct fileops eventfdops = { +static const struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); -static struct filterops eventfd_rfiltops = { +static const struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; -static struct filterops eventfd_wfiltops = { +static const struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; int eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval, int flags) { struct eventfd *efd; int fflags; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_VALUE(initval); efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & EFD_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops); return (0); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); mtx_destroy(&efd->efd_lock); free(efd, M_EVENTFD); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; efd = fp->f_data; mtx_lock(&efd->efd_lock); while (error == 0 && efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdrd", 0); } if (error == 0) { MPASS(efd->efd_count > 0); if ((efd->efd_flags & EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct 
eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); efd = fp->f_data; mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdwr", 0); if (error == 0) goto retry; } if (error == 0) { MPASS(UINT64_MAX - efd->efd_count > count); efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents; efd = fp->f_data; revents = 0; mtx_lock(&efd->efd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0) revents |= events & (POLLIN | POLLRDNORM); if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT | POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd = fp->f_data; mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)efd->efd_count; ret = efd->efd_count > 0; return (ret); } static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count); ret = UINT64_MAX - 1 > efd->efd_count; return (ret); } static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); st->st_mode = S_IFIFO; return (0); } static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct eventfd *efd = fp->f_data; kif->kf_type = KF_TYPE_EVENTFD; mtx_lock(&efd->efd_lock); kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count; kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags; kif->kf_un.kf_eventfd.kf_eventfd_addr = (uintptr_t)efd; mtx_unlock(&efd->efd_lock); return (0); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index c34c7b24a269..f2f1a42adf2b 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1,1903 +1,1903 @@ /*- * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. 
The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; -struct fileops pipeops = { +const struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); -static struct filterops pipe_nfiltops = { +static const struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; -static struct filterops pipe_rfiltops = { +static const struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; -static struct filterops pipe_wfiltops = { +static const struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. 
*/ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; static long pipe_mindirect = PIPE_MINDIRECT; static int pipebuf_reserv = 2; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipebuf_reserv, CTLFLAG_RW, &pipebuf_reserv, 0, "Superuser-reserved percentage of the pipe buffers space"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, bool backing); static int pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static void pipe_timestamp(struct timespec *tsp); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr64 pipeino_unr; static dev_t pipedev_ino; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); new_unrhdr64(&pipeino_unr, 1); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS) { int error = 0; long tmp_pipe_mindirect = pipe_mindirect; error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req); if (error != 0 || req->newptr == NULL) return (error); /* * Don't allow pipe_mindirect to be set so low that we violate * atomicity requirements. */ if (tmp_pipe_mindirect <= PIPE_BUF) return (EINVAL); pipe_mindirect = tmp_pipe_mindirect; return (0); } SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW, &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L", "Minimum write size triggering VM optimization"); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. 
We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); pipe_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elsewhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static int pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; int error; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; pp->pp_owner = crhold(td->td_ucred); knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* * Only the forward direction pipe is backed by big buffer by * default. */ error = pipe_create(rpipe, true); if (error != 0) goto fail; error = pipe_create(wpipe, false); if (error != 0) { /* * This cleanup leaves the pipe inode number for rpipe * still allocated, but never used. We do not free * inode numbers for opened pipes, which is required * for correctness because numbers must be unique. * But also it avoids any memory use by the unr * allocator, so stashing away the transient inode * number is reasonable. */ pipe_free_kmem(rpipe); goto fail; } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; return (0); fail: knlist_destroy(&rpipe->pipe_sel.si_note); knlist_destroy(&wpipe->pipe_sel.si_note); crfree(pp->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, pp); return (error); } int pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; int error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED; *ppipe = &pp->pp_rpipe; return (0); } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } } /* * Get a timestamp. * * This used to be vfs_timestamp but the higher precision is unnecessary and * can very negatively affect performance in virtualized environments (e.g., on * vms running on amd64 when using the rdtscp instruction).
*/ static void pipe_timestamp(struct timespec *tsp) { getnanotime(tsp); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. 
*/ static int pipespace_new(struct pipe *cpipe, int size) { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); if (!chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, size, lim_cur(curthread, RLIMIT_PIPEBUF))) { if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; goto retry; } return (ENOMEM); } vm_map_lock(pipe_map); if (priv_check(curthread, PRIV_PIPEBUF) != 0 && maxpipekva / 100 * (100 - pipebuf_reserv) < amountpipekva + size) { vm_map_unlock(pipe_map); chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0); if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } return (ENOMEM); } error = vm_map_find_locked(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0); vm_map_unlock(pipe_map); if (error != KERN_SUCCESS) { chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0); if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. 
*/ static int pipespace(struct pipe *cpipe, int size) { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(struct pipe *cpipe, int catch) { int error, prio; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); prio = PRIBIO; if (catch) prio |= PCATCH; while (cpipe->pipe_state & PIPE_LOCKFL) { KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_waiters++; error = msleep(&cpipe->pipe_waiters, PIPE_MTX(cpipe), prio, "pipelk", 0); cpipe->pipe_waiters--; if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_waiters > 0) wakeup_one(&cpipe->pipe_waiters); } void pipeselwakeup(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(struct pipe *pipe, bool large_backing) { int error; error = pipespace_new(pipe, !large_backing || amountpipekva > maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE); if (error == 0) pipe->pipe_ino = alloc_unr64(&pipeino_unr); return (error); } /* ARGSUSED */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; /* * Try to avoid locking the pipe if we have nothing to do. * * There are programs which share one pipe amongst multiple processes * and perform non-blocking reads in parallel, even if the pipe is * empty. This in particular is the case with BSD make, which when * spawned with a high -j number can find itself with over half of the * calls failing to find anything. 
*/ if ((fp->f_flag & FNONBLOCK) != 0 && !mac_pipe_check_read_enabled()) { if (__predict_false(uio->uio_resid == 0)) return (0); if ((atomic_load_short(&rpipe->pipe_state) & PIPE_EOF) == 0 && atomic_load_int(&rpipe->pipe_buffer.cnt) == 0 && atomic_load_int(&rpipe->pipe_pages.cnt) == 0) return (EAGAIN); } PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 && rpipe->pipe_buffer.size > SMALL_PIPE_SIZE && rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_pages.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_pages.ms, rpipe->pipe_pages.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_pages.pos += size; rpipe->pipe_pages.cnt -= size; if (rpipe->pipe_pages.cnt == 0) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) pipe_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } /* * Only wake up writers if there was actually something read. 
* Otherwise, when calling read(2) at EOF, a spurious wakeup occurs. */ if (nread > 0 && rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); if (nread > 0) td->td_ru.ru_msgrcv++; return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_pages.ms, PIPENPAGES); PIPE_LOCK(wpipe); if (i < 0) { wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); } wpipe->pipe_pages.npages = i; wpipe->pipe_pages.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_pages.cnt = size; uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; } uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * Unwire the process buffer. */ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages); wpipe->pipe_pages.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(struct pipe *wpipe) { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); size = wpipe->pipe_pages.cnt; pos = wpipe->pipe_pages.pos; wpipe->pipe_pages.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
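 *
 * Roughly, the sequence implemented below is:
 *
 *	pipe_build_write_buffer()	wire the user pages, set PIPE_DIRECTW
 *	msleep "pipdwt"			wait for the reader to drain pipe_pages
 *	pipe_destroy_write_buffer()	unwire the pages once the count is 0
 *
 * and if the writer is interrupted by a signal first,
 * pipe_clone_write_buffer() copies whatever is left into the ordinary
 * buffer so the wired pages can be released without losing data.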
*/ static int pipe_direct_write(struct pipe *wpipe, struct uio *uio) { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } error = pipe_build_write_buffer(wpipe, uio); if (error) { goto error1; } while (wpipe->pipe_pages.cnt != 0 && (wpipe->pipe_state & PIPE_EOF) == 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); if (error != 0) break; } if ((wpipe->pipe_state & PIPE_EOF) != 0) { wpipe->pipe_pages.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); error = EPIPE; } else if (error == EINTR || error == ERESTART) { pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *wpipe, *rpipe; ssize_t orig_resid; int desiredsize, error; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if (amountpipekva > (3 * maxpipekva) / 4 && wpipe->pipe_buffer.size > SMALL_PIPE_SIZE && wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if (desiredsize != wpipe->pipe_buffer.size && (wpipe->pipe_state & PIPE_DIRECTW) == 0) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } MPASS(wpipe->pipe_buffer.size != 0); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. 
* If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= pipe_mindirect && wpipe->pipe_buffer.size >= pipe_mindirect && (fp->f_flag & FNONBLOCK) == 0) { error = pipe_direct_write(wpipe, uio); if (error != 0) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_pages.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } if (error != 0) break; continue; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. 
* EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) pipe_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); if (uio->uio_resid != orig_resid) td->td_ru.ru_msgsnd++; return (error); } /* ARGSUSED */ static int pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_pages.cnt != 0) *(int *)data = mpipe->pipe_pages.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. 
*/ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents && fp->f_pipegen == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { if (fp->f_flag & FREAD) revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { /* * Add ourselves regardless of eventmask as we have to return * POLLHUP even if it was not asked for. */ if ((fp->f_flag & FREAD) != 0) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if ((fp->f_flag & FWRITE) != 0 && wpipe->pipe_present == PIPE_ACTIVE) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct pipe *pipe; #ifdef MAC int error; #endif pipe = fp->f_data; #ifdef MAC if (mac_pipe_check_stat_enabled()) { PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) { return (error); } } #endif /* For named pipes ask the underlying filesystem. */ if (pipe->pipe_type & PIPE_TYPE_NAMED) { return (vnops.fo_stat(fp, ub, active_cred)); } bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_pages.cnt != 0) ub->st_size = pipe->pipe_pages.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. 
*/ return (0); } /* ARGSUSED */ static int pipe_close(struct file *fp, struct thread *td) { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; kif->kf_un.kf_pipe.kf_pipe_buffer_in = pi->pipe_buffer.in; kif->kf_un.kf_pipe.kf_pipe_buffer_out = pi->pipe_buffer.out; kif->kf_un.kf_pipe.kf_pipe_buffer_size = pi->pipe_buffer.size; return (0); } static void pipe_free_kmem(struct pipe *cpipe) { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -cpipe->pipe_buffer.size, 0); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_pages.cnt = 0; cpipe->pipe_pages.pos = 0; cpipe->pipe_pages.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(struct pipe *cpipe) { #ifdef MAC struct pipepair *pp; #endif struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); #ifdef MAC pp = cpipe->pipe_pair; #endif /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } pipeselwakeup(cpipe); /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); pipeselwakeup(ppipe); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. 
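 *
 * Whichever endpoint reaches PIPE_FINALIZED second does the final
 * teardown: it drops the pp_owner credential reference, destroys the
 * MAC label (if compiled in) and returns the pipepair to pipe_zone.
 * The first endpoint to get here simply unlocks and leaves the pair
 * in place for its peer.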
*/ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); crfree(cpipe->pipe_pair->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: if ((cpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_UNLOCK(cpipe); return (vnops.fo_kqfilter(fp, kn)); } PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct pipe *rpipe = kn->kn_hook; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_pages.cnt; if ((rpipe->pipe_state & PIPE_EOF) != 0 && ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 || fp->f_pipegen != rpipe->pipe_wgen)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe = kn->kn_hook; /* * If this end of the pipe is closed, the knote was removed from the * knlist and the list lock (i.e., the pipe lock) is therefore not held. */ if (wpipe->pipe_present == PIPE_ACTIVE || (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_state & PIPE_DIRECTW) { kn->kn_data = 0; } else if (wpipe->pipe_buffer.size > 0) { kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; } else { kn->kn_data = PIPE_BUF; } } if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index e8e0efd5bb00..dbf8e579530f 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -1,571 +1,571 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? 
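 *
 * A rough usage sketch of the semantics above (illustrative only):
 *
 *	int pd;
 *	pid_t pid = pdfork(&pd, 0);
 *	if (pid == 0) {
 *		...			child runs here
 *		_exit(0);
 *	}
 *	pdkill(pd, SIGTERM);		deliver a signal via the descriptor
 *	pdgetpid(pd, &pid);		recover the pid if it is needed
 *	close(pd);			last close kills and detaches the
 *					child unless PD_DAEMON was given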
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); MALLOC_DEFINE(M_PROCDESC, "procdesc", "process descriptors"); static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static fo_cmp_t procdesc_cmp; -static struct fileops procdesc_ops = { +static const struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_cmp = procdesc_cmp, .fo_flags = DFLAG_PASSABLE, }; /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { pid_t pid; int error; AUDIT_ARG_FD(uap->fd); error = kern_pdgetpid(td, uap->fd, &cap_pdgetpid_rights, &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. */ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = malloc(sizeof(*pd), M_PROCDESC, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process decriptor for the process that refers to it. 
*/ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descript is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); free(pd, M_PROCDESC); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == p->p_reaper, ("procdesc_exit: closed && parent not reaper")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * its reaper clean up the mess; if not, we have to clean up the zombie * ourselves. */ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. 
This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to its reaper (usually init(8)) so * that there's someone to pick up the pieces; finally, * terminate with prejudice. */ p->p_sigparent = SIGCHLD; if ((p->p_flag & P_TRACED) == 0) { proc_reparent(p, p->p_reaper, true); } else { proc_clear_orphan(p); p->p_oppid = p->p_reaper->p_pid; proc_add_orphan(p, p->p_reaper); } if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } -static struct filterops procdesc_kqops = { +static const struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. 
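 *
 * p_start is recorded relative to boot, hence the boottime
 * addition below before converting it to a timespec.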
*/ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } static int procdesc_cmp(struct file *fp1, struct file *fp2, struct thread *td) { struct procdesc *pdp1, *pdp2; if (fp2->f_type != DTYPE_PROCDESC) return (3); pdp1 = fp1->f_data; pdp2 = fp2->f_data; return (kcmp_cmp((uintptr_t)pdp1->pd_pid, (uintptr_t)pdp2->pd_pid)); } diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index 58891b0de000..ca7ead961e68 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -1,855 +1,855 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_chmod_t soo_chmod; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); -struct fileops socketops = { +const struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = soo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sousrsend(so, NULL, uio, NULL, 0, NULL); return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags |= SB_ASYNC; so->sol_sbsnd_flags |= SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags |= SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); so->so_snd.sb_flags |= SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags &= ~SB_ASYNC; so->sol_sbsnd_flags &= ~SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); so->so_snd.sb_flags &= ~SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } break; case FIONREAD: SOCK_RECVBUF_LOCK(so); if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; } SOCK_RECVBUF_UNLOCK(so); break; case FIONWRITE: /* Unlocked read. 
*/ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_snd); } break; case FIONSPACE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) { *(int *)data = 0; } else { *(int *)data = sbspace(&so->so_snd); } } break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; } break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = so->so_proto->pr_control(so, cmd, data, 0, td); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct socket *so = fp->f_data; int error = 0; bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif SOCK_LOCK(so); if (!SOLISTENING(so)) { struct sockbuf *sb; /* * If SBS_CANTRCVMORE is set, but there's still data left * in the receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCK_RECVBUF_LOCK(so); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCK_RECVBUF_UNLOCK(so); sb = &so->so_snd; SOCK_SENDBUF_LOCK(so); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCK_SENDBUF_UNLOCK(so); } ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; if (so->so_proto->pr_sense) error = so->so_proto->pr_sense(so, ub); SOCK_UNLOCK(so); return (error); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. 
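 *
 * (Pending socket AIO, for example, takes its own reference via
 * soref() in sowakeup_aio() and only releases it from
 * soaio_process_sb(), so the socket can outlive its struct file.)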
*/ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_chmod(struct file *fp, mode_t mode, struct ucred *cred, struct thread *td) { struct socket *so; int error; so = fp->f_data; if (so->so_proto->pr_chmod != NULL) error = so->so_proto->pr_chmod(so, mode, cred, td); else error = EINVAL; return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; CURVNET_SET(so->so_vnet); kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; } kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = so->so_proto->pr_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); CURVNET_RESTORE(); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. 
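 *
 * soaio_process_job() adds the bytes moved by each soreceive() or
 * sousrsend() attempt to this counter, so a request that is retried
 * after a partial transfer finally completes with the total amount
 * moved rather than just the last attempt's.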
*/ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. 
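 *
 * Taken together, the checks above grow the pool unconditionally up
 * to soaio_target_procs, never beyond soaio_max_procs, and in
 * between only while there are more queued jobs than idle or
 * starting processes available to run them.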
*/ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, sb_which which, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct sockbuf *sb = sobuf(so, which); #ifdef MAC struct file *fp = job->fd_file; #endif size_t cnt, done, job_total_nbytes __diagused; long ru_before; int error, flags; SOCK_BUF_UNLOCK(so, which); aio_switch_vmspace(job); td = curthread; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = job->aio_done; cnt = job->uiop->uio_resid; job->uiop->uio_offset = 0; job->uiop->uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. */ if (sb == &so->so_rcv) { ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, job->uiop, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sousrsend(so, NULL, job->uiop, NULL, flags, job->userproc); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; } done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. 
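 *
 * Summarised as a decision table:
 *
 *	partial progress, non-blocking socket	finish with the partial count
 *	otherwise, socket ready again		retry the transfer at once
 *	otherwise				requeue at the head and wait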
*/ MPASS(done != job_total_nbytes); SOCK_BUF_LOCK(so, which); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCK_BUF_UNLOCK(so, which); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCK_BUF_UNLOCK(so, which); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCK_BUF_LOCK(so, which); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCK_BUF_UNLOCK(so, which); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCK_BUF_LOCK(so, which); } static void soaio_process_sb(struct socket *so, sb_which which) { struct kaiocb *job; struct sockbuf *sb = sobuf(so, which); CURVNET_SET(so->so_vnet); SOCK_BUF_LOCK(so, which); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, which, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. */ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCK_BUF_UNLOCK(so, which); sorele(so); CURVNET_RESTORE(); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_RCV); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_SND); } void sowakeup_aio(struct socket *so, sb_which which) { struct sockbuf *sb = sobuf(so, which); SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; soref(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; sb_which which; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode & LIO_READ) { sb = &so->so_rcv; which = SO_RCV; } else { MPASS(opcode & LIO_WRITE); sb = &so->so_snd; which = SO_SND; } SOCK_BUF_LOCK(so, which); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCK_BUF_UNLOCK(so, which); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; sb_which which; int error; so = fp->f_data; error = so->so_proto->pr_aio_queue(so, job); if (error == 0) return (0); /* Lock through the socket, since this may be a listening socket. 
*/ switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { case LIO_READ: SOCK_RECVBUF_LOCK(so); sb = &so->so_rcv; which = SO_RCV; break; case LIO_WRITE: SOCK_SENDBUF_LOCK(so); sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } if (SOLISTENING(so)) { SOCK_BUF_UNLOCK(so, which); return (EINVAL); } if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, which); else sb->sb_flags |= SB_AIO; } SOCK_BUF_UNLOCK(so, which); return (0); } diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c index 8ac5b845f7ac..ab7e048a2ab1 100644 --- a/sys/kern/sys_timerfd.c +++ b/sys/kern/sys_timerfd.c @@ -1,609 +1,609 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Dmitry Chagin * Copyright (c) 2023 Jake Freeland * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures"); static struct mtx timerfd_list_lock; static LIST_HEAD(, timerfd) timerfd_list; MTX_SYSINIT(timerfd, &timerfd_list_lock, "timerfd_list_lock", MTX_DEF); static struct unrhdr64 tfdino_unr; #define TFD_NOJUMP 0 /* Realtime clock has not jumped. */ #define TFD_READ 1 /* Jumped, tfd has been read since. */ #define TFD_ZREAD 2 /* Jumped backwards, CANCEL_ON_SET=false. */ #define TFD_CANCELED 4 /* Jumped, CANCEL_ON_SET=true. */ #define TFD_JUMPED (TFD_ZREAD | TFD_CANCELED) /* * One structure allocated per timerfd descriptor. * * Locking semantics: * (t) locked by tfd_lock mtx * (l) locked by timerfd_list_lock sx * (c) const until freeing */ struct timerfd { /* User specified. */ struct itimerspec tfd_time; /* (t) tfd timer */ clockid_t tfd_clockid; /* (c) timing base */ int tfd_flags; /* (c) creation flags */ int tfd_timflags; /* (t) timer flags */ /* Used internally. 
	 */
	timerfd_t	tfd_count;	/* (t) expiration count since read */
	bool		tfd_expired;	/* (t) true upon initial expiration */
	struct mtx	tfd_lock;	/* tfd mtx lock */
	struct callout	tfd_callout;	/* (t) expiration notification */
	struct selinfo	tfd_sel;	/* (t) I/O alerts */
	struct timespec	tfd_boottim;	/* (t) cached boottime */
	int		tfd_jumped;	/* (t) timer jump status */
	LIST_ENTRY(timerfd) entry;	/* (l) entry in list */

	/* For stat(2). */
	ino_t		tfd_ino;	/* (c) inode number */
	struct timespec	tfd_atim;	/* (t) time of last read */
	struct timespec	tfd_mtim;	/* (t) time of last settime */
	struct timespec	tfd_birthtim;	/* (c) creation time */
};

static void
timerfd_init(void *data)
{
	new_unrhdr64(&tfdino_unr, 1);
}
SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);

static inline void
timerfd_getboottime(struct timespec *ts)
{
	struct timeval tv;

	getboottime(&tv);
	TIMEVAL_TO_TIMESPEC(&tv, ts);
}

/*
 * Call when a discontinuous jump has occurred in CLOCK_REALTIME and
 * update timerfd's cached boottime. A jump can be triggered using
 * functions like clock_settime(2) or settimeofday(2).
 *
 * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
 * and the realtime clock jumps.
 * Timer is marked TFD_ZREAD if TFD_TIMER_CANCEL_ON_SET is not set,
 * but the realtime clock jumps backwards.
 */
void
timerfd_jumped(void)
{
	struct timerfd *tfd;
	struct timespec boottime, diff;

	if (LIST_EMPTY(&timerfd_list))
		return;

	timerfd_getboottime(&boottime);
	mtx_lock(&timerfd_list_lock);
	LIST_FOREACH(tfd, &timerfd_list, entry) {
		mtx_lock(&tfd->tfd_lock);
		if (tfd->tfd_clockid != CLOCK_REALTIME ||
		    (tfd->tfd_timflags & TFD_TIMER_ABSTIME) == 0 ||
		    timespeccmp(&boottime, &tfd->tfd_boottim, ==)) {
			mtx_unlock(&tfd->tfd_lock);
			continue;
		}

		if (callout_active(&tfd->tfd_callout)) {
			if ((tfd->tfd_timflags & TFD_TIMER_CANCEL_ON_SET) != 0)
				tfd->tfd_jumped = TFD_CANCELED;
			else if (timespeccmp(&boottime, &tfd->tfd_boottim, <))
				tfd->tfd_jumped = TFD_ZREAD;

			/*
			 * Do not reschedule callout when
			 * inside interval time loop.
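As a hedged illustration of the TFD_CANCELED path above (an editorial aside; it assumes the timerfd interface declared in <sys/timerfd.h>, and the function name is illustrative): an absolute CLOCK_REALTIME timer armed with TFD_TIMER_CANCEL_ON_SET turns a step of the wall clock into an ECANCELED error on the pending read.

/*
 * Hedged sketch: if clock_settime(2) or settimeofday(2) steps
 * CLOCK_REALTIME before this absolute timer fires, read(2) fails with
 * ECANCELED (the TFD_CANCELED case handled by timerfd_jumped()).
 */
#include <sys/timerfd.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int
wait_until(time_t when)
{
	struct itimerspec its = { .it_value.tv_sec = when };
	uint64_t expirations;
	int fd;

	fd = timerfd_create(CLOCK_REALTIME, 0);
	if (fd == -1)
		return (-1);
	if (timerfd_settime(fd,
	    TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET, &its, NULL) == -1)
		return (-1);
	if (read(fd, &expirations, sizeof(expirations)) == -1 &&
	    errno == ECANCELED)
		fprintf(stderr, "CLOCK_REALTIME was stepped; timer canceled\n");
	return (close(fd));
}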
*/ if (!tfd->tfd_expired) { timespecsub(&boottime, &tfd->tfd_boottim, &diff); timespecsub(&tfd->tfd_time.it_value, &diff, &tfd->tfd_time.it_value); if (callout_stop(&tfd->tfd_callout) == 1) { callout_schedule_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, C_ABSOLUTE); } } } tfd->tfd_boottim = boottime; mtx_unlock(&tfd->tfd_lock); } mtx_unlock(&timerfd_list_lock); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd = fp->f_data; timerfd_t count; int error = 0; if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); mtx_lock(&tfd->tfd_lock); retry: getnanotime(&tfd->tfd_atim); if ((tfd->tfd_jumped & TFD_JUMPED) != 0) { if (tfd->tfd_jumped == TFD_CANCELED) error = ECANCELED; tfd->tfd_jumped = TFD_READ; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (error); } else { tfd->tfd_jumped = TFD_NOJUMP; } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "tfdrd", 0); if (error == 0) { goto retry; } else { mtx_unlock(&tfd->tfd_lock); return (error); } } count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); return (error); } static int timerfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIOASYNC: if (*(int *)data != 0) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); return (0); case FIONBIO: if (*(int *)data != 0) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); return (0); } return (ENOTTY); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd = fp->f_data; int revents = 0; mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && tfd->tfd_count > 0 && tfd->tfd_jumped != TFD_READ) revents |= events & (POLLIN | POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; mtx_assert(&tfd->tfd_lock, MA_OWNED); kn->kn_data = (int64_t)tfd->tfd_count; return (tfd->tfd_count > 0); } -static struct filterops timerfd_rfiltops = { +static const struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread, }; static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd = fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &timerfd_rfiltops; kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static int timerfd_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct timerfd *tfd = fp->f_data; bzero(sb, sizeof(*sb)); sb->st_nlink = fp->f_count - 1; sb->st_uid = fp->f_cred->cr_uid; sb->st_gid = fp->f_cred->cr_gid; sb->st_blksize = PAGE_SIZE; mtx_lock(&tfd->tfd_lock); sb->st_atim = tfd->tfd_atim; sb->st_mtim = tfd->tfd_mtim; mtx_unlock(&tfd->tfd_lock); sb->st_ctim = sb->st_mtim; sb->st_ino = tfd->tfd_ino; sb->st_birthtim = tfd->tfd_birthtim; return (0); } static int timerfd_close(struct file 
*fp, struct thread *td) { struct timerfd *tfd = fp->f_data; mtx_lock(&timerfd_list_lock); LIST_REMOVE(tfd, entry); mtx_unlock(&timerfd_list_lock); callout_drain(&tfd->tfd_callout); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); mtx_destroy(&tfd->tfd_lock); free(tfd, M_TIMERFD); fp->f_ops = &badfileops; return (0); } static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct timerfd *tfd = fp->f_data; kif->kf_type = KF_TYPE_TIMERFD; kif->kf_un.kf_timerfd.kf_timerfd_clockid = tfd->tfd_clockid; kif->kf_un.kf_timerfd.kf_timerfd_flags = tfd->tfd_flags; kif->kf_un.kf_timerfd.kf_timerfd_addr = (uintptr_t)tfd; return (0); } -static struct fileops timerfdops = { +static const struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = timerfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; static void timerfd_curval(struct timerfd *tfd, struct itimerspec *old_value) { struct timespec curr_value; mtx_assert(&tfd->tfd_lock, MA_OWNED); *old_value = tfd->tfd_time; if (timespecisset(&tfd->tfd_time.it_value)) { nanouptime(&curr_value); timespecsub(&tfd->tfd_time.it_value, &curr_value, &old_value->it_value); } } static void timerfd_expire(void *arg) { struct timerfd *tfd = (struct timerfd *)arg; struct timespec uptime; ++tfd->tfd_count; tfd->tfd_expired = true; if (timespecisset(&tfd->tfd_time.it_interval)) { /* Count missed events. */ nanouptime(&uptime); if (timespeccmp(&uptime, &tfd->tfd_time.it_value, >)) { timespecsub(&uptime, &tfd->tfd_time.it_value, &uptime); tfd->tfd_count += tstosbt(uptime) / tstosbt(tfd->tfd_time.it_interval); } timespecadd(&tfd->tfd_time.it_value, &tfd->tfd_time.it_interval, &tfd->tfd_time.it_value); callout_schedule_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, C_ABSOLUTE); } else { /* Single shot timer. 
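A worked example of the missed-expiration accounting in timerfd_expire() above (a hedged reading of the code, not a statement made by the change itself): with it_interval = 10 ms, if the callout runs 35 ms after the scheduled it_value, the handler adds 1 for the expiration itself plus 35 / 10 = 3 catch-up periods from the integer tstosbt() division, so tfd_count grows by 4 before it_value is advanced by a single interval and the callout is rescheduled.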
*/ callout_deactivate(&tfd->tfd_callout); timespecclear(&tfd->tfd_time.it_value); } wakeup(&tfd->tfd_count); selwakeup(&tfd->tfd_sel); KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); } int kern_timerfd_create(struct thread *td, int clockid, int flags) { struct file *fp; struct timerfd *tfd; int error, fd, fflags; AUDIT_ARG_VALUE(clockid); AUDIT_ARG_FFLAGS(flags); switch (clockid) { case CLOCK_REALTIME: /* FALLTHROUGH */ case CLOCK_MONOTONIC: /* FALLTHROUGH */ case CLOCK_UPTIME: /* * CLOCK_BOOTTIME should be added once different from * CLOCK_UPTIME */ break; default: return (EINVAL); } if ((flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) != 0) return (EINVAL); fflags = FREAD; if ((flags & TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; if ((flags & TFD_NONBLOCK) != 0) fflags |= FNONBLOCK; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_TIMERFD, M_WAITOK | M_ZERO); tfd->tfd_clockid = (clockid_t)clockid; tfd->tfd_flags = flags; tfd->tfd_ino = alloc_unr64(&tfdino_unr); mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); timerfd_getboottime(&tfd->tfd_boottim); getnanotime(&tfd->tfd_birthtim); mtx_lock(&timerfd_list_lock); LIST_INSERT_HEAD(&timerfd_list, tfd, entry); mtx_unlock(&timerfd_list_lock); finit(fp, fflags, DTYPE_TIMERFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } int kern_timerfd_gettime(struct thread *td, int fd, struct itimerspec *curr_value) { struct file *fp; struct timerfd *tfd; int error; error = fget(td, fd, &cap_write_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_TIMERFD) { fdrop(fp, td); return (EINVAL); } tfd = fp->f_data; mtx_lock(&tfd->tfd_lock); timerfd_curval(tfd, curr_value); mtx_unlock(&tfd->tfd_lock); fdrop(fp, td); return (0); } int kern_timerfd_settime(struct thread *td, int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { struct file *fp; struct timerfd *tfd; struct timespec ts; int error = 0; if ((flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) != 0) return (EINVAL); if (!timespecvalid_interval(&new_value->it_value) || !timespecvalid_interval(&new_value->it_interval)) return (EINVAL); error = fget(td, fd, &cap_write_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_TIMERFD) { fdrop(fp, td); return (EINVAL); } tfd = fp->f_data; mtx_lock(&tfd->tfd_lock); getnanotime(&tfd->tfd_mtim); tfd->tfd_timflags = flags; /* Store old itimerspec, if applicable. */ if (old_value != NULL) timerfd_curval(tfd, old_value); /* Set new expiration. */ tfd->tfd_time = *new_value; if (timespecisset(&tfd->tfd_time.it_value)) { if ((flags & TFD_TIMER_ABSTIME) == 0) { nanouptime(&ts); timespecadd(&tfd->tfd_time.it_value, &ts, &tfd->tfd_time.it_value); } else if (tfd->tfd_clockid == CLOCK_REALTIME) { /* ECANCELED if unread jump is pending. */ if (tfd->tfd_jumped == TFD_CANCELED) error = ECANCELED; /* Convert from CLOCK_REALTIME to CLOCK_BOOTTIME. 
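The syscall-facing side of the code above is easiest to see from userspace. A hedged sketch of a periodic timer whose read(2) reports the number of expirations accumulated since the previous read (the tfd_count handling above); the function name is illustrative:

/*
 * Hedged sketch: a 10 ms periodic timer on CLOCK_MONOTONIC.  Each read(2)
 * blocks until at least one expiration is pending and returns the count.
 */
#include <sys/timerfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
tick_loop(void)
{
	struct itimerspec its = {
		.it_value.tv_nsec = 10000000,		/* first fire in 10 ms */
		.it_interval.tv_nsec = 10000000,	/* then every 10 ms */
	};
	uint64_t expirations;
	int fd, i;

	fd = timerfd_create(CLOCK_MONOTONIC, 0);
	if (fd == -1)
		return (-1);
	if (timerfd_settime(fd, 0, &its, NULL) == -1)
		return (-1);
	for (i = 0; i < 100; i++) {
		if (read(fd, &expirations, sizeof(expirations)) !=
		    sizeof(expirations))
			return (-1);
		printf("%ju expiration(s)\n", (uintmax_t)expirations);
	}
	return (close(fd));
}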
*/ timespecsub(&tfd->tfd_time.it_value, &tfd->tfd_boottim, &tfd->tfd_time.it_value); } callout_reset_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, timerfd_expire, tfd, C_ABSOLUTE); } else { callout_stop(&tfd->tfd_callout); } tfd->tfd_count = 0; tfd->tfd_expired = false; tfd->tfd_jumped = TFD_NOJUMP; mtx_unlock(&tfd->tfd_lock); fdrop(fp, td); return (error); } int sys_timerfd_create(struct thread *td, struct timerfd_create_args *uap) { return (kern_timerfd_create(td, uap->clockid, uap->flags)); } int sys_timerfd_gettime(struct thread *td, struct timerfd_gettime_args *uap) { struct itimerspec curr_value; int error; error = kern_timerfd_gettime(td, uap->fd, &curr_value); if (error == 0) error = copyout(&curr_value, uap->curr_value, sizeof(curr_value)); return (error); } int sys_timerfd_settime(struct thread *td, struct timerfd_settime_args *uap) { struct itimerspec new_value, old_value; int error; error = copyin(uap->new_value, &new_value, sizeof(new_value)); if (error != 0) return (error); if (uap->old_value == NULL) { error = kern_timerfd_settime(td, uap->fd, uap->flags, &new_value, NULL); } else { error = kern_timerfd_settime(td, uap->fd, uap->flags, &new_value, &old_value); if (error == 0) error = copyout(&old_value, uap->old_value, sizeof(old_value)); } return (error); } diff --git a/sys/kern/tty.c b/sys/kern/tty.c index b6e300321e9c..b1b3b268d0e9 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -1,2487 +1,2487 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_capsicum.h" #include "opt_printf.h" #include #include #include #include #include #include #include #include #ifdef COMPAT_43TTY #include #endif /* COMPAT_43TTY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include #include static MALLOC_DEFINE(M_TTY, "tty", "tty device"); static void tty_rel_free(struct tty *tp); static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list); static struct sx tty_list_sx; SX_SYSINIT(tty_list, &tty_list_sx, "tty list"); static unsigned int tty_list_count = 0; /* Character device of /dev/console. */ static struct cdev *dev_console; static const char *dev_console_filename; /* * Flags that are supported and stored by this implementation. */ #define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|INLCR|\ IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL|IUTF8) #define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET) #define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\ FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW|CNO_RTSDTR) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) static int tty_drainwait = 5 * 60; SYSCTL_INT(_kern, OID_AUTO, tty_drainwait, CTLFLAG_RWTUN, &tty_drainwait, 0, "Default output drain timeout in seconds"); /* * Set TTY buffer sizes. */ #define TTYBUF_MAX 65536 #ifdef PRINTF_BUFR_SIZE #define TTY_PRBUF_SIZE PRINTF_BUFR_SIZE #else #define TTY_PRBUF_SIZE 256 #endif /* * Allocate buffer space if necessary, and set low watermarks, based on speed. * Note that the ttyxxxq_setsize() functions may drop and then reacquire the tty * lock during memory allocation. They will return ENXIO if the tty disappears * while unlocked. */ static int tty_watermarks(struct tty *tp) { size_t bs = 0; int error; /* Provide an input buffer for 2 seconds of data. */ if (tp->t_termios.c_cflag & CREAD) bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX); error = ttyinq_setsize(&tp->t_inq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10; /* Provide an output buffer for 2 seconds of data. */ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX); error = ttyoutq_setsize(&tp->t_outq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10; return (0); } static int tty_drain(struct tty *tp, int leaving) { sbintime_t timeout_at; size_t bytes; int error; if (ttyhook_hashook(tp, getc_inject)) /* buffer is inaccessible */ return (0); /* * For close(), use the recent historic timeout of "1 second without * making progress". For tcdrain(), use t_drainwait as the timeout, * with zero meaning "no timeout" which gives POSIX behavior. */ if (leaving) timeout_at = getsbinuptime() + SBT_1S; else if (tp->t_drainwait != 0) timeout_at = getsbinuptime() + SBT_1S * tp->t_drainwait; else timeout_at = 0; /* * Poll the output buffer and the hardware for completion, at 10 Hz. * Polling is required for devices which are not able to signal an * interrupt when the transmitter becomes idle (most USB serial devs). 
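To put numbers on the sizing in tty_watermarks() above (hedged: the ten-bits-per-character figure is the usual asynchronous-serial framing assumption, not something the code itself states): with c_ispeed = 115200 the line can deliver roughly 11520 characters per second, so bs = 115200 / 5 = 23040 bytes holds about two seconds of input and stays below the TTYBUF_MAX cap of 65536; t_inlow is then set to nine tenths of the allocated size, about 20736 bytes in this example.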
* The unusual structure of this loop ensures we check for busy one more * time after tty_timedwait() returns EWOULDBLOCK, so that success has * higher priority than timeout if the IO completed in the last 100mS. */ error = 0; bytes = ttyoutq_bytesused(&tp->t_outq); for (;;) { if (ttyoutq_bytesused(&tp->t_outq) == 0 && !ttydevsw_busy(tp)) return (0); if (error != 0) return (error); ttydevsw_outwakeup(tp); error = tty_timedwait(tp, &tp->t_outwait, hz / 10); if (error != 0 && error != EWOULDBLOCK) return (error); else if (timeout_at == 0 || getsbinuptime() < timeout_at) error = 0; else if (leaving && ttyoutq_bytesused(&tp->t_outq) < bytes) { /* In close, making progress, grant an extra second. */ error = 0; timeout_at += SBT_1S; bytes = ttyoutq_bytesused(&tp->t_outq); } } } /* * Though ttydev_enter() and ttydev_leave() seem to be related, they * don't have to be used together. ttydev_enter() is used by the cdev * operations to prevent an actual operation from being processed when * the TTY has been abandoned. ttydev_leave() is used by ttydev_open() * and ttydev_close() to determine whether per-TTY data should be * deallocated. */ static __inline int ttydev_enter(struct tty *tp) { tty_lock(tp); if (tty_gone(tp) || !tty_opened(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } return (0); } static void ttydev_leave(struct tty *tp) { tty_assert_locked(tp); if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) { /* Device is still opened somewhere. */ tty_unlock(tp); return; } tp->t_flags |= TF_OPENCLOSE; /* Remove console TTY. */ constty_clear(tp); /* Drain any output. */ if (!tty_gone(tp)) tty_drain(tp, 1); ttydisc_close(tp); /* Free i/o queues now since they might be large. */ ttyinq_free(&tp->t_inq); tp->t_inlow = 0; ttyoutq_free(&tp->t_outq); tp->t_outlow = 0; if (!tty_gone(tp)) ttydevsw_close(tp); tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); tty_rel_free(tp); } /* * Operations that are exposed through the character device in /dev. */ static int ttydev_open(struct cdev *dev, int oflags, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } /* * Block when other processes are currently opening or closing * the TTY. */ while (tp->t_flags & TF_OPENCLOSE) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) { tty_unlock(tp); return (error); } } tp->t_flags |= TF_OPENCLOSE; /* * Make sure the "tty" and "cua" device cannot be opened at the * same time. The console is a "tty" device. */ if (TTY_CALLOUT(tp, dev)) { if (tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) { error = EBUSY; goto done; } } else { if (tp->t_flags & TF_OPENED_OUT) { error = EBUSY; goto done; } } if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) { error = EBUSY; goto done; } if (!tty_opened(tp)) { /* Set proper termios flags. */ if (TTY_CALLOUT(tp, dev)) tp->t_termios = tp->t_termios_init_out; else tp->t_termios = tp->t_termios_init_in; ttydevsw_param(tp, &tp->t_termios); /* Prevent modem control on callout devices and /dev/console. */ if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; if ((tp->t_termios.c_cflag & CNO_RTSDTR) == 0) ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) goto done; ttydisc_open(tp); error = tty_watermarks(tp); if (error != 0) goto done; } /* Wait for Carrier Detect. 
*/ if ((oflags & O_NONBLOCK) == 0 && (tp->t_termios.c_cflag & CLOCAL) == 0) { while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) goto done; } } if (dev == dev_console) tp->t_flags |= TF_OPENED_CONS; else if (TTY_CALLOUT(tp, dev)) tp->t_flags |= TF_OPENED_OUT; else tp->t_flags |= TF_OPENED_IN; MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); done: tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (error); } static int ttydev_close(struct cdev *dev, int fflag, int devtype __unused, struct thread *td) { struct tty *tp = dev->si_drv1; tty_lock(tp); /* * Don't actually close the device if it is being used as the * console. */ MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); if (dev == dev_console) tp->t_flags &= ~TF_OPENED_CONS; else tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT); if (tp->t_flags & TF_OPENED) { tty_unlock(tp); return (0); } /* If revoking, flush output now to avoid draining it later. */ if ((fflag & FREVOKE) != 0) { tty_flush(tp, FWRITE); knlist_delete(&tp->t_inpoll.si_note, td, 1); knlist_delete(&tp->t_outpoll.si_note, td, 1); } tp->t_flags &= ~TF_EXCLUDE; /* Properly wake up threads that are stuck - revoke(). */ tp->t_revokecnt++; tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (0); } static __inline int tty_is_ctty(struct tty *tp, struct proc *p) { tty_assert_locked(tp); return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT); } int tty_wait_background(struct tty *tp, struct thread *td, int sig) { struct proc *p; struct pgrp *pg; ksiginfo_t ksi; int error; MPASS(sig == SIGTTIN || sig == SIGTTOU); tty_assert_locked(tp); p = td->td_proc; for (;;) { pg = p->p_pgrp; PGRP_LOCK(pg); PROC_LOCK(p); /* * pg may no longer be our process group. * Re-check after locking. */ if (p->p_pgrp != pg) { PROC_UNLOCK(p); PGRP_UNLOCK(pg); continue; } /* * The process should only sleep, when: * - This terminal is the controlling terminal * - Its process group is not the foreground process * group * - The parent process isn't waiting for the child to * exit * - the signal to send to the process isn't masked */ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) { /* Allow the action to happen. */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (0); } if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) || SIGISMEMBER(td->td_sigmask, sig)) { /* Only allow them in write()/ioctl(). */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (sig == SIGTTOU ? 0 : EIO); } if ((p->p_flag & P_PPWAIT) != 0 || (pg->pg_flags & PGRP_ORPHANED) != 0) { /* Don't allow the action to happen. */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (EIO); } PROC_UNLOCK(p); /* * Send the signal and sleep until we're the new * foreground process group. */ if (sig != 0) { ksiginfo_init(&ksi); ksi.ksi_code = SI_KERNEL; ksi.ksi_signo = sig; sig = 0; } pgsignal(pg, ksi.ksi_signo, 1, &ksi); PGRP_UNLOCK(pg); error = tty_wait(tp, &tp->t_bgwait); if (error) return (error); } } static int ttydev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) goto done; error = ttydisc_read(tp, uio, ioflag); tty_unlock(tp); /* * The read() call should not throw an error when the device is * being destroyed. Silently convert it to an EOF. 
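A small hedged illustration of the job-control rule enforced through tty_wait_background() above: writes from a background process only raise SIGTTOU when TOSTOP is set in the termios local flags, which a program can enable as follows (function name illustrative):

/*
 * Hedged sketch: enable TOSTOP so that background writers to this tty are
 * stopped with SIGTTOU (the tty_wait_background() path).
 */
#include <termios.h>

int
enable_tostop(int fd)
{
	struct termios t;

	if (tcgetattr(fd, &t) == -1)
		return (-1);
	t.c_lflag |= TOSTOP;
	return (tcsetattr(fd, TCSANOW, &t));
}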
*/ done: if (error == ENXIO) error = 0; return (error); } static int ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int defer, error; error = ttydev_enter(tp); if (error) return (error); if (tp->t_termios.c_lflag & TOSTOP) { error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) { /* Allow non-blocking writes to bypass serialization. */ error = ttydisc_write(tp, uio, ioflag); } else { /* Serialize write() calls. */ while (tp->t_flags & TF_BUSY_OUT) { error = tty_wait(tp, &tp->t_outserwait); if (error) goto done; } tp->t_flags |= TF_BUSY_OUT; defer = sigdeferstop(SIGDEFERSTOP_ERESTART); error = ttydisc_write(tp, uio, ioflag); sigallowstop(defer); tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } done: tty_unlock(tp); return (error); } static int ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if 0 case TIOCSDRAINWAIT: case TIOCSETD: #endif #ifdef COMPAT_43TTY case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif /* COMPAT_43TTY */ /* * If the ioctl() causes the TTY to be modified, let it * wait in the background. */ error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) { struct termios *old = &tp->t_termios; struct termios *new = (struct termios *)data; struct termios *lock = TTY_CALLOUT(tp, dev) ? &tp->t_termios_lock_out : &tp->t_termios_lock_in; int cc; /* * Lock state devices. Just overwrite the values of the * commands that are currently in use. */ new->c_iflag = (old->c_iflag & lock->c_iflag) | (new->c_iflag & ~lock->c_iflag); new->c_oflag = (old->c_oflag & lock->c_oflag) | (new->c_oflag & ~lock->c_oflag); new->c_cflag = (old->c_cflag & lock->c_cflag) | (new->c_cflag & ~lock->c_cflag); new->c_lflag = (old->c_lflag & lock->c_lflag) | (new->c_lflag & ~lock->c_lflag); for (cc = 0; cc < NCCS; ++cc) if (lock->c_cc[cc]) new->c_cc[cc] = old->c_cc[cc]; if (lock->c_ispeed) new->c_ispeed = old->c_ispeed; if (lock->c_ospeed) new->c_ospeed = old->c_ospeed; } error = tty_ioctl(tp, cmd, data, fflag, td); done: tty_unlock(tp); return (error); } static int ttydev_poll(struct cdev *dev, int events, struct thread *td) { struct tty *tp = dev->si_drv1; int error, revents = 0; error = ttydev_enter(tp); if (error) return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); if (events & (POLLIN|POLLRDNORM)) { /* See if we can read something. */ if (ttydisc_read_poll(tp) > 0) revents |= events & (POLLIN|POLLRDNORM); } if (tp->t_flags & TF_ZOMBIE) { /* Hangup flag on zombie state. */ revents |= POLLHUP; } else if (events & (POLLOUT|POLLWRNORM)) { /* See if we can write something. 
*/ if (ttydisc_write_poll(tp) > 0) revents |= events & (POLLOUT|POLLWRNORM); } if (revents == 0) { if (events & (POLLIN|POLLRDNORM)) selrecord(td, &tp->t_inpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &tp->t_outpoll); } tty_unlock(tp); return (revents); } static int ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct tty *tp = dev->si_drv1; int error; /* Handle mmap() through the driver. */ error = ttydev_enter(tp); if (error) return (-1); error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr); tty_unlock(tp); return (error); } /* * kqueue support. */ static void tty_kqops_read_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_inpoll.si_note, kn, 0); } static int tty_kqops_read_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_assert_locked(tp); if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_read_poll(tp); return (kn->kn_data > 0); } } static void tty_kqops_write_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_outpoll.si_note, kn, 0); } static int tty_kqops_write_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_assert_locked(tp); if (tty_gone(tp)) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_write_poll(tp); return (kn->kn_data > 0); } } -static struct filterops tty_kqops_read = { +static const struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, }; -static struct filterops tty_kqops_write = { +static const struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, .f_event = tty_kqops_write_event, }; static int ttydev_kqfilter(struct cdev *dev, struct knote *kn) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_read; knlist_add(&tp->t_inpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_write; knlist_add(&tp->t_outpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static struct cdevsw ttydev_cdevsw = { .d_version = D_VERSION, .d_open = ttydev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttydev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttydev", .d_flags = D_TTY, }; /* * Init/lock-state devices */ static int ttyil_open(struct cdev *dev, int oflags __unused, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) error = ENODEV; tty_unlock(tp); return (error); } static int ttyil_close(struct cdev *dev __unused, int flag __unused, int mode __unused, struct thread *td __unused) { return (0); } static int ttyil_rdwr(struct cdev *dev __unused, struct uio *uio __unused, int ioflag __unused) { return (ENODEV); } static int ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; tty_lock(tp); if (tty_gone(tp)) { error = ENODEV; goto done; } error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); if (error != ENOIOCTL) goto done; error = 0; switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). 
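The filterops above are what a userspace kevent(2) registration on a tty ends up using. A hedged sketch of waiting for input via EVFILT_READ, which reports the readable byte count computed by ttydisc_read_poll() in the event's data field (the function name and any device path are illustrative):

/*
 * Hedged sketch: register EVFILT_READ on a tty and block until the line
 * discipline has data; ev.data carries the byte count from
 * ttydisc_read_poll().
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>

int
wait_tty_readable(const char *path)
{
	struct kevent ev;
	int fd, kq;

	/* O_NONBLOCK: do not wait for carrier detect in ttydev_open(). */
	fd = open(path, O_RDONLY | O_NONBLOCK);
	if (fd == -1)
		return (-1);
	kq = kqueue();
	if (kq == -1)
		return (-1);
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)	/* block for input */
		return (-1);
	return ((int)ev.data);		/* readable bytes */
}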
*/ *(struct termios*)data = *(struct termios*)dev->si_drv2; break; case TIOCSETA: /* Set terminal flags through tcsetattr(). */ error = priv_check(td, PRIV_TTY_SETA); if (error) break; *(struct termios*)dev->si_drv2 = *(struct termios*)data; break; case TIOCGETD: *(int *)data = TTYDISC; break; case TIOCGWINSZ: bzero(data, sizeof(struct winsize)); break; default: error = ENOTTY; } done: tty_unlock(tp); return (error); } static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, .d_open = ttyil_open, .d_close = ttyil_close, .d_read = ttyil_rdwr, .d_write = ttyil_rdwr, .d_ioctl = ttyil_ioctl, .d_name = "ttyil", .d_flags = D_TTY, }; static void tty_init_termios(struct tty *tp) { struct termios *t = &tp->t_termios_init_in; t->c_cflag = TTYDEF_CFLAG; t->c_iflag = TTYDEF_IFLAG; t->c_lflag = TTYDEF_LFLAG; t->c_oflag = TTYDEF_OFLAG; t->c_ispeed = TTYDEF_SPEED; t->c_ospeed = TTYDEF_SPEED; memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars); tp->t_termios_init_out = *t; } void tty_init_console(struct tty *tp, speed_t s) { struct termios *ti = &tp->t_termios_init_in; struct termios *to = &tp->t_termios_init_out; if (s != 0) { ti->c_ispeed = ti->c_ospeed = s; to->c_ispeed = to->c_ospeed = s; } ti->c_cflag |= CLOCAL; to->c_cflag |= CLOCAL; } /* * Standard device routine implementations, mostly meant for * pseudo-terminal device drivers. When a driver creates a new terminal * device class, missing routines are patched. */ static int ttydevsw_defopen(struct tty *tp __unused) { return (0); } static void ttydevsw_defclose(struct tty *tp __unused) { } static void ttydevsw_defoutwakeup(struct tty *tp __unused) { panic("Terminal device has output, while not implemented"); } static void ttydevsw_definwakeup(struct tty *tp __unused) { } static int ttydevsw_defioctl(struct tty *tp __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defcioctl(struct tty *tp __unused, int unit __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defparam(struct tty *tp __unused, struct termios *t) { /* * Allow the baud rate to be adjusted for pseudo-devices, but at * least restrict it to 115200 to prevent excessive buffer * usage. Also disallow 0, to prevent foot shooting. */ if (t->c_ispeed < B50) t->c_ispeed = B50; else if (t->c_ispeed > B115200) t->c_ispeed = B115200; if (t->c_ospeed < B50) t->c_ospeed = B50; else if (t->c_ospeed > B115200) t->c_ospeed = B115200; t->c_cflag |= CREAD; return (0); } static int ttydevsw_defmodem(struct tty *tp __unused, int sigon __unused, int sigoff __unused) { /* Simulate a carrier to make the TTY layer happy. */ return (SER_DCD); } static int ttydevsw_defmmap(struct tty *tp __unused, vm_ooffset_t offset __unused, vm_paddr_t *paddr __unused, int nprot __unused, vm_memattr_t *memattr __unused) { return (-1); } static void ttydevsw_defpktnotify(struct tty *tp __unused, char event __unused) { } static void ttydevsw_deffree(void *softc __unused) { panic("Terminal device freed without a free-handler"); } static bool ttydevsw_defbusy(struct tty *tp __unused) { return (FALSE); } /* * TTY allocation and deallocation. TTY devices can be deallocated when * the driver doesn't use it anymore, when the TTY isn't a session's * controlling TTY and when the device node isn't opened through devfs. 
*/ struct tty * tty_alloc(struct ttydevsw *tsw, void *sc) { return (tty_alloc_mutex(tsw, sc, NULL)); } struct tty * tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) { struct tty *tp; /* Make sure the driver defines all routines. */ #define PATCH_FUNC(x) do { \ if (tsw->tsw_ ## x == NULL) \ tsw->tsw_ ## x = ttydevsw_def ## x; \ } while (0) PATCH_FUNC(open); PATCH_FUNC(close); PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); PATCH_FUNC(pktnotify); PATCH_FUNC(free); PATCH_FUNC(busy); #undef PATCH_FUNC tp = malloc(sizeof(struct tty) + TTY_PRBUF_SIZE, M_TTY, M_WAITOK | M_ZERO); tp->t_prbufsz = TTY_PRBUF_SIZE; tp->t_devsw = tsw; tp->t_devswsoftc = sc; tp->t_flags = tsw->tsw_flags; tp->t_drainwait = tty_drainwait; tty_init_termios(tp); cv_init(&tp->t_inwait, "ttyin"); cv_init(&tp->t_outwait, "ttyout"); cv_init(&tp->t_outserwait, "ttyosr"); cv_init(&tp->t_bgwait, "ttybg"); cv_init(&tp->t_dcdwait, "ttydcd"); /* Allow drivers to use a custom mutex to lock the TTY. */ if (mutex != NULL) { tp->t_mtx = mutex; } else { tp->t_mtx = &tp->t_mtxobj; mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF); } knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx); return (tp); } static void tty_dealloc(void *arg) { struct tty *tp = arg; /* * ttyydev_leave() usually frees the i/o queues earlier, but it is * not always called between queue allocation and here. The queues * may be allocated by ioctls on a pty control device without the * corresponding pty slave device ever being open, or after it is * closed. */ ttyinq_free(&tp->t_inq); ttyoutq_free(&tp->t_outq); seldrain(&tp->t_inpoll); seldrain(&tp->t_outpoll); knlist_clear(&tp->t_inpoll.si_note, 0); knlist_clear(&tp->t_outpoll.si_note, 0); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); cv_destroy(&tp->t_inwait); cv_destroy(&tp->t_outwait); cv_destroy(&tp->t_bgwait); cv_destroy(&tp->t_dcdwait); cv_destroy(&tp->t_outserwait); if (tp->t_mtx == &tp->t_mtxobj) mtx_destroy(&tp->t_mtxobj); ttydevsw_free(tp); free(tp, M_TTY); } static void tty_rel_free(struct tty *tp) { struct cdev *dev; tty_assert_locked(tp); #define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE) if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) { /* TTY is still in use. */ tty_unlock(tp); return; } /* Stop asynchronous I/O. */ funsetown(&tp->t_sigio); /* TTY can be deallocated. */ dev = tp->t_dev; tp->t_dev = NULL; tty_unlock(tp); if (dev != NULL) { sx_xlock(&tty_list_sx); TAILQ_REMOVE(&tty_list, tp, t_list); tty_list_count--; sx_xunlock(&tty_list_sx); destroy_dev_sched_cb(dev, tty_dealloc, tp); } } void tty_rel_pgrp(struct tty *tp, struct pgrp *pg) { MPASS(tp->t_sessioncnt > 0); tty_assert_locked(tp); if (tp->t_pgrp == pg) tp->t_pgrp = NULL; tty_unlock(tp); } void tty_rel_sess(struct tty *tp, struct session *sess) { MPASS(tp->t_sessioncnt > 0); /* Current session has left. */ if (tp->t_session == sess) { tp->t_session = NULL; MPASS(tp->t_pgrp == NULL); } tp->t_sessioncnt--; tty_rel_free(tp); } void tty_rel_gone(struct tty *tp) { tty_assert_locked(tp); MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ ttydisc_modem(tp, 0); /* Wake up all blocked threads. 
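For the driver-facing side of tty_alloc() and the device-node creation below, here is a hedged sketch of a minimal pseudo-driver; the mydev_* names are hypothetical, and the fields follow the struct ttydevsw usage visible in this file and in tty(9). It is a sketch under those assumptions, not a reference implementation.

/*
 * Hedged kernel-side sketch: allocate a tty, provide the two handlers
 * whose defaults panic (outwakeup and free), and expose it in /dev.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>

static void
mydev_outwakeup(struct tty *tp)
{
	char c;

	/* A real driver would hand each byte to its hardware FIFO. */
	while (ttydisc_getc(tp, &c, 1) == 1)
		;
}

static void
mydev_free(void *softc)
{
	free(softc, M_DEVBUF);	/* assumes softc was malloc(9)'d with M_DEVBUF */
}

static struct ttydevsw mydev_ttydevsw = {
	.tsw_flags	= TF_NOPREFIX,
	.tsw_outwakeup	= mydev_outwakeup,
	.tsw_free	= mydev_free,
};

static void
mydev_create(void *softc, int unit)
{
	struct tty *tp;

	tp = tty_alloc(&mydev_ttydevsw, softc);
	tty_makedev(tp, NULL, "mydev%d", unit);
}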
*/ tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); tp->t_flags |= TF_GONE; tty_rel_free(tp); } static int tty_drop_ctty(struct tty *tp, struct proc *p) { struct session *session; struct vnode *vp; /* * This looks terrible, but it's generally safe as long as the tty * hasn't gone away while we had the lock dropped. All of our sanity * checking that this operation is OK happens after we've picked it back * up, so other state changes are generally not fatal and the potential * for this particular operation to happen out-of-order in a * multithreaded scenario is likely a non-issue. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (tty_gone(tp)) { sx_xunlock(&proctree_lock); return (ENODEV); } /* * If the session doesn't have a controlling TTY, or if we weren't * invoked on the controlling TTY, we'll return ENOIOCTL as we've * historically done. */ session = p->p_session; if (session->s_ttyp == NULL || session->s_ttyp != tp) { sx_xunlock(&proctree_lock); return (ENOTTY); } if (!SESS_LEADER(p)) { sx_xunlock(&proctree_lock); return (EPERM); } PROC_LOCK(p); SESS_LOCK(session); vp = session->s_ttyvp; session->s_ttyp = NULL; session->s_ttyvp = NULL; session->s_ttydp = NULL; SESS_UNLOCK(session); tp->t_sessioncnt--; p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); sx_xunlock(&proctree_lock); /* * If we did have a vnode, release our reference. Ordinarily we manage * these at the devfs layer, but we can't necessarily know that we were * invoked on the vnode referenced in the session (i.e. the vnode we * hold a reference to). We explicitly don't check VBAD/VIRF_DOOMED here * to avoid a vnode leak -- in circumstances elsewhere where we'd hit a * VIRF_DOOMED vnode, release has been deferred until the controlling TTY * is either changed or released. */ if (vp != NULL) devfs_ctty_unref(vp); return (0); } /* * Exposing information about current TTY's through sysctl */ static void tty_to_xtty(struct tty *tp, struct xtty *xt) { tty_assert_locked(tp); memset(xt, 0, sizeof(*xt)); xt->xt_size = sizeof(struct xtty); xt->xt_insize = ttyinq_getsize(&tp->t_inq); xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq); xt->xt_inlc = ttyinq_bytesline(&tp->t_inq); xt->xt_inlow = tp->t_inlow; xt->xt_outsize = ttyoutq_getsize(&tp->t_outq); xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq); xt->xt_outlow = tp->t_outlow; xt->xt_column = tp->t_column; xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0; xt->xt_flags = tp->t_flags; xt->xt_dev = tp->t_dev ? 
dev2udev(tp->t_dev) : (uint32_t)NODEV; } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { unsigned long lsize; struct thread *td = curthread; struct xtty *xtlist, *xt; struct tty *tp; struct proc *p; int error; bool cansee; sx_slock(&tty_list_sx); lsize = tty_list_count * sizeof(struct xtty); if (lsize == 0) { sx_sunlock(&tty_list_sx); return (0); } xtlist = xt = malloc(lsize, M_TTY, M_WAITOK); TAILQ_FOREACH(tp, &tty_list, t_list) { tty_lock(tp); if (tp->t_session != NULL && (p = atomic_load_ptr(&tp->t_session->s_leader)) != NULL) { PROC_LOCK(p); cansee = (p_cansee(td, p) == 0); PROC_UNLOCK(p); } else { cansee = !jailed(td->td_ucred); } if (cansee) { tty_to_xtty(tp, xt); xt++; } tty_unlock(tp); } sx_sunlock(&tty_list_sx); lsize = (xt - xtlist) * sizeof(struct xtty); if (lsize > 0) { error = SYSCTL_OUT(req, xtlist, lsize); } else { error = 0; } free(xtlist, M_TTY); return (error); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs"); /* * Device node creation. Device has been set up, now we can expose it to * the user. */ int tty_makedevf(struct tty *tp, struct ucred *cred, int flags, const char *fmt, ...) { va_list ap; struct make_dev_args args; struct cdev *dev, *init, *lock, *cua, *cinit, *clock; const char *prefix = "tty"; char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */ uid_t uid; gid_t gid; mode_t mode; int error; /* Remove "tty" prefix from devices like PTY's. */ if (tp->t_flags & TF_NOPREFIX) prefix = ""; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); if (cred == NULL) { /* System device. */ uid = UID_ROOT; gid = GID_WHEEL; mode = S_IRUSR|S_IWUSR; } else { /* User device. */ uid = cred->cr_ruid; gid = GID_TTY; mode = S_IRUSR|S_IWUSR|S_IWGRP; } flags = flags & TTYMK_CLONING ? MAKEDEV_REF : 0; flags |= MAKEDEV_CHECKNAME; /* Master call-in device. */ make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_si_drv1 = tp; error = make_dev_s(&args, &dev, "%s%s", prefix, name); if (error != 0) return (error); tp->t_dev = dev; init = lock = cua = cinit = clock = NULL; /* Slave call-in devices. */ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_INIT; args.mda_si_drv1 = tp; args.mda_si_drv2 = &tp->t_termios_init_in; error = make_dev_s(&args, &init, "%s%s.init", prefix, name); if (error != 0) goto fail; dev_depends(dev, init); args.mda_unit = TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_in; error = make_dev_s(&args, &lock, "%s%s.lock", prefix, name); if (error != 0) goto fail; dev_depends(dev, lock); } /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0660; args.mda_unit = TTYUNIT_CALLOUT; args.mda_si_drv1 = tp; error = make_dev_s(&args, &cua, "cua%s", name); if (error != 0) goto fail; dev_depends(dev, cua); /* Slave call-out devices. 
*/ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_INIT; args.mda_si_drv2 = &tp->t_termios_init_out; error = make_dev_s(&args, &cinit, "cua%s.init", name); if (error != 0) goto fail; dev_depends(dev, cinit); args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_out; error = make_dev_s(&args, &clock, "cua%s.lock", name); if (error != 0) goto fail; dev_depends(dev, clock); } } sx_xlock(&tty_list_sx); TAILQ_INSERT_TAIL(&tty_list, tp, t_list); tty_list_count++; sx_xunlock(&tty_list_sx); return (0); fail: destroy_dev(dev); if (init) destroy_dev(init); if (lock) destroy_dev(lock); if (cinit) destroy_dev(cinit); if (clock) destroy_dev(clock); return (error); } /* * Signalling processes. */ void tty_signal_sessleader(struct tty *tp, int sig) { struct proc *p; struct session *s; tty_assert_locked(tp); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; /* * Load s_leader exactly once to avoid race where s_leader is * set to NULL by a concurrent invocation of killjobc() by the * session leader. Note that we are not holding t_session's * lock for the read. */ if ((s = tp->t_session) != NULL && (p = atomic_load_ptr(&s->s_leader)) != NULL) { PROC_LOCK(p); kern_psignal(p, sig); PROC_UNLOCK(p); } } void tty_signal_pgrp(struct tty *tp, int sig) { ksiginfo_t ksi; tty_assert_locked(tp); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO)) tty_info(tp); if (tp->t_pgrp != NULL) { ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, sig, 1, &ksi); PGRP_UNLOCK(tp->t_pgrp); } } void tty_wakeup(struct tty *tp, int flags) { if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (flags & FWRITE) { cv_broadcast(&tp->t_outwait); selwakeup(&tp->t_outpoll); KNOTE_LOCKED(&tp->t_outpoll.si_note, 0); } if (flags & FREAD) { cv_broadcast(&tp->t_inwait); selwakeup(&tp->t_inpoll); KNOTE_LOCKED(&tp->t_inpoll.si_note, 0); } } int tty_wait(struct tty *tp, struct cv *cv) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_wait_sig(cv, tp->t_mtx); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } int tty_timedwait(struct tty *tp, struct cv *cv, int hz) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_timedwait_sig(cv, tp->t_mtx, hz); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. 
*/ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } void tty_flush(struct tty *tp, int flags) { if (flags & FWRITE) { tp->t_flags &= ~TF_HIWAT_OUT; ttyoutq_flush(&tp->t_outq); tty_wakeup(tp, FWRITE); if (!tty_gone(tp)) { ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE); } } if (flags & FREAD) { tty_hiwat_in_unblock(tp); ttyinq_flush(&tp->t_inq); tty_wakeup(tp, FREAD); if (!tty_gone(tp)) { ttydevsw_inwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD); } } } void tty_set_winsize(struct tty *tp, const struct winsize *wsz) { if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0) return; tp->t_winsize = *wsz; tty_signal_pgrp(tp, SIGWINCH); } static int tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; switch (cmd) { /* * Modem commands. * The SER_* and TIOCM_* flags are the same, but one bit * shifted. I don't know why. */ case TIOCSDTR: ttydevsw_modem(tp, SER_DTR, 0); return (0); case TIOCCDTR: ttydevsw_modem(tp, 0, SER_DTR); return (0); case TIOCMSET: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMBIS: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0); return (0); } case TIOCMBIC: { int bits = *(int *)data; ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMGET: *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1); return (0); case FIOASYNC: if (*(int *)data) tp->t_flags |= TF_ASYNC; else tp->t_flags &= ~TF_ASYNC; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: *(int *)data = ttydisc_bytesavail(tp); return (0); case FIONWRITE: case TIOCOUTQ: *(int *)data = ttyoutq_bytesused(&tp->t_outq); return (0); case FIOSETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Temporarily unlock the TTY to set ownership. */ tty_unlock(tp); error = fsetown(*(int *)data, &tp->t_sigio); tty_lock(tp); return (error); case FIOGETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Get ownership. */ *(int *)data = fgetown(&tp->t_sigio); return (0); case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = tp->t_termios; return (0); case TIOCSETA: case TIOCSETAW: case TIOCSETAF: { struct termios *t = data; bool canonicalize = false; /* * Who makes up these funny rules? According to POSIX, * input baud rate is set equal to the output baud rate * when zero. */ if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; /* Discard any unsupported bits. */ t->c_iflag &= TTYSUP_IFLAG; t->c_oflag &= TTYSUP_OFLAG; t->c_lflag &= TTYSUP_LFLAG; t->c_cflag &= TTYSUP_CFLAG; /* Set terminal flags through tcsetattr(). */ if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = tty_drain(tp, 0); if (error) return (error); if (cmd == TIOCSETAF) tty_flush(tp, FREAD); } /* * Only call param() when the flags really change. */ if ((t->c_cflag & CIGNORE) == 0 && (tp->t_termios.c_cflag != t->c_cflag || ((tp->t_termios.c_iflag ^ t->c_iflag) & (IXON|IXOFF|IXANY)) || tp->t_termios.c_ispeed != t->c_ispeed || tp->t_termios.c_ospeed != t->c_ospeed)) { error = ttydevsw_param(tp, t); if (error) return (error); /* XXX: CLOCAL? 
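Relating the TIOCSETA/TIOCSETAW/TIOCSETAF handling above back to the libc interface (a hedged aside): tcsetattr(3)'s optional actions map onto these ioctls, TCSANOW onto TIOCSETA, TCSADRAIN onto TIOCSETAW (drain output first) and TCSAFLUSH onto TIOCSETAF (drain, then discard pending input). A short sketch, with an illustrative function name:

/*
 * Hedged sketch: switch a serial tty to raw mode at 115200 baud, letting
 * queued output drain first (TCSADRAIN -> TIOCSETAW above).
 */
#include <termios.h>

int
set_raw_115200(int fd)
{
	struct termios t;

	if (tcgetattr(fd, &t) == -1)
		return (-1);
	cfmakeraw(&t);				/* clear ICANON, ECHO, OPOST, ... */
	if (cfsetspeed(&t, B115200) == -1)	/* sets c_ispeed and c_ospeed */
		return (-1);
	return (tcsetattr(fd, TCSADRAIN, &t));
}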
*/ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; /* Baud rate has changed - update watermarks. */ error = tty_watermarks(tp); if (error) return (error); } /* * We'll canonicalize any partial input if we're transitioning * ICANON one way or the other. If we're going from -ICANON -> * ICANON, then in the worst case scenario we're in the middle * of a line but both ttydisc_read() and FIONREAD will search * for one of our line terminals. */ if ((t->c_lflag & ICANON) != (tp->t_termios.c_lflag & ICANON)) canonicalize = true; else if (tp->t_termios.c_cc[VEOF] != t->c_cc[VEOF] || tp->t_termios.c_cc[VEOL] != t->c_cc[VEOL]) canonicalize = true; /* Copy new non-device driver parameters. */ tp->t_termios.c_iflag = t->c_iflag; tp->t_termios.c_oflag = t->c_oflag; tp->t_termios.c_lflag = t->c_lflag; memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc); ttydisc_optimize(tp); if (canonicalize) ttydisc_canonicalize(tp); if ((t->c_lflag & ICANON) == 0) { /* * When in non-canonical mode, wake up all * readers. Any partial input has already been * canonicalized above if we were in canonical mode. * VMIN and VTIME could also be adjusted. */ tty_wakeup(tp, FREAD); } /* * For packet mode: notify the PTY consumer that VSTOP * and VSTART may have been changed. */ if (tp->t_termios.c_iflag & IXON && tp->t_termios.c_cc[VSTOP] == CTRL('S') && tp->t_termios.c_cc[VSTART] == CTRL('Q')) ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP); else ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP); return (0); } case TIOCGETD: /* For compatibility - we only support TTYDISC. */ *(int *)data = TTYDISC; return (0); case TIOCGPGRP: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; return (0); case TIOCGSID: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); MPASS(tp->t_session); *(int *)data = tp->t_session->s_sid; return (0); case TIOCNOTTY: return (tty_drop_ctty(tp, td->td_proc)); case TIOCSCTTY: { struct proc *p = td->td_proc; /* XXX: This looks awful. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ sx_xunlock(&proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ sx_xunlock(&proctree_lock); return (0); } if (p->p_session->s_ttyp != NULL || (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL && tp->t_session->s_ttyvp->v_type != VBAD)) { /* * There is already a relation between a TTY and * a session, or the caller is not the session * leader. * * Allow the TTY to be stolen when the vnode is * invalid, but the reference to the TTY is * still active. This allows immediate reuse of * TTYs of which the session leader has been * killed or the TTY revoked. */ sx_xunlock(&proctree_lock); return (EPERM); } /* Connect the session to the TTY. */ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return (0); } case TIOCSPGRP: { struct pgrp *pg; /* * XXX: Temporarily unlock the TTY to locate the process * group. This code would be lot nicer if we would ever * decompose proctree_lock. 
*/ tty_unlock(tp); sx_slock(&proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { sx_sunlock(&proctree_lock); tty_lock(tp); return (EPERM); } tty_lock(tp); /* * Determine if this TTY is the controlling TTY after * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { sx_sunlock(&proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; sx_sunlock(&proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); return (0); } case TIOCFLUSH: { int flags = *(int *)data; if (flags == 0) flags = (FREAD|FWRITE); else flags &= (FREAD|FWRITE); tty_flush(tp, flags); return (0); } case TIOCDRAIN: /* Drain TTY output. */ return tty_drain(tp, 0); case TIOCGDRAINWAIT: *(int *)data = tp->t_drainwait; return (0); case TIOCSDRAINWAIT: error = priv_check(td, PRIV_TTY_DRAINWAIT); if (error == 0) tp->t_drainwait = *(int *)data; return (error); case TIOCCONS: /* Set terminal as console TTY. */ if (*(int *)data) { error = priv_check(td, PRIV_TTY_CONSOLE); if (error) return (error); error = constty_set(tp); } else { error = constty_clear(tp); } return (error); case TIOCGWINSZ: /* Obtain window size. */ *(struct winsize*)data = tp->t_winsize; return (0); case TIOCSWINSZ: /* Set window size. */ tty_set_winsize(tp, data); return (0); case TIOCEXCL: tp->t_flags |= TF_EXCLUDE; return (0); case TIOCNXCL: tp->t_flags &= ~TF_EXCLUDE; return (0); case TIOCSTOP: tp->t_flags |= TF_STOPPED; ttydevsw_pktnotify(tp, TIOCPKT_STOP); return (0); case TIOCSTART: tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_START); return (0); case TIOCSTAT: tty_info(tp); return (0); case TIOCSTI: if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI)) return (EPERM); if (!tty_is_ctty(tp, td->td_proc) && priv_check(td, PRIV_TTY_STI)) return (EACCES); ttydisc_rint(tp, *(char *)data, 0); ttydisc_rint_done(tp); return (0); } #ifdef COMPAT_43TTY return tty_ioctl_compat(tp, cmd, data, fflag, td); #else /* !COMPAT_43TTY */ return (ENOIOCTL); #endif /* COMPAT_43TTY */ } int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; tty_assert_locked(tp); if (tty_gone(tp)) return (ENXIO); error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); return (error); } dev_t tty_udev(struct tty *tp) { if (tp->t_dev) return (dev2udev(tp->t_dev)); else return (NODEV); } int tty_checkoutq(struct tty *tp) { /* 256 bytes should be enough to print a log message. */ return (ttyoutq_bytesleft(&tp->t_outq) >= 256); } void tty_hiwat_in_block(struct tty *tp) { if ((tp->t_flags & TF_HIWAT_IN) == 0 && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) { /* * Input flow control. Only enter the high watermark when we * can successfully store the VSTOP character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTOP], 1) == 0) tp->t_flags |= TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags |= TF_HIWAT_IN; } } void tty_hiwat_in_unblock(struct tty *tp) { if (tp->t_flags & TF_HIWAT_IN && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) { /* * Input flow control. Only leave the high watermark when we * can successfully store the VSTART character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTART], 1) == 0) tp->t_flags &= ~TF_HIWAT_IN; } else { /* No input flow control. 
*/ tp->t_flags &= ~TF_HIWAT_IN; } if (!tty_gone(tp)) ttydevsw_inwakeup(tp); } /* * TTY hooks interface. */ static int ttyhook_defrint(struct tty *tp, char c, int flags) { if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); return (0); } int ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th, void *softc) { struct tty *tp; struct file *fp; struct cdev *dev; struct cdevsw *cdp; struct filedesc *fdp; cap_rights_t rights; int error, ref; /* Validate the file descriptor. */ /* * XXX this code inspects a file descriptor from a different process, * but there is no dedicated routine to do it in fd code, making the * ordeal highly questionable. */ fdp = p->p_fd; FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, cap_rights_init_one(&rights, CAP_TTYHOOK), &fp, NULL); if (error == 0 && !fhold(fp)) error = EBADF; FILEDESC_SUNLOCK(fdp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto done1; } /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we * only shall prevent calling devvn_refthread on the file that * never has been opened over a character device. */ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) { error = EINVAL; goto done1; } /* Make sure it is a TTY. */ cdp = devvn_refthread(fp->f_vnode, &dev, &ref); if (cdp == NULL) { error = ENXIO; goto done1; } if (dev != fp->f_data) { error = ENXIO; goto done2; } if (cdp != &ttydev_cdevsw) { error = ENOTTY; goto done2; } tp = dev->si_drv1; /* Try to attach the hook to the TTY. */ error = EBUSY; tty_lock(tp); MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0)); if (tp->t_flags & TF_HOOK) goto done3; tp->t_flags |= TF_HOOK; tp->t_hook = th; tp->t_hooksoftc = softc; *rtp = tp; error = 0; /* Maybe we can switch into bypass mode now. */ ttydisc_optimize(tp); /* Silently convert rint() calls to rint_bypass() when possible. */ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass)) th->th_rint = ttyhook_defrint; done3: tty_unlock(tp); done2: dev_relthread(dev, ref); done1: fdrop(fp, curthread); return (error); } void ttyhook_unregister(struct tty *tp) { tty_assert_locked(tp); MPASS(tp->t_flags & TF_HOOK); /* Disconnect the hook. */ tp->t_flags &= ~TF_HOOK; tp->t_hook = NULL; /* Maybe we need to leave bypass mode. */ ttydisc_optimize(tp); /* Maybe deallocate the TTY as well. */ tty_rel_free(tp); } /* * /dev/console handling. */ static int ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp; /* System has no console device. */ if (dev_console_filename == NULL) return (ENXIO); /* Look up corresponding TTY by device name. */ sx_slock(&tty_list_sx); TAILQ_FOREACH(tp, &tty_list, t_list) { if (strcmp(dev_console_filename, tty_devname(tp)) == 0) { dev_console->si_drv1 = tp; break; } } sx_sunlock(&tty_list_sx); /* System console has no TTY associated. */ if (dev_console->si_drv1 == NULL) return (ENXIO); return (ttydev_open(dev, oflags, devtype, td)); } static int ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag) { log_console(uio); return (ttydev_write(dev, uio, ioflag)); } /* * /dev/console is a little different than normal TTY's. When opened, * it determines which TTY to use. When data gets written to it, it * will be logged in the kernel message buffer. 
*/ static struct cdevsw ttyconsdev_cdevsw = { .d_version = D_VERSION, .d_open = ttyconsdev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttyconsdev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttyconsdev", .d_flags = D_TTY, }; static void ttyconsdev_init(void *unused __unused) { dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "console"); } SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL); void ttyconsdev_select(const char *name) { dev_console_filename = name; } /* * Debugging routines. */ #include "opt_ddb.h" #ifdef DDB #include #include static const struct { int flag; char val; } ttystates[] = { #if 0 { TF_NOPREFIX, 'N' }, #endif { TF_INITLOCK, 'I' }, { TF_CALLOUT, 'C' }, /* Keep these together -> 'Oi' and 'Oo'. */ { TF_OPENED, 'O' }, { TF_OPENED_IN, 'i' }, { TF_OPENED_OUT, 'o' }, { TF_OPENED_CONS, 'c' }, { TF_GONE, 'G' }, { TF_OPENCLOSE, 'B' }, { TF_ASYNC, 'Y' }, { TF_LITERAL, 'L' }, /* Keep these together -> 'Hi' and 'Ho'. */ { TF_HIWAT, 'H' }, { TF_HIWAT_IN, 'i' }, { TF_HIWAT_OUT, 'o' }, { TF_STOPPED, 'S' }, { TF_EXCLUDE, 'X' }, { TF_BYPASS, 'l' }, { TF_ZOMBIE, 'Z' }, { TF_HOOK, 's' }, /* Keep these together -> 'bi' and 'bo'. */ { TF_BUSY, 'b' }, { TF_BUSY_IN, 'i' }, { TF_BUSY_OUT, 'o' }, { 0, '\0'}, }; #define TTY_FLAG_BITS \ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN" \ "\5OPENED_OUT\6OPENED_CONS\7GONE\10OPENCLOSE" \ "\11ASYNC\12LITERAL\13HIWAT_IN\14HIWAT_OUT" \ "\15STOPPED\16EXCLUDE\17BYPASS\20ZOMBIE" \ "\21HOOK\22BUSY_IN\23BUSY_OUT" #define DB_PRINTSYM(name, addr) \ db_printf("%s " #name ": ", sep); \ db_printsym((db_addr_t) addr, DB_STGY_ANY); \ db_printf("\n"); static void _db_show_devsw(const char *sep, const struct ttydevsw *tsw) { db_printf("%sdevsw: ", sep); db_printsym((db_addr_t)tsw, DB_STGY_ANY); db_printf(" (%p)\n", tsw); DB_PRINTSYM(open, tsw->tsw_open); DB_PRINTSYM(close, tsw->tsw_close); DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup); DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup); DB_PRINTSYM(ioctl, tsw->tsw_ioctl); DB_PRINTSYM(param, tsw->tsw_param); DB_PRINTSYM(modem, tsw->tsw_modem); DB_PRINTSYM(mmap, tsw->tsw_mmap); DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify); DB_PRINTSYM(free, tsw->tsw_free); } static void _db_show_hooks(const char *sep, const struct ttyhook *th) { db_printf("%shook: ", sep); db_printsym((db_addr_t)th, DB_STGY_ANY); db_printf(" (%p)\n", th); if (th == NULL) return; DB_PRINTSYM(rint, th->th_rint); DB_PRINTSYM(rint_bypass, th->th_rint_bypass); DB_PRINTSYM(rint_done, th->th_rint_done); DB_PRINTSYM(rint_poll, th->th_rint_poll); DB_PRINTSYM(getc_inject, th->th_getc_inject); DB_PRINTSYM(getc_capture, th->th_getc_capture); DB_PRINTSYM(getc_poll, th->th_getc_poll); DB_PRINTSYM(close, th->th_close); } static void _db_show_termios(const char *name, const struct termios *t) { db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x " "lflag 0x%x ispeed %u ospeed %u\n", name, t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag, t->c_ispeed, t->c_ospeed); } /* DDB command to show TTY statistics. */ DB_SHOW_COMMAND(tty, db_show_tty) { struct tty *tp; if (!have_addr) { db_printf("usage: show tty \n"); return; } tp = (struct tty *)addr; db_printf("%p: %s\n", tp, tty_devname(tp)); db_printf("\tmtx: %p\n", tp->t_mtx); db_printf("\tflags: 0x%b\n", tp->t_flags, TTY_FLAG_BITS); db_printf("\trevokecnt: %u\n", tp->t_revokecnt); /* Buffering mechanisms. 
*/ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u " "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin, tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end, tp->t_inq.ti_nblocks, tp->t_inq.ti_quota); db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n", &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end, tp->t_outq.to_nblocks, tp->t_outq.to_quota); db_printf("\tinlow: %zu\n", tp->t_inlow); db_printf("\toutlow: %zu\n", tp->t_outlow); _db_show_termios("\ttermios", &tp->t_termios); db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n", tp->t_winsize.ws_row, tp->t_winsize.ws_col, tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel); db_printf("\tcolumn: %u\n", tp->t_column); db_printf("\twritepos: %u\n", tp->t_writepos); db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags); /* Init/lock-state devices. */ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in); _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out); _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in); _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out); /* Hooks */ _db_show_devsw("\t", tp->t_devsw); _db_show_hooks("\t", tp->t_hook); /* Process info. */ db_printf("\tpgrp: %p gid %d\n", tp->t_pgrp, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); db_printf("\tsession: %p", tp->t_session); if (tp->t_session != NULL) db_printf(" count %u leader %p tty %p sid %d login %s", tp->t_session->s_count, tp->t_session->s_leader, tp->t_session->s_ttyp, tp->t_session->s_sid, tp->t_session->s_login); db_printf("\n"); db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt); db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc); db_printf("\thooksoftc: %p\n", tp->t_hooksoftc); db_printf("\tdev: %p\n", tp->t_dev); } /* DDB command to list TTYs. */ DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys) { struct tty *tp; size_t isiz, osiz; int i, j; /* Make the output look like `pstat -t'. */ db_printf("PTR "); #if defined(__LP64__) db_printf(" "); #endif db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW " "COL SESS PGID STATE\n"); TAILQ_FOREACH(tp, &tty_list, t_list) { isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE; osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE; db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d " "%5d ", tp, tty_devname(tp), isiz, tp->t_inq.ti_linestart - tp->t_inq.ti_begin, tp->t_inq.ti_end - tp->t_inq.ti_linestart, isiz - tp->t_inlow, osiz, tp->t_outq.to_end - tp->t_outq.to_begin, osiz - tp->t_outlow, MIN(tp->t_column, 99999), tp->t_session ? tp->t_session->s_sid : 0, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); /* Flag bits. */ for (i = j = 0; ttystates[i].flag; i++) if (tp->t_flags & ttystates[i].flag) { db_printf("%c", ttystates[i].val); j++; } if (j == 0) db_printf("-"); db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index 4a3b3d77c89e..d629fa0e7593 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -1,869 +1,869 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. */ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. 
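 * The tty lock is dropped to copy the TIOCPKT_DATA header byte out to
 * userspace; another reader could drain the queue before the lock is
 * retaken, so the header might not be followed by the data it announced.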
*/ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); *(int *)data = ttydisc_getc_poll(tp); tty_unlock(tp); return (0); case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return (copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i)); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. 
*/ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. 
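 * As with ptsdev_poll() above, the naming is inverted with respect to the
 * slave side: EVFILT_READ on the master is backed by pts_outpoll (data the
 * slave produced), EVFILT_WRITE by pts_inpoll (room for more input).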
*/ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } -static struct filterops pts_kqops_read = { +static const struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; -static struct filterops pts_kqops_write = { +static const struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. 
*/ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } -static struct fileops ptsdev_ops = { +static const struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. 
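 * If the pool is exhausted, the racct and per-uid pts accounting charged
 * above is rolled back before returning EAGAIN.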
*/ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index 5fcabbac7923..7dd0f9796682 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2938 +1,2938 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; 
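/*
 * Illustrative only: a minimal userland sketch of the mq_*() API that this
 * file implements, kept under "#if 0" like the other non-built fragments in
 * this file.  The queue name, sizes and (lack of) error handling are
 * arbitrary; on FreeBSD the program links against librt and the mqueuefs
 * support must be available.
 */
#if 0
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
	char buf[128];		/* must be at least mq_msgsize bytes */
	unsigned int prio;
	mqd_t mqd;

	mqd = mq_open("/demo", O_CREAT | O_RDWR, 0600, &attr);
	if (mqd == (mqd_t)-1)
		return (1);
	(void)mq_send(mqd, "hello", strlen("hello") + 1, 1);
	(void)mq_receive(mqd, buf, sizeof(buf), &prio);
	printf("%s (prio %u)\n", buf, prio);
	mq_close(mqd);
	mq_unlink("/demo");
	return (0);
}
#endif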
int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; -static struct fileops mqueueops; +static const struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); -struct filterops mq_rfiltops = { +static const struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; -struct filterops mq_wfiltops = { +static const struct filterops mq_wfiltops = { .f_isfd = 
1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return (uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO)); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. 
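 * entries: besides its own initial reference a directory is referenced by
 * its '.' and '..' children, so once the count falls back to those implicit
 * references the node can be destroyed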
*/ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! 
LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; 
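	/*
	 * Look for an existing vnode for this node on the same mount and
	 * vget() it if found; otherwise allocate a fresh vnode, attach it to
	 * the mount and re-check under mi_lock in case another thread
	 * created one while we were blocked.
	 */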
sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); vn_set_state(*vpp, VSTATE_CONSTRUCTED); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. 
*/ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
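 * For the last component of a CREATE or RENAME lookup with LOCKPARENT set,
 * verify write access on the directory and return EJUSTRETURN so the caller
 * may create the entry; otherwise report ENOENT.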
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); uma_zfree(mvdata_zone, vd); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { 
struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || (vap->va_flags != VNOVAL && vap->va_flags != 0) || vap->va_rdev != VNOVAL || (int)vap->va_bytes != VNOVAL || vap->va_gen != VNOVAL) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if ((ap->a_cred->cr_uid != pn->mn_uid || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != pn->mn_uid && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
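 * Setting timestamps normally requires VADMIN (owner), but a utimes()-style
 * call with a NULL times pointer (VA_UTIMES_NULL) is also permitted with
 * mere write access.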
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. 
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; struct vnode *pr_root; pr_root = pr->pr_root; if (pr->pr_parent->pr_root == pr_root) return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr != pr && tpr->pr_root == pr_root) return (0); } /* * No jails are rooted in this directory anymore, * so no queues should be either. 
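 * Walk the root directory and unlink every queue whose node was created
 * under this prison's root vnode.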
*/ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. If waitok is false, the thread will not be * blocked if the queue is full; otherwise, the absolute * timeout will be checked.
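 *
 * Illustrative sketch (hedged: the mq_timedsend() wrapper and the variable
 * names below are assumptions for the example, not taken from this file).
 * Userland normally reaches this routine through mq_timedsend(), which ends
 * up in the kmq_timedsend() syscall handled later in this file, supplying
 * the absolute deadline that the loop below keeps re-checking:
 *
 *	struct timespec abs;
 *	clock_gettime(CLOCK_REALTIME, &abs);	-- absolute deadline, not a delta
 *	abs.tv_sec += 5;			-- give up five seconds from now
 *	if (mq_timedsend(mqd, buf, len, prio, &abs) == -1 &&
 *	    errno == ETIMEDOUT)
 *		warn("queue stayed full for five seconds");
 *
 * The deadline is re-evaluated with getnanotime() after every wakeup, so a
 * spurious wakeup merely shortens the remaining sleep.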
*/ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send a realtime signal to the process which registered itself * successfully with mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. If waitok is false, the thread will not be * blocked if there is no data in the queue; otherwise, the absolute * timeout will be checked.
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = ((mode & 
~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. 
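 *
 * Illustrative note (hedged; the userland calls and names below are
 * assumptions for the example only): as with POSIX mq_unlink(), removing
 * the name does not destroy a queue that is still open.  The node is only
 * released once the last descriptor obtained through kmq_open() is closed,
 * so a sequence such as
 *
 *	mqd_t q = mq_open("/log", O_RDONLY);
 *	mq_unlink("/log");		-- name disappears from mqueuefs
 *	mq_receive(q, buf, sz, NULL);	-- still works until mq_close(q)
 *
 * keeps operating on the already-open descriptor.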
*/ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? 
&attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_noref(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. 
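 *
 * For context, a hedged userland sketch of how such a notification is
 * typically armed (SIGUSR1 and the descriptor name are arbitrary
 * assumptions for the example):
 *
 *	struct sigevent ev = { 0 };
 *	ev.sigev_notify = SIGEV_SIGNAL;
 *	ev.sigev_signo = SIGUSR1;
 *	if (mq_notify(mqd, &ev) == -1)
 *		err(1, "mq_notify");
 *
 * If messages are already queued and no thread is blocked in receive,
 * the branch below delivers the signal right away instead of waiting
 * for the next send.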
*/ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_noref(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, 
PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } -static struct fileops mqueueops = { +static const struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct 
mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef 
COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c index b4652e9106ac..35ca9a9fb06e 100644 --- a/sys/kern/uipc_sem.c +++ b/sys/kern/uipc_sem.c @@ -1,1109 +1,1109 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. 
*/ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ -static struct fileops ksem_ops = { +static const struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. 
*/ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. 
*/ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct pwddesc *pdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); pdp = td->td_proc->p_pd; mode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. 
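 *
 * (Hedged example, not taken from this file: a name such as "/mysem"
 * passes this check, while "mysem" is rejected with EINVAL below.  Past
 * the leading slash the name is only a key into the in-kernel dictionary,
 * not a filesystem path, and for jailed callers the prison path has
 * already been prepended above, so the check applies to the first
 * character the caller actually supplied.)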
*/ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. 
*/ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init_one(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! 
pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2, &ts1); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, (semid_t *)uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, (semid_t *)uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). 
*/ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index f51998d0ed00..dad9fb23250f 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1,2225 +1,2225 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2), shm_rename(2), and shm_unlink(2). * While most of the implementation is here, vm_mmap.c contains * mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static void shm_doremove(struct shm_mapping *map); static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out); static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; static fo_get_seals_t shm_get_seals; static fo_add_seals_t shm_add_seals; static fo_fallocate_t shm_fallocate; static fo_fspacectl_t shm_fspacectl; /* File descriptor operations. 
*/ -struct fileops shm_ops = { +const struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, .fo_fspacectl = shm_fspacectl, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); static int largepage_reclaim_tries = 1; SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, "Number of contig reclaims before giving up for default alloc policy"); #define shm_rangelock_unlock(shmfd, cookie) \ rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx) #define shm_rangelock_rlock(shmfd, start, end) \ rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) #define shm_rangelock_tryrlock(shmfd, start, end) \ rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) #define shm_rangelock_wlock(shmfd, start, end) \ rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); rv = vm_page_grab_valid_unlocked(&m, obj, idx, VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT); if (rv == VM_PAGER_OK) goto found; /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ VM_OBJECT_WLOCK(obj); m = vm_page_lookup(obj, idx); if (uio->uio_rw == UIO_READ && m == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ rv = vm_page_grab_valid(&m, obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); if (rv != VM_PAGER_OK) { VM_OBJECT_WUNLOCK(obj); if (bootverbose) { printf("uiomove_object: vm_obj %p idx %jd " "pager error %d\n", obj, idx, rv); } return (rv == VM_PAGER_AGAIN ? 
ENOSPC : EIO); } VM_OBJECT_WUNLOCK(obj); found: error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) vm_page_set_dirty(m); vm_page_activate(m); vm_page_sunbusy(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static u_long count_largepages[MAXPAGESIZES]; static int shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { vm_page_t m __diagused; int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pidx >= object->size) return (VM_PAGER_FAIL); *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); /* * We only busy the first page in the superpage run. It is * useless to busy whole run since we only remove full * superpage, and it takes too long to busy e.g. 512 * 512 == * 262144 pages constituing 1G amd64 superage. */ m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); MPASS(m != NULL); *last = *first + atop(pagesizes[psind]) - 1; return (VM_PAGER_OK); } static boolean_t shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pindex >= object->size) return (FALSE); if (before != NULL) { *before = pindex - rounddown2(pindex, pagesizes[psind] / PAGE_SIZE); } if (after != NULL) { *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - pindex; } return (TRUE); } static void shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { } static void shm_largepage_phys_dtor(vm_object_t object) { int psind; psind = object->un_pager.phys.data_val; if (psind != 0) { atomic_subtract_long(&count_largepages[psind], object->size / (pagesizes[psind] / PAGE_SIZE)); vm_wire_sub(object->size); } else { KASSERT(object->size == 0, ("largepage phys obj %p not initialized bit size %#jx > 0", object, (uintmax_t)object->size)); } } static const struct phys_pager_ops shm_largepage_phys_ops = { .phys_pg_populate = shm_largepage_phys_populate, .phys_pg_haspage = shm_largepage_phys_haspage, .phys_pg_ctor = shm_largepage_phys_ctor, .phys_pg_dtor = shm_largepage_phys_dtor, }; bool shm_largepage(struct shmfd *shmfd) { return (shmfd->shm_object->type == OBJT_PHYS); } static void shm_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) { struct shmfd *shm; vm_size_t c; swap_pager_freespace(obj, start, size, &c); if (c == 0) return; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; KASSERT(shm->shm_pages >= c, ("shm %p pages %jd free %jd", shm, (uintmax_t)shm->shm_pages, (uintmax_t)c)); shm->shm_pages -= c; } static void shm_page_inserted(vm_object_t obj, vm_page_t m) { struct shmfd *shm; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) shm->shm_pages += 1; } static void shm_page_removed(vm_object_t obj, vm_page_t m) { struct shmfd *shm; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { KASSERT(shm->shm_pages >= 1, ("shm %p pages %jd free 1", shm, (uintmax_t)shm->shm_pages)); shm->shm_pages -= 1; } } static struct pagerops 
shm_swap_pager_ops = { .pgo_kvme_type = KVME_TYPE_SWAP, .pgo_freespace = shm_pager_freespace, .pgo_page_inserted = shm_page_inserted, .pgo_page_removed = shm_page_removed, }; static int shmfd_pager_type = -1; static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = shm_rangelock_rlock(shmfd, uio->uio_offset, uio->uio_offset + uio->uio_resid); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); shm_rangelock_unlock(shmfd, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; off_t size; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* * Overflow is only an error if we're supposed to expand on * write. Otherwise, we'll just truncate the write to the * size of the file, which can only grow up to OFF_MAX. 
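 *
 * As an illustration of the check below (editorial example, not from the
 * original source): with uio_offset == OFF_MAX - 100 and uio_resid == 200
 * the sum would overflow.  With SHM_GROW_ON_WRITE set the request is
 * rejected with EFBIG, since the object would have to grow past OFF_MAX;
 * without it, the write is simply bounded by the current shm_size when
 * uiomove_object() is called further down.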
*/ if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) { foffset_unlock_uio(fp, uio, flags); return (EFBIG); } size = shmfd->shm_size; } else { size = uio->uio_offset + uio->uio_resid; } if ((flags & FOF_OFFSET) == 0) rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); else rl_cookie = shm_rangelock_wlock(shmfd, uio->uio_offset, size); if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; } else { error = 0; if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 && size > shmfd->shm_size) { error = shm_dotruncate_cookie(shmfd, size, rl_cookie); } if (error == 0) error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); } shm_rangelock_unlock(shmfd, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; struct shm_largepage_conf *conf; void *rl_cookie; shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); case FIOSSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; if (shmfd->shm_lp_psind != 0 && conf->psind != shmfd->shm_lp_psind) return (EINVAL); if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || pagesizes[conf->psind] == 0) return (EINVAL); if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) return (EINVAL); rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); shmfd->shm_lp_psind = conf->psind; shmfd->shm_lp_alloc_policy = conf->alloc_policy; shmfd->shm_object->un_pager.phys.data_val = conf->psind; shm_rangelock_unlock(shmfd, rl_cookie); return (0); case FIOGSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; rl_cookie = shm_rangelock_rlock(shmfd, 0, OFF_MAX); conf->psind = shmfd->shm_lp_psind; conf->alloc_policy = shmfd->shm_lp_alloc_policy; shm_rangelock_unlock(shmfd, rl_cookie); return (0); default: return (ENOTTY); } } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. 
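 *
 * Illustrative userspace sketch (not part of this file), using the
 * standard shm_open(2)/ftruncate(2)/fstat(2) interfaces:
 *
 *	#include <sys/mman.h>
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	ftruncate(fd, 65536);
 *	struct stat sb;
 *	fstat(fd, &sb);
 *	// sb.st_size == 65536, S_ISREG(sb.st_mode), sb.st_blksize == PAGE_SIZE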
*/ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; if (shm_largepage(shmfd)) { sb->st_blocks = shmfd->shm_object->size / (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); } else { sb->st_blocks = shmfd->shm_pages; } return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { int error; char *path; const char *pr_path; size_t pr_pathlen; path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath_in, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error != 0) goto out; #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (path[pr_pathlen] != '/') { error = EINVAL; goto out; } *path_out = path; out: if (error != 0) free(path, M_SHMFD); return (error); } static int shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, int end) { vm_page_t m; int rv; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(base >= 0, ("%s: base %d", __func__, base)); KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, end)); retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, and therefore not * recently accessed, immediately enqueue it for * asynchronous laundering. The current operation is * not regarded as an access. */ vm_page_launder(m); } else { vm_page_free(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, end - base); KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m)); vm_page_set_dirty(m); vm_page_xunbusy(m); } return (0); } static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_pindex_t nobjsize; vm_ooffset_t delta; int base, error; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); if (length == shmfd->shm_size) return (0); nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) return (EBUSY); /* * Zero the truncated part of the last page. 
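 *
 * Worked example (4 KB pages, editorial): truncating from 16384 to a length
 * of 10000 gives base = 10000 & PAGE_MASK = 1808, so bytes 1808..4095 of the
 * page at index OFF_TO_IDX(10000) == 2 are zeroed below, while nobjsize =
 * OFF_TO_IDX(10000 + PAGE_MASK) == 3 means pages at index 3 and above are
 * removed from the object.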
*/ base = length & PAGE_MASK; if (base != 0) { error = shm_partial_page_invalidate(object, OFF_TO_IDX(length), base, PAGE_SIZE); if (error) return (error); } delta = IDX_TO_OFF(object->size - nobjsize); if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) return (ENOMEM); object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; return (0); } static int shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_page_t m; vm_pindex_t newobjsz; vm_pindex_t oldobjsz __unused; int aflags, error, i, psind, try; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); oldobjsz = object->size; newobjsz = OFF_TO_IDX(length); if (length == shmfd->shm_size) return (0); psind = shmfd->shm_lp_psind; if (psind == 0 && length != 0) return (EINVAL); if ((length & (pagesizes[psind] - 1)) != 0) return (EINVAL); if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); if (shmfd->shm_kmappings > 0) return (EBUSY); return (ENOTSUP); /* Pages are unmanaged. */ #if 0 vm_object_page_remove(object, newobjsz, oldobjsz, 0); object->size = newobjsz; shmfd->shm_size = length; return (0); #endif } if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) aflags |= VM_ALLOC_WAITFAIL; try = 0; /* * Extend shmfd and object, keeping all already fully * allocated large pages intact even on error, because dropped * object lock might allowed mapping of them. */ while (object->size < newobjsz) { m = vm_page_alloc_contig(object, object->size, aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0, VM_MEMATTR_DEFAULT); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT || (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_DEFAULT && try >= largepage_reclaim_tries)) { VM_OBJECT_WLOCK(object); return (ENOMEM); } error = vm_page_reclaim_contig(aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0) ? 0 : vm_wait_intr(object); if (error != 0) { VM_OBJECT_WLOCK(object); return (error); } try++; VM_OBJECT_WLOCK(object); continue; } try = 0; for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { if ((m[i].flags & PG_ZERO) == 0) pmap_zero_page(&m[i]); vm_page_valid(&m[i]); vm_page_xunbusy(&m[i]); } object->size += OFF_TO_IDX(pagesizes[psind]); shmfd->shm_size += pagesizes[psind]; atomic_add_long(&count_largepages[psind], 1); vm_wire_add(atop(pagesizes[psind])); } return (0); } static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) { int error; VM_OBJECT_WLOCK(shmfd->shm_object); error = shm_largepage(shmfd) ? 
shm_dotruncate_largepage(shmfd, length, rl_cookie) : shm_dotruncate_locked(shmfd, length, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); return (error); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { void *rl_cookie; int error; rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); error = shm_dotruncate_cookie(shmfd, length, rl_cookie); shm_rangelock_unlock(shmfd, rl_cookie); return (error); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; vm_object_t obj; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; if (largepage) { obj = phys_pager_allocate(NULL, &shm_largepage_phys_ops, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); obj->un_pager.phys.phys_priv = shmfd; shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; } else { obj = vm_pager_allocate(shmfd_pager_type, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); obj->un_pager.swp.swp_priv = shmfd; } KASSERT(obj != NULL, ("shm_create: vm_pager_allocate")); VM_OBJECT_WLOCK(obj); vm_object_set_flag(obj, OBJ_POSIXSHM); VM_OBJECT_WUNLOCK(obj); shmfd->shm_object = obj; vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { vm_object_t obj; if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); if (shm_largepage(shmfd)) obj->un_pager.phys.phys_priv = NULL; else obj->un_pager.swp.swp_priv = NULL; VM_OBJECT_WUNLOCK(obj); vm_object_deallocate(obj); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
*/ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred); mtx_unlock(&shm_timestamp_lock); return (error); } static void shm_init(void *arg) { char name[32]; int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); shmfd_pager_type = vm_pager_alloc_dyn_type(&shm_swap_pager_ops, OBJT_SWAP); MPASS(shmfd_pager_type != -1); for (i = 1; i < MAXPAGESIZES; i++) { if (pagesizes[i] == 0) break; #define M (1024 * 1024) #define G (1024 * M) if (pagesizes[i] >= G) snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); else if (pagesizes[i] >= M) snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); else snprintf(name, sizeof(name), "%lu", pagesizes[i]); #undef G #undef M SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], "number of non-transient largepages allocated"); } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); /* * Remove all shared memory objects that belong to a prison. */ void shm_remove_prison(struct prison *pr) { struct shm_mapping *shmm, *tshmm; u_long i; sx_xlock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH_SAFE(shmm, &shm_dictionary[i], sm_link, tshmm) { if (shmm->sm_shmfd->shm_object->cred && shmm->sm_shmfd->shm_object->cred->cr_prison == pr) shm_doremove(shmm); } } sx_xunlock(&shm_dict_lock); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. 
*/ static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); shm_doremove(map); return (0); } } return (ENOENT); } static void shm_doremove(struct shm_mapping *map) { map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); } int kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, int shmflags, struct filecaps *fcaps, const char *name __unused) { struct pwddesc *pdp; struct shmfd *shmfd; struct file *fp; char *path; void *rl_cookie; Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; bool largepage; if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; if ((shmflags & SHM_ALLOW_SEALING) != 0) initial_seals &= ~F_SEAL_SEAL; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); largepage = (shmflags & SHM_LARGEPAGE) != 0; if (largepage && !PMAP_HAS_LARGEPAGES) return (ENOTTY); /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. * If the decision is made later to allow additional seals, care must be * taken below to ensure that the seals are properly set if the shmfd * already existed -- this currently assumes that only F_SEAL_SEAL can * be set and doesn't take further precautions to ensure the validity of * the seals being added with respect to current mappings. */ if ((initial_seals & ~F_SEAL_SEAL) != 0) return (EINVAL); if (userpath != SHM_ANON) { error = shm_copyin_path(td, userpath, &path); if (error != 0) return (error); #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_NAMEI, path); if (IN_CAPABILITY_MODE(td)) { free(path, M_SHMFD); return (ECAPMODE); } #endif AUDIT_ARG_UPATH1_CANON(path); } else { path = NULL; } pdp = td->td_proc->p_pd; cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; /* * shm_open(2) created shm should always have O_CLOEXEC set, as mandated * by POSIX. We allow it to be unset here so that an in-kernel * interface may be written as a thin layer around shm, optionally not * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally * in sys_shm_open() to keep this implementation compliant. */ error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); if (error) { free(path, M_SHMFD); return (error); } /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. 
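 *
 * Illustrative userspace sketch (not part of this file): an anonymous
 * object is requested by passing SHM_ANON as the path and must be opened
 * read/write:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);	// OK
 *	int bad = shm_open(SHM_ANON, O_RDONLY, 0600);	// fails with EINVAL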
*/ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); /* * kern_shm_open() likely shouldn't ever error out on * trying to set a seal that already exists, unlike * F_ADD_SEALS. This would break terribly as * shm_open(2) actually sets F_SEAL_SEAL to maintain * historical behavior where the underlying file could * not be sealed. */ initial_seals &= ~shmfd->shm_seals; /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); /* * initial_seals can't set additional seals if we've * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, * then we've already removed that one from * initial_seals. This is currently redundant as we * only allow setting F_SEAL_SEAL at creation time, but * it's cheap to check and decreases the effort required * to allow additional seals. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && initial_seals != 0) error = EPERM; else if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else if (shmflags != 0 && shmflags != shmfd->shm_flags) error = EINVAL; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { VM_OBJECT_WLOCK(shmfd->shm_object); #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif error = shm_dotruncate_locked(shmfd, 0, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); } if (error == 0) { /* * Currently we only allow F_SEAL_SEAL to be * set initially. As noted above, this would * need to be reworked should that change. */ shmfd->shm_seals |= initial_seals; shm_hold(shmfd); } shm_rangelock_unlock(shmfd, rl_cookie); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. 
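 *
 * Illustrative userspace sketch (not part of this file) of the named-object
 * path handled by kern_shm_open2() above and sys_shm_unlink() below; the
 * path name is made up:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);
 *	// ... map and use the object ...
 *	shm_unlink("/myshm");	// name removed; object lives until last close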
*/ #ifdef COMPAT_FREEBSD12 int freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, NULL)); } #endif int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; error = shm_copyin_path(td, uap->path, &path); if (error != 0) return (error); AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_SHMFD); return (error); } int sys_shm_rename(struct thread *td, struct shm_rename_args *uap) { char *path_from = NULL, *path_to = NULL; Fnv32_t fnv_from, fnv_to; struct shmfd *fd_from; struct shmfd *fd_to; int error; int flags; flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Make sure the user passed only valid flags. * If you add a new flag, please add a new term here. */ if ((flags & ~( SHM_RENAME_NOREPLACE | SHM_RENAME_EXCHANGE )) != 0) { error = EINVAL; goto out; } /* * EXCHANGE and NOREPLACE don't quite make sense together. Let's * force the user to choose one or the other. */ if ((flags & SHM_RENAME_NOREPLACE) != 0 && (flags & SHM_RENAME_EXCHANGE) != 0) { error = EINVAL; goto out; } /* Renaming to or from anonymous makes no sense */ if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { error = EINVAL; goto out; } error = shm_copyin_path(td, uap->path_from, &path_from); if (error != 0) goto out; error = shm_copyin_path(td, uap->path_to, &path_to); if (error != 0) goto out; AUDIT_ARG_UPATH1_CANON(path_from); AUDIT_ARG_UPATH2_CANON(path_to); /* Rename with from/to equal is a no-op */ if (strcmp(path_from, path_to) == 0) goto out; fnv_from = fnv_32_str(path_from, FNV1_32_INIT); fnv_to = fnv_32_str(path_to, FNV1_32_INIT); sx_xlock(&shm_dict_lock); fd_from = shm_lookup(path_from, fnv_from); if (fd_from == NULL) { error = ENOENT; goto out_locked; } fd_to = shm_lookup(path_to, fnv_to); if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { error = EEXIST; goto out_locked; } /* * Unconditionally prevents shm_remove from invalidating the 'from' * shm's state. */ shm_hold(fd_from); error = shm_remove(path_from, fnv_from, td->td_ucred); /* * One of my assumptions failed if ENOENT (e.g. locking didn't * protect us) */ KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", path_from)); if (error != 0) { shm_drop(fd_from); goto out_locked; } /* * If we are exchanging, we need to ensure the shm_remove below * doesn't invalidate the dest shm's state. */ if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) shm_hold(fd_to); /* * NOTE: if path_to is not already in the hash, c'est la vie; * it simply means we have nothing already at path_to to unlink. * That is the ENOENT case. * * If we somehow don't have access to unlink this guy, but * did for the shm at path_from, then relink the shm to path_from * and abort with EACCES. * * All other errors: that is weird; let's relink and abort the * operation. 
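 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * shm_rename(2) interface implemented by this function; the path names
 * are made up:
 *
 *	#include <sys/mman.h>
 *
 *	shm_rename("/staging", "/live", 0);		// unlink "/live" if present
 *	shm_rename("/a", "/b", SHM_RENAME_NOREPLACE);	// EEXIST if "/b" exists
 *	shm_rename("/a", "/b", SHM_RENAME_EXCHANGE);	// atomically swap the names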
*/ error = shm_remove(path_to, fnv_to, td->td_ucred); if (error != 0 && error != ENOENT) { shm_insert(path_from, fnv_from, fd_from); shm_drop(fd_from); /* Don't free path_from now, since the hash references it */ path_from = NULL; goto out_locked; } error = 0; shm_insert(path_to, fnv_to, fd_from); /* Don't free path_to now, since the hash references it */ path_to = NULL; /* We kept a ref when we removed, and incremented again in insert */ shm_drop(fd_from); KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_from->shm_refs)); if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { shm_insert(path_from, fnv_from, fd_to); path_from = NULL; shm_drop(fd_to); KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_to->shm_refs)); } out_locked: sx_xunlock(&shm_dict_lock); out: free(path_from, M_SHMFD); free(path_to, M_SHMFD); return (error); } static int shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, vm_ooffset_t foff, struct thread *td) { struct vmspace *vms; vm_map_entry_t next_entry, prev_entry; vm_offset_t align, mask, maxaddr; int docow, error, rv, try; bool curmap; if (shmfd->shm_lp_psind == 0) return (EINVAL); /* MAP_PRIVATE is disabled */ if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | MAP_NOCORE | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); vms = td->td_proc->p_vmspace; curmap = map == &vms->vm_map; if (curmap) { error = kern_mmap_racct_check(td, map, size); if (error != 0) return (error); } docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; docow |= MAP_INHERIT_SHARE; if ((flags & MAP_NOCORE) != 0) docow |= MAP_DISABLE_COREDUMP; mask = pagesizes[shmfd->shm_lp_psind] - 1; if ((foff & mask) != 0) return (EINVAL); maxaddr = vm_map_max(map); if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) maxaddr = MAP_32BIT_MAX_ADDR; if (size == 0 || (size & mask) != 0 || (*addr != 0 && ((*addr & mask) != 0 || *addr + size < *addr || *addr + size > maxaddr))) return (EINVAL); align = flags & MAP_ALIGNMENT_MASK; if (align == 0) { align = pagesizes[shmfd->shm_lp_psind]; } else if (align == MAP_ALIGNED_SUPER) { if (shmfd->shm_lp_psind != 1) return (EINVAL); align = pagesizes[1]; } else { align >>= MAP_ALIGNMENT_SHIFT; align = 1ULL << align; /* Also handles overflow. 
*/ if (align < pagesizes[shmfd->shm_lp_psind]) return (EINVAL); } vm_map_lock(map); if ((flags & MAP_FIXED) == 0) { try = 1; if (curmap && (*addr == 0 || (*addr >= round_page((vm_offset_t)vms->vm_taddr) && *addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA))))) { *addr = roundup2((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA), pagesizes[shmfd->shm_lp_psind]); } again: rv = vm_map_find_aligned(map, addr, size, maxaddr, align); if (rv != KERN_SUCCESS) { if (try == 1) { try = 2; *addr = vm_map_min(map); if ((*addr & mask) != 0) *addr = (*addr + mask) & mask; goto again; } goto fail1; } } else if ((flags & MAP_EXCL) == 0) { rv = vm_map_delete(map, *addr, *addr + size); if (rv != KERN_SUCCESS) goto fail1; } else { error = ENOSPC; if (vm_map_lookup_entry(map, *addr, &prev_entry)) goto fail; next_entry = vm_map_entry_succ(prev_entry); if (next_entry->start < *addr + size) goto fail; } rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, prot, max_prot, docow); fail1: error = vm_mmap_to_errno(rv); fail: vm_map_unlock(map); return (error); } static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t max_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; bool writecnt; void *rl_cookie; shmfd = fp->f_data; maxprot = VM_PROT_NONE; rl_cookie = shm_rangelock_rlock(shmfd, 0, objsize); /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; /* * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared * mapping with a write seal applied. Private mappings are always * writeable. */ if ((flags & MAP_SHARED) == 0) { if ((max_maxprot & VM_PROT_WRITE) != 0) maxprot |= VM_PROT_WRITE; writecnt = false; } else { if ((fp->f_flag & FWRITE) != 0 && (shmfd->shm_seals & F_SEAL_WRITE) == 0) maxprot |= VM_PROT_WRITE; /* * Any mappings from a writable descriptor may be upgraded to * VM_PROT_WRITE with mprotect(2), unless a write-seal was * applied between the open and subsequent mmap(2). We want to * reject application of a write seal as long as any such * mapping exists so that the seal cannot be trivially bypassed. */ writecnt = (maxprot & VM_PROT_WRITE) != 0; if (!writecnt && (prot & VM_PROT_WRITE) != 0) { error = EACCES; goto out; } } maxprot &= max_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff > OFF_MAX - objsize) { error = EINVAL; goto out; } #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) goto out; #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); if (shm_largepage(shmfd)) { writecnt = false; error = shm_mmap_large(shmfd, map, addr, objsize, prot, maxprot, flags, foff, td); } else { if (writecnt) { vm_pager_update_writecount(shmfd->shm_object, 0, objsize); } error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, writecnt, td); } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, objsize); vm_object_deallocate(shmfd->shm_object); } out: shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. 
* Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. 
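 *
 * Sketch of the intended in-kernel usage (illustrative only): a consumer
 * holding a DTYPE_SHM file maps a region, uses it, and later unmaps exactly
 * the range it mapped:
 *
 *	void *mem;
 *	if (shm_map(fp, size, offset, &mem) == 0) {
 *		// ... access the wired memory at 'mem' ...
 *		shm_unmap(fp, mem, size);
 *	}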
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int shm_add_seals(struct file *fp, int seals) { struct shmfd *shmfd; void *rl_cookie; vm_ooffset_t writemappings; int error, nseals; error = 0; shmfd = fp->f_data; rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); /* Even already-set seals should result in EPERM. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { error = EPERM; goto out; } nseals = seals & ~shmfd->shm_seals; if ((nseals & F_SEAL_WRITE) != 0) { if (shm_largepage(shmfd)) { error = ENOTSUP; goto out; } /* * The rangelock above prevents writable mappings from being * added after we've started applying seals. The RLOCK here * is to avoid torn reads on ILP32 arches as unmapping/reducing * writemappings will be done without a rangelock. 
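 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * object was created with sealing enabled (F_SEAL_SEAL not set): a write
 * seal is refused while a writable shared mapping exists:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);	// fails with EBUSY
 *	munmap(p, len);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);	// now succeeds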
*/ VM_OBJECT_RLOCK(shmfd->shm_object); writemappings = shmfd->shm_object->un_pager.swp.writemappings; VM_OBJECT_RUNLOCK(shmfd->shm_object); /* kmappings are also writable */ if (writemappings > 0) { error = EBUSY; goto out; } } shmfd->shm_seals |= nseals; out: shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_get_seals(struct file *fp, int *seals) { struct shmfd *shmfd; shmfd = fp->f_data; *seals = shmfd->shm_seals; return (0); } static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags) { vm_object_t object; vm_pindex_t pistart, pi, piend; vm_ooffset_t off, len; int startofs, endofs, end; int error; off = *offset; len = *length; KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows")); if (off + len > shmfd->shm_size) len = shmfd->shm_size - off; object = shmfd->shm_object; startofs = off & PAGE_MASK; endofs = (off + len) & PAGE_MASK; pistart = OFF_TO_IDX(off); piend = OFF_TO_IDX(off + len); pi = OFF_TO_IDX(off + PAGE_MASK); error = 0; /* Handle the case when offset is on or beyond shm size. */ if ((off_t)len <= 0) { *length = 0; return (0); } VM_OBJECT_WLOCK(object); if (startofs != 0) { end = pistart != piend ? PAGE_SIZE : endofs; error = shm_partial_page_invalidate(object, pistart, startofs, end); if (error) goto out; off += end - startofs; len -= end - startofs; } if (pi < piend) { vm_object_page_remove(object, pi, piend, 0); off += IDX_TO_OFF(piend - pi); len -= IDX_TO_OFF(piend - pi); } if (endofs != 0 && pistart != piend) { error = shm_partial_page_invalidate(object, piend, 0, endofs); if (error) goto out; off += endofs; len -= endofs; } out: VM_OBJECT_WUNLOCK(shmfd->shm_object); *offset = off; *length = len; return (error); } static int shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; off_t off, len; int error; KASSERT(cmd == SPACECTL_DEALLOC, ("shm_fspacectl: Invalid cmd")); KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, ("shm_fspacectl: non-zero flags")); KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, ("shm_fspacectl: offset/length overflow or underflow")); error = EINVAL; shmfd = fp->f_data; off = *offset; len = *length; rl_cookie = shm_rangelock_wlock(shmfd, off, off + len); switch (cmd) { case SPACECTL_DEALLOC: if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; break; } error = shm_deallocate(shmfd, &off, &len, flags); *offset = off; *length = len; break; default: __assert_unreachable(); } shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; size_t size; int error; /* This assumes that the caller already checked for overflow. */ error = 0; shmfd = fp->f_data; size = offset + len; /* * Just grab the rangelock for the range that we may be attempting to * grow, rather than blocking read/write for regions we won't be * touching while this (potential) resize is in progress. Other * attempts to resize the shmfd will have to take a write lock from 0 to * OFF_MAX, so this being potentially beyond the current usable range of * the shmfd is not necessarily a concern. If other mechanisms are * added to grow a shmfd, this may need to be re-evaluated. 
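 *
 * Illustrative userspace sketch (not part of this file): posix_fallocate(2)
 * on a shared memory descriptor reaches this function through fo_fallocate
 * and grows the object if needed; an ENOMEM from the swap reservation is
 * reported as ENOSPC:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	int error = posix_fallocate(fd, 0, 1 << 20);	// 0, or ENOSPC on failure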
*/ rl_cookie = shm_rangelock_wlock(shmfd, offset, size); if (size > shmfd->shm_size) error = shm_dotruncate_cookie(shmfd, size, rl_cookie); shm_rangelock_unlock(shmfd, rl_cookie); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) error = ENOSPC; return (error); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) { error = 0; continue; } if (error != 0) break; pack_kinfo(&kif); error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); int kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode, struct filecaps *caps) { return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL)); } /* * This version of the shm_open() interface leaves CLOEXEC behavior up to the * caller, and libc will enforce it for the traditional shm_open() call. This * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. This * interface also includes a 'name' argument that is currently unused, but could * potentially be exported later via some interface for debugging purposes. * From the kernel's perspective, it is optional. Individual consumers like * memfd_create() may require it in order to be compatible with other systems * implementing the same function. */ int sys_shm_open2(struct thread *td, struct shm_open2_args *uap) { return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, uap->shmflags, NULL, uap->name)); } int shm_get_path(struct vm_object *obj, char *path, size_t sz) { struct shmfd *shmfd; int error; error = 0; shmfd = NULL; sx_slock(&shm_dict_lock); VM_OBJECT_RLOCK(obj); if ((obj->flags & OBJ_POSIXSHM) == 0) { error = EINVAL; } else { if (obj->type == shmfd_pager_type) shmfd = obj->un_pager.swp.swp_priv; else if (obj->type == OBJT_PHYS) shmfd = obj->un_pager.phys.phys_priv; if (shmfd == NULL) { error = ENXIO; } else { strlcpy(path, shmfd->shm_path == NULL ? "anon" : shmfd->shm_path, sz); } } if (error != 0) path[0] = '\0'; VM_OBJECT_RUNLOCK(obj); sx_sunlock(&shm_dict_lock); return (error); } diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 4ecd69d509ed..162c489ea6fe 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,5168 +1,5168 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. 
This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ktrace.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_generic_locked(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp); static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int flags); static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; -static struct filterops soread_filtops = { +static const struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; -static struct filterops sowrite_filtops = { +static const struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; -static struct filterops soempty_filtops = { +static const struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ 
MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) #ifdef COMPAT_FREEBSD32 #ifdef __amd64__ /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */ #define __splice32_packed __packed #else #define __splice32_packed #endif struct splice32 { int32_t sp_fd; int64_t sp_max; struct timeval32 sp_idle; } __splice32_packed; #undef __splice32_packed #endif /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. 
*/ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static int splice_init_state; static struct sx splice_init_lock; SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, "Settings relating to the SO_SPLICE socket option"); static bool splice_receive_stream = true; SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, &splice_receive_stream, 0, "Use soreceive_stream() for stream splices"); static uma_zone_t splice_zone; static struct proc *splice_proc; struct splice_wq { struct mtx mtx; STAILQ_HEAD(, so_splice) head; bool running; } __aligned(CACHE_LINE_SIZE); static struct splice_wq *splice_wq; static uint32_t splice_index = 0; static void so_splice_timeout(void *arg, int pending); static void so_splice_xfer(struct so_splice *s); static int so_unsplice(struct socket *so, bool timeout); static void splice_work_thread(void *ctx) { struct splice_wq *wq = ctx; struct so_splice *s, *s_temp; STAILQ_HEAD(, so_splice) local_head; int cpu; cpu = wq - splice_wq; if (bootverbose) printf("starting so_splice worker thread for CPU %d\n", cpu); for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_head); STAILQ_CONCAT(&local_head, &wq->head); STAILQ_INIT(&wq->head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { mtx_lock(&s->mtx); CURVNET_SET(s->src->so_vnet); so_splice_xfer(s); CURVNET_RESTORE(); } } } static void so_splice_dispatch_async(struct so_splice *sp) { struct splice_wq *wq; bool running; wq = &splice_wq[sp->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, sp, next); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } void so_splice_dispatch(struct so_splice *sp) { mtx_assert(&sp->mtx, MA_OWNED); if (sp->state != SPLICE_IDLE) { mtx_unlock(&sp->mtx); } else { sp->state = SPLICE_QUEUED; mtx_unlock(&sp->mtx); so_splice_dispatch_async(sp); } } static int splice_zinit(void *mem, int size __unused, int flags __unused) { struct so_splice *s; s = (struct so_splice *)mem; mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); return (0); } static void splice_zfini(void *mem, int size) { struct so_splice *s; s = (struct so_splice *)mem; mtx_destroy(&s->mtx); } static int splice_init(void) { struct thread *td; int error, i, state; state = atomic_load_acq_int(&splice_init_state); if (__predict_true(state > 0)) return (0); if (state < 0) return (ENXIO); sx_xlock(&splice_init_lock); if (splice_init_state != 0) { sx_xunlock(&splice_init_lock); return (0); } splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, M_WAITOK | M_ZERO); /* * Initialize the workqueues to run the splice work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&splice_wq[i].head); mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); } /* Start kthreads for each workqueue. */ error = 0; CPU_FOREACH(i) { error = kproc_kthread_add(splice_work_thread, &splice_wq[i], &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); if (error) { printf("Can't add so_splice thread %d error %d\n", i, error); break; } /* * It's possible to create loops with SO_SPLICE; ensure that * worker threads aren't able to starve the system too easily. 
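 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * SO_SPLICE option and a struct splice layout matching the splice32 compat
 * shim above (sp_fd, sp_max, sp_idle); in_fd/out_fd are made-up names:
 *
 *	#include <sys/socket.h>
 *
 *	struct splice sp = {
 *		.sp_fd = out_fd,	// destination socket
 *		.sp_max = 0,		// no byte limit
 *		.sp_idle = { 0, 0 },	// no idle timeout
 *	};
 *	setsockopt(in_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *	// later, to unsplice: sp.sp_fd = -1; setsockopt(in_fd, ...) again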
*/ thread_lock(td); sched_prio(td, PUSER); thread_unlock(td); } splice_init_state = error != 0 ? -1 : 1; sx_xunlock(&splice_init_lock); return (error); } /* * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding * one lock in order to avoid potential deadlocks in case there is some other * code path which acquires more than one I/O lock at a time. */ static void splice_lock_pair(struct socket *so_src, struct socket *so_dst) { int error; for (;;) { error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); KASSERT(error == 0, ("%s: failed to lock send I/O lock: %d", __func__, error)); error = SOCK_IO_RECV_LOCK(so_src, 0); KASSERT(error == 0 || error == EWOULDBLOCK, ("%s: failed to lock recv I/O lock: %d", __func__, error)); if (error == 0) break; SOCK_IO_SEND_UNLOCK(so_dst); error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); KASSERT(error == 0, ("%s: failed to lock recv I/O lock: %d", __func__, error)); error = SOCK_IO_SEND_LOCK(so_dst, 0); KASSERT(error == 0 || error == EWOULDBLOCK, ("%s: failed to lock send I/O lock: %d", __func__, error)); if (error == 0) break; SOCK_IO_RECV_UNLOCK(so_src); } } static void splice_unlock_pair(struct socket *so_src, struct socket *so_dst) { SOCK_IO_RECV_UNLOCK(so_src); SOCK_IO_SEND_UNLOCK(so_dst); } /* * Move data from the source to the sink. Assumes that both of the relevant * socket I/O locks are held. */ static int so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, ssize_t *lenp) { struct uio uio; struct mbuf *m; struct sockbuf *sb_src, *sb_dst; ssize_t len; long space; int error, flags; SOCK_IO_RECV_ASSERT_LOCKED(so_src); SOCK_IO_SEND_ASSERT_LOCKED(so_dst); error = 0; m = NULL; memset(&uio, 0, sizeof(uio)); sb_src = &so_src->so_rcv; sb_dst = &so_dst->so_snd; space = sbspace(sb_dst); if (space < 0) space = 0; len = MIN(max, MIN(space, sbavail(sb_src))); if (len == 0) { SOCK_RECVBUF_LOCK(so_src); if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) error = EPIPE; SOCK_RECVBUF_UNLOCK(so_src); } else { flags = MSG_DONTWAIT; uio.uio_resid = len; if (splice_receive_stream && sb_src->sb_tls_info == NULL) { error = soreceive_stream_locked(so_src, sb_src, NULL, &uio, &m, NULL, flags); } else { error = soreceive_generic_locked(so_src, NULL, &uio, &m, NULL, &flags); } if (error != 0 && m != NULL) { m_freem(m); m = NULL; } } if (m != NULL) { len -= uio.uio_resid; error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, MSG_DONTWAIT, curthread); } else if (error == 0) { len = 0; SOCK_SENDBUF_LOCK(so_dst); if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) error = EPIPE; SOCK_SENDBUF_UNLOCK(so_dst); } if (error == 0) *lenp = len; return (error); } /* * Transfer data from the source to the sink. * * If "direct" is true, the transfer is done in the context of whichever thread * is operating on one of the socket buffers. We do not know which locks are * held, so we can only trylock the socket buffers; if this fails, we fall back * to the worker thread, which invokes this routine with "direct" set to false. */ static void so_splice_xfer(struct so_splice *sp) { struct socket *so_src, *so_dst; off_t max; ssize_t len; int error; mtx_assert(&sp->mtx, MA_OWNED); KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, ("so_splice_xfer: invalid state %d", sp->state)); KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); if (sp->state == SPLICE_CLOSING) { /* Userspace asked us to close the splice. */ goto closing; } sp->state = SPLICE_RUNNING; so_src = sp->src; so_dst = sp->dst; max = sp->max > 0 ? 
sp->max - so_src->so_splice_sent : OFF_MAX; if (max < 0) max = 0; /* * Lock the sockets in order to block userspace from doing anything * sneaky. If an error occurs or one of the sockets can no longer * transfer data, we will automatically unsplice. */ mtx_unlock(&sp->mtx); splice_lock_pair(so_src, so_dst); error = so_splice_xfer_data(so_src, so_dst, max, &len); mtx_lock(&sp->mtx); /* * Update our stats while still holding the socket locks. This * synchronizes with getsockopt(SO_SPLICE), see the comment there. */ if (error == 0) { KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); so_src->so_splice_sent += len; } splice_unlock_pair(so_src, so_dst); switch (sp->state) { case SPLICE_CLOSING: closing: sp->state = SPLICE_CLOSED; wakeup(sp); mtx_unlock(&sp->mtx); break; case SPLICE_RUNNING: if (error != 0 || (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { sp->state = SPLICE_EXCEPTION; soref(so_src); mtx_unlock(&sp->mtx); (void)so_unsplice(so_src, false); sorele(so_src); } else { /* * Locklessly check for additional bytes in the source's * receive buffer and queue more work if possible. We * may end up queuing needless work, but that's ok, and * if we race with a thread inserting more data into the * buffer and observe sbavail() == 0, the splice mutex * ensures that splice_push() will queue more work for * us. */ if (sbavail(&so_src->so_rcv) > 0 && sbspace(&so_dst->so_snd) > 0) { sp->state = SPLICE_QUEUED; mtx_unlock(&sp->mtx); so_splice_dispatch_async(sp); } else { sp->state = SPLICE_IDLE; mtx_unlock(&sp->mtx); } } break; default: __assert_unreachable(); } } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. 
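 *
 * (Illustrative aside: because the handler below only accepts increases,
 * and only up to maxfiles, a tuning command such as
 *
 *     sysctl kern.ipc.maxsockets=262144
 *
 * succeeds only if the new value is larger than the current limit and does
 * not exceed kern.maxfiles.)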
*/ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); mtx_destroy(&so->so_snd_mtx); mtx_destroy(&so->so_rcv_mtx); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1 and a file descriptor * reference. The socket should be closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; /* * XXX: divert(4) historically abused PF_INET. Keep this compatibility * shim until all applications have been updated. */ if (__predict_false(dom == PF_INET && type == SOCK_RAW && proto == IPPROTO_DIVERT)) { dom = PF_DIVERT; printf("%s uses obsolete way to create divert(4) socket\n", td->td_proc->p_comm); } prp = pffindproto(dom, type, proto); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } MPASS(prp->pr_attach); if ((prp->pr_flags & PR_CAPATTACH) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_PROTO, &proto); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); } if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); if ((prp->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. 
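 *
 * (Illustrative aside: a protocol opts in by setting the auto-sizing flag
 * on its buffers in pr_attach; an application that wants a fixed size can
 * typically pin it from userland instead, e.g.
 *
 *     int sz = 256 * 1024;
 *     setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
 *
 * subject to the kern.ipc.maxsockbuf limit.)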
 */
    CURVNET_SET(so->so_vnet);
    error = prp->pr_attach(so, proto, td);
    CURVNET_RESTORE();
    if (error) {
        sodealloc(so);
        return (error);
    }
    soref(so);
    *aso = so;
    return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0,
    "Log priority for listen socket overflows: 0..7 or -1 to disable");

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call legacy sonewconn() function, which would call protocol attach
 *    method, same as used for socket(2).
 * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
 *    and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
solisten_clone(struct socket *head)
{
    struct sbuf descrsb;
    struct socket *so;
    int len, overcount;
    u_int qlen;
    const char localprefix[] = "local:";
    char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
    char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
    char addrbuf[INET_ADDRSTRLEN];
#endif
    bool dolog, over;

    SOLISTEN_LOCK(head);
    over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
    if (regression_sonewconn_earlytest && over) {
#else
    if (over) {
#endif
        head->sol_overcount++;
        dolog = (sooverprio >= 0) &&
            !!ratecheck(&head->sol_lastover, &overinterval);

        /*
         * If we're going to log, copy the overflow count and queue
         * length from the listen socket before dropping the lock.
         * Also, reset the overflow count.
         */
        if (dolog) {
            overcount = head->sol_overcount;
            head->sol_overcount = 0;
            qlen = head->sol_qlen;
        }
        SOLISTEN_UNLOCK(head);

        if (dolog) {
            /*
             * Try to print something descriptive about the
             * socket for the error message.
             */
            sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
                SBUF_FIXEDLEN);
            switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
            case AF_INET:
#endif
#ifdef INET6
            case AF_INET6:
                if (head->so_proto->pr_domain->dom_family ==
                    AF_INET6 ||
                    (sotoinpcb(head)->inp_inc.inc_flags &
                    INC_ISIPV6)) {
                    ip6_sprintf(addrbuf,
                        &sotoinpcb(head)->inp_inc.inc6_laddr);
                    sbuf_printf(&descrsb, "[%s]", addrbuf);
                } else
#endif
                {
#ifdef INET
                    inet_ntoa_r(
                        sotoinpcb(head)->inp_inc.inc_laddr,
                        addrbuf);
                    sbuf_cat(&descrsb, addrbuf);
#endif
                }
                sbuf_printf(&descrsb, ":%hu (proto %u)",
                    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
                    head->so_proto->pr_protocol);
                break;
#endif /* INET || INET6 */
            case AF_UNIX:
                sbuf_cat(&descrsb, localprefix);
                if (sotounpcb(head)->unp_addr != NULL)
                    len =
                        sotounpcb(head)->unp_addr->sun_len -
                        offsetof(struct sockaddr_un, sun_path);
                else
                    len = 0;
                if (len > 0)
                    sbuf_bcat(&descrsb,
                        sotounpcb(head)->unp_addr->sun_path,
                        len);
                else
                    sbuf_cat(&descrsb, "(unknown)");
                break;
            }

            /*
             * If we can't print something more specific, at least
             * print the domain name.
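 *
 * (For illustration only: with the format used below, a logged overflow
 * reads roughly like
 *
 *     sonewconn: pcb 0xfffff80012345678 (192.0.2.10:80 (proto 6)):
 *         Listen queue overflow: 193 already in queue awaiting acceptance
 *         (4 occurrences)
 *
 * possibly followed by credential and jail information; the numbers here
 * are made up for the example.)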
*/ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); /* * Preserve the historic listen queue overflow log * message, that starts with "sonewconn:". It has * been known to sysadmins for years and also test * sys/kern/sonewconn_overflow checks for it. */ if (head->so_cred == 0) { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: %i already in " "queue awaiting acceptance (%d " "occurrences)\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount); } else { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences), euid %d, rgid %d, jail %s\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount, head->so_cred->cr_uid, head->so_cred->cr_rgid, head->so_cred->cr_prison ? head->so_cred->cr_prison->pr_name : "not_jailed"); } sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; /* * POSIX is ambiguous on what options an accept(2)ed socket should * inherit from the listener. Words "create a new socket" may be * interpreted as not inheriting anything. Best programming practice * for application developers is to not rely on such inheritance. * FreeBSD had historically inherited all so_options excluding * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, * including those completely irrelevant to a new born socket. For * compatibility with older versions we will inherit a list of * meaningful options. */ so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); so->so_linger = head->so_linger; so->so_state = head->so_state; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } return (so); } /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. 
*/ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; if ((so = solisten_clone(head)) == NULL) return (NULL); if (so->so_proto->pr_attach(so, 0, NULL) != 0) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", __func__, head->so_pcb); return (NULL); } (void)solisten_enqueue(so, connstatus); return (so); } /* * Enqueue socket cloned by solisten_clone() to the listen queue of the * listener it has been cloned from. * * Return 'true' if socket landed on complete queue, otherwise 'false'. */ bool solisten_enqueue(struct socket *so, int connstatus) { struct socket *head = so->so_listen; MPASS(refcount_load(&so->so_count) == 0); refcount_init(&so->so_count, 1); SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ return (true); } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); return (false); } } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. * * XXXGL: reduce copy-paste with solisten_clone(). 
*/ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bind(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bindat(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_listen(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. 
*/ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue() and KTLS. */ if (!SOLISTENING(so)) { bool ktls; #ifdef KERN_TLS ktls = so->so_snd.sb_tls_info != NULL || so->so_rcv.sb_tls_info != NULL; #else ktls = false; #endif if (ktls || (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { solisten_proto_abort(so); return (EINVAL); } } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; #ifdef MAC mac_socketpeer_label_free(so->so_peerlabel); #endif sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. The socket reference held by the * listen queue is handed to the caller. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
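 *
 * (Illustrative sketch of a caller, in the style of kern_accept4() described
 * above:
 *
 *     SOLISTEN_LOCK(head);
 *     error = solisten_dequeue(head, &so, ACCEPT4_INHERIT);
 *     if (error != 0)
 *             return (error);
 *     ... use 'so', then drop the reference handed over by the listen queue ...
 *
 * solisten_dequeue() returns with 'head' unlocked in all cases.)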
*/ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } static struct so_splice * so_splice_alloc(off_t max) { struct so_splice *sp; sp = uma_zalloc(splice_zone, M_WAITOK); sp->src = NULL; sp->dst = NULL; sp->max = max > 0 ? max : -1; do { sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % (mp_maxid + 1); } while (CPU_ABSENT(sp->wq_index)); sp->state = SPLICE_IDLE; TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, sp); return (sp); } static void so_splice_free(struct so_splice *sp) { KASSERT(sp->state == SPLICE_CLOSED, ("so_splice_free: sp %p not closed", sp)); uma_zfree(splice_zone, sp); } static void so_splice_timeout(void *arg, int pending __unused) { struct so_splice *sp; sp = arg; (void)so_unsplice(sp->src, true); } /* * Splice the output from so to the input of so2. */ static int so_splice(struct socket *so, struct socket *so2, struct splice *splice) { struct so_splice *sp; int error; if (splice->sp_max < 0) return (EINVAL); /* Handle only TCP for now; TODO: other streaming protos */ if (so->so_proto->pr_protocol != IPPROTO_TCP || so2->so_proto->pr_protocol != IPPROTO_TCP) return (EPROTONOSUPPORT); if (so->so_vnet != so2->so_vnet) return (EINVAL); /* so_splice_xfer() assumes that we're using these implementations. 
*/ KASSERT(so->so_proto->pr_sosend == sosend_generic, ("so_splice: sosend not sosend_generic")); KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || so2->so_proto->pr_soreceive == soreceive_stream, ("so_splice: soreceive not soreceive_generic/stream")); sp = so_splice_alloc(splice->sp_max); so->so_splice_sent = 0; sp->src = so; sp->dst = so2; error = 0; SOCK_LOCK(so); if (SOLISTENING(so)) error = EINVAL; else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) error = ENOTCONN; else if (so->so_splice != NULL) error = EBUSY; if (error != 0) { SOCK_UNLOCK(so); uma_zfree(splice_zone, sp); return (error); } soref(so); so->so_splice = sp; SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags |= SB_SPLICED; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); error = 0; SOCK_LOCK(so2); if (SOLISTENING(so2)) error = EINVAL; else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) error = ENOTCONN; else if (so2->so_splice_back != NULL) error = EBUSY; if (error != 0) { SOCK_UNLOCK(so2); SOCK_LOCK(so); so->so_splice = NULL; SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags &= ~SB_SPLICED; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); sorele(so); uma_zfree(splice_zone, sp); return (error); } soref(so2); so2->so_splice_back = sp; SOCK_SENDBUF_LOCK(so2); so2->so_snd.sb_flags |= SB_SPLICED; mtx_lock(&sp->mtx); SOCK_SENDBUF_UNLOCK(so2); SOCK_UNLOCK(so2); if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, tvtosbt(splice->sp_idle), 0, C_PREL(4)); } /* * Transfer any data already present in the socket buffer. */ sp->state = SPLICE_QUEUED; so_splice_xfer(sp); return (0); } static int so_unsplice(struct socket *so, bool timeout) { struct socket *so2; struct so_splice *sp; bool drain; /* * First unset SB_SPLICED and hide the splice structure so that * wakeup routines will stop enqueuing work. This also ensures that * a only a single thread will proceed with the unsplice. */ SOCK_LOCK(so); if (SOLISTENING(so)) { SOCK_UNLOCK(so); return (EINVAL); } SOCK_RECVBUF_LOCK(so); if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); return (ENOTCONN); } so->so_rcv.sb_flags &= ~SB_SPLICED; sp = so->so_splice; so->so_splice = NULL; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); so2 = sp->dst; SOCK_LOCK(so2); KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); SOCK_SENDBUF_LOCK(so2); KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, ("%s: so2 is not spliced", __func__)); KASSERT(so2->so_splice_back == sp, ("%s: so_splice_back != sp", __func__)); so2->so_snd.sb_flags &= ~SB_SPLICED; so2->so_splice_back = NULL; SOCK_SENDBUF_UNLOCK(so2); SOCK_UNLOCK(so2); /* * No new work is being enqueued. The worker thread might be * splicing data right now, in which case we want to wait for it to * finish before proceeding. */ mtx_lock(&sp->mtx); switch (sp->state) { case SPLICE_QUEUED: case SPLICE_RUNNING: sp->state = SPLICE_CLOSING; while (sp->state == SPLICE_CLOSING) msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); break; case SPLICE_IDLE: case SPLICE_EXCEPTION: sp->state = SPLICE_CLOSED; break; default: __assert_unreachable(); } if (!timeout) { drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, NULL) != 0; } else { drain = false; } mtx_unlock(&sp->mtx); if (drain) taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); /* * Now we hold the sole reference to the splice structure. * Clean up: signal userspace and release socket references. 
*/ sorwakeup(so); CURVNET_SET(so->so_vnet); sorele(so); sowwakeup(so2); sorele(so2); CURVNET_RESTORE(); so_splice_free(sp); return (0); } /* * Free socket upon release of the very last reference. */ static void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); KASSERT(refcount_load(&so->so_count) == 0, ("%s: so %p has references", __func__, so)); KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, ("%s: so %p is on listen queue", __func__, so)); KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, ("%s: so %p rcvbuf is spliced", __func__, so)); KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, ("%s: so %p sndbuf is spliced", __func__, so)); KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, ("%s: so %p has spliced data", __func__, so)); SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } if (pr->pr_detach != NULL) pr->pr_detach(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. */ if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. 
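 *
 * (Illustrative aside: the lingering close handled below is what a process
 * enables with, e.g.,
 *
 *     struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *     setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *     close(s);
 *
 * in which case soclose() sleeps for up to l_linger seconds (wchan "soclos")
 * waiting for the disconnect to complete, unless the socket is non-blocking.)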
*/ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_close != NULL) so->so_proto->pr_close(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) soabort(sp); } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on. Likely this thread holds the last * reference on the socket and soabort() will proceed with sofree(). But * it might be not the last, as the sockets on the listen queues are seen * from the protocol side. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. * * Usually socket should have a single reference left, but this is not a * requirement. In the past, when we have had named references for file * descriptor and protocol, we asserted that none of them are being held. */ void soabort(struct socket *so) { VNET_SO_ASSERT(so); if (so->so_proto->pr_abort != NULL) so->so_proto->pr_abort(so); SOCK_LOCK(so); sorele_locked(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_accept(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. * * Note, this check is racy and may need to be re-evaluated at the * protocol layer. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. 
*/ so->so_error = 0; if (fd == AT_FDCWD) { error = so->so_proto->pr_connect(so, nam, td); } else { error = so->so_proto->pr_connectat(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = so1->so_proto->pr_connect2(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = so->so_proto->pr_disconnect(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. 
We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pr_send_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_send_flag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif SOCK_IO_SEND_ASSERT_LOCKED(so); if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. 
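 *
 * (Illustrative: a caller doing send(s, buf, len, MSG_EOR) on a SOCK_STREAM
 * socket is therefore rejected with EINVAL by the check below.)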
*/ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_send_flag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; goto out; } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto out; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto out; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto out; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto out; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. 
We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pr_send_flag |= tls_send_flag; #endif error = so->so_proto->pr_send(so, pr_send_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto out; } while (resid && space > 0); } while (resid); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = sosend_generic_locked(so, addr, uio, top, control, flags, td); SOCK_IO_SEND_UNLOCK(so); return (error); } /* * Send to a socket from a kernel thread. * * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs * to be set at all. This function should just boil down to a static inline * calling the protocol method. */ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * send(2), write(2) or aio_write(2) on a socket. */ int sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *control, int flags, struct proc *userproc) { struct thread *td; ssize_t len; int error; td = uio->uio_td; len = uio->uio_resid; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, td); CURVNET_RESTORE(); if (error != 0) { /* * Clear transient errors for stream protocols if they made * some progress. Make exclusion for aio(4) that would * schedule a new write in case of EWOULDBLOCK and clear * error itself. See soaio_process_job(). */ if (uio->uio_resid != len && (so->so_proto->pr_flags & PR_ATOMIC) == 0 && userproc == NULL && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && (flags & MSG_NOSIGNAL) == 0) { if (userproc != NULL) { /* aio(4) job */ PROC_LOCK(userproc); kern_psignal(userproc, SIGPIPE); PROC_UNLOCK(userproc); } else { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } } return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. 
* * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ static int soreceive_generic_locked(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { struct mbuf *m; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; SOCK_IO_RECV_ASSERT_LOCKED(so); error = 0; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. 
MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0 && !report_real_len) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for an alert record. * If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. 
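 *
 * (Illustrative sketch: a KTLS-aware consumer can request only application
 * data with something like
 *
 *     n = recv(s, buf, sizeof(buf), MSG_TLSAPPDATA);
 *
 * and, on an ENXIO return, repeat the call without MSG_TLSAPPDATA to pick up
 * the pending alert record and its TLS_GET_RECORD control message.)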
* Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); if (__predict_false(tgr.tls_type == TLS_RLTYPE_ALERT)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. 
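 * (uiomove() may fault on the user buffer and sleep, which is why the
 * sockbuf lock is dropped around the copy below and re-acquired before
 * the sockbuf pointers are examined again.)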
*/ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); if (__predict_false(so->so_rcv.sb_mb == NULL && (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE))) break; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (report_real_len) uio->uio_resid -= m_length(m, NULL) - moff; flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). 
Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: return (error); } int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { int error, flags; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) { flags = *flagsp; if ((flags & MSG_OOB) != 0) return (soreceive_rcvoob(so, uio, flags)); } else { flags = 0; } if (mp != NULL) *mp = NULL; if ((so->so_proto->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, 0); } error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int flags) { int len = 0, error = 0, oresid; struct mbuf *m, *n = NULL; SOCK_IO_RECV_ASSERT_LOCKED(so); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) return (EINVAL); oresid = uio->uio_resid; SOCKBUF_LOCK(sb); /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. 
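 * (sbwait() releases the sockbuf lock while sleeping and re-acquires it
 * before returning, so on wakeup control jumps back to the restart label
 * and re-evaluates the socket state from scratch.)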
*/ error = sbwait(so, SO_RCV); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); return (error); } int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct sockbuf *sb; int error, flags; sb = &so->so_rcv; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp & ~MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* * Prevent other threads from reading from the socket. This lock may be * dropped in order to sleep waiting for data to arrive. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); #ifdef KERN_TLS if (__predict_false(sb->sb_tls_info != NULL)) { SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. 
* Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. 
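 * For SCM_RIGHTS-style messages the dom_externalize hook is what turns
 * in-kernel file references into descriptors in the receiving process;
 * note below that it is invoked even when controlp is NULL so that it can
 * dispose of those references rather than leak them.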
*/ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; SOCK_LOCK(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { SOCK_UNLOCK(so); return (ENOTCONN); } soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } else { SOCK_UNLOCK(so); } goto done; } SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); pr = so->so_proto; if (pr->pr_flush != NULL) pr->pr_flush(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = pr->pr_shutdown(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct protosw *pr; int error; VNET_SO_ASSERT(so); /* * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } pr = so->so_proto; if (pr->pr_flags & PR_RIGHTS) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } else { sbrelease(so, SO_RCV); SOCK_IO_RECV_UNLOCK(so); } } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. 
*/ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val, *valp; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = 
so->so_proto->pr_setsbopt(so, sopt); if (error) goto bad; break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); SOCK_LOCK(so); valp = sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? &so->sol_sbsnd_timeo : &so->so_snd.sb_timeo) : (SOLISTENING(so) ? &so->sol_sbrcv_timeo : &so->so_rcv.sb_timeo); *valp = val; SOCK_UNLOCK(so); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; case SO_SPLICE: { struct splice splice; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct splice32 splice32; error = sooptcopyin(sopt, &splice32, sizeof(splice32), sizeof(splice32)); if (error == 0) { splice.sp_fd = splice32.sp_fd; splice.sp_max = splice32.sp_max; CP(splice32.sp_idle, splice.sp_idle, tv_sec); CP(splice32.sp_idle, splice.sp_idle, tv_usec); } } else #endif { error = sooptcopyin(sopt, &splice, sizeof(splice), sizeof(splice)); } if (error) goto bad; #ifdef KTRACE if (KTRPOINT(curthread, KTR_STRUCT)) ktrsplice(&splice); #endif error = splice_init(); if (error != 0) goto bad; if (splice.sp_fd >= 0) { struct file *fp; struct socket *so2; if (!cap_rights_contains(sopt->sopt_rights, &cap_recv_rights)) { error = ENOTCAPABLE; goto bad; } error = getsock(sopt->sopt_td, splice.sp_fd, &cap_send_rights, &fp); if (error != 0) goto bad; so2 = fp->f_data; error = so_splice(so, so2, &splice); fdrop(fp, sopt->sopt_td); } else { error = so_unsplice(so, false); } break; } default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. 
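 * As a concrete illustration (not part of this change): fetching a 4-byte
 * integer option with a 2-byte buffer copies out only the first two bytes
 * and sets sopt_valsize to 2, so the caller observes a silently truncated
 * value rather than an error.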
*/ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; SOCK_UNLOCK(so); goto integer; case SO_RCVBUF: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; SOCK_UNLOCK(so); goto integer; case SO_SNDLOWAT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; SOCK_UNLOCK(so); goto integer; case SO_RCVLOWAT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; SOCK_UNLOCK(so); goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: SOCK_LOCK(so); tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? so->sol_sbsnd_timeo : so->so_snd.sb_timeo) : (SOLISTENING(so) ? so->sol_sbrcv_timeo : so->so_rcv.sb_timeo)); SOCK_UNLOCK(so); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_qlimit : 0; SOCK_UNLOCK(so); goto integer; case SO_LISTENQLEN: SOCK_LOCK(so); optval = SOLISTENING(so) ? 
so->sol_qlen : 0; SOCK_UNLOCK(so); goto integer; case SO_LISTENINCQLEN: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_incqlen : 0; SOCK_UNLOCK(so); goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; case SO_SPLICE: { off_t n; /* * Acquire the I/O lock to serialize with * so_splice_xfer(). This is not required for * correctness, but makes testing simpler: once a byte * has been transmitted to the sink and observed (e.g., * by reading from the socket to which the sink is * connected), a subsequent getsockopt(SO_SPLICE) will * return an up-to-date value. */ error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); if (error != 0) goto bad; SOCK_LOCK(so); if (SOLISTENING(so)) { n = 0; } else { n = so->so_splice_sent; } SOCK_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); error = sooptcopyout(sopt, &n, sizeof(n)); break; } default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } bad: CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. 
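 * A userland consumer typically arranges for this notification ahead of
 * time; a minimal sketch (illustrative only, not part of this change):
 *
 *	signal(SIGURG, urg_handler);
 *	fcntl(s, F_SETOWN, getpid());
 *
 * after which pgsigio() below delivers SIGURG, or the process can simply
 * wait for POLLPRI/exceptfds readiness and read the mark with
 * recv(s, &b, 1, MSG_OOB).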
*/ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so) && !isspliced(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so) && !issplicedback(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; sb_which which; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; which = SO_RCV; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) return (0); SOCK_RECVBUF_LOCK_ASSERT(so); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return 
(1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. 
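 * In short, the sequences described above are:
 *
 *	active open:	connect(2) -> soisconnecting() ... soisconnected()
 *	teardown:	disconnect -> soisdisconnecting() ... soisdisconnected()
 *	passive open:	sonewconn() queues the socket on so_incomp and a
 *			later soisconnected() moves it to so_comp, where
 *			accept(2) can claim it.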
*/ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCK_RECVBUF_LOCK(so); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCK_RECVBUF_UNLOCK(so); goto again; } SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 
*/ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, sb_which which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_RECVBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_RECVBUF_UNLOCK(so); goto retry; } } } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_RECVBUF_UNLOCK(so); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_RECVBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_RECVBUF_UNLOCK_ASSERT(so); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_SENDBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_SENDBUF_UNLOCK(so); goto retry; } } } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_SENDBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_SENDBUF_UNLOCK_ASSERT(so); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
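 * (Consumers of the xsocket form are expected to be monitoring tools,
 * e.g. netstat(1) and sockstat(1) reading the pcblist sysctls.)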
*/ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; SOCK_LOCK(so); xso->so_fibnum = so->so_fibnum; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) xso->so_splice_so = (uintptr_t)so->so_splice->dst; } SOCK_UNLOCK(so); } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index fd6682ef03b0..c5b0c7896a17 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1,3164 +1,3164 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. 
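 * For reference, the userland face of this facility is the standard POSIX
 * AIO API; a minimal usage sketch (illustrative only, not part of this
 * change, with fd and buf assumed to exist) looks like:
 *
 *	struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf,
 *	    .aio_nbytes = sizeof(buf), .aio_offset = 0 };
 *
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;
 *	ssize_t n = aio_return(&cb);
 *
 * (A real program would block in aio_suspend() or request a sigevent
 * instead of spinning on aio_error().)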
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIO, "aio", "structures for asynchronous I/O"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. 
*/ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qbio() complete asynchronously and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. 
* * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* ioflags */ #define KAIOCB_IO_FOFFSET 0x01 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) count of jobs */ int lioj_finished_count; /* (a) count of finished jobs */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of bio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. 
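 * (For example, the native ABI, the COMPAT_FREEBSD6 signal-event shim and,
 * when configured, the COMPAT_FREEBSD32 layer each provide their own
 * copyin/copyout and status/error store routines for their aiocb layouts.)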
*/ struct aiocb_ops { int (*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; static void aio_biocleanup(struct bio *bp); void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ -static struct filterops aio_filtops = { +static const struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; -static struct filterops lio_filtops = { +static const struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. 
*/ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= ext ? (KSI_EXT | KSI_INS) : 0; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. 
*/ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); if (job->uiop != &job->uio) freeuio(job->uiop); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. 
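 *
 * This runs when the owning process exits or execs: aio_onceonly()
 * registers aio_proc_rundown() for process_exit and
 * aio_proc_rundown_exec() above for process_exec.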
*/ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync and fdatasync syscalls. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp, int op) { struct mount *mp; int error; for (;;) { error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) break; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vnode_pager_clean_async(vp); if (op == LIO_DSYNC) error = VOP_FDATASYNC(vp, td); else error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != ERELOOKUP) break; } return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-bio version of the operations. The normal * vn operations are used, and this code should work in all instances for every * type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. 
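 *
 * The LIO_READV/LIO_WRITEV cases are reached through aio_readv() and
 * aio_writev(), where aio_iov/aio_iovcnt take the place of
 * aio_buf/aio_nbytes.  An illustrative userland fragment (hdr, data and
 * datalen are placeholders):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = data, .iov_len = datalen },
 *	};
 *	cb.aio_iov = iov;
 *	cb.aio_iovcnt = 2;
 *	if (aio_writev(&cb) == -1)
 *		err(1, "aio_writev");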
*/ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct file *fp; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error, opcode; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_READV || job->uaiocb.aio_lio_opcode == LIO_WRITE || job->uaiocb.aio_lio_opcode == LIO_WRITEV, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; job->uiop->uio_td = td; fp = job->fd_file; opcode = job->uaiocb.aio_lio_opcode; cnt = job->uiop->uio_resid; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (opcode == LIO_READ || opcode == LIO_READV) { if (job->uiop->uio_resid == 0) error = 0; else error = fo_read(fp, job->uiop, fp->f_cred, (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); error = fo_write(fp, job->uiop, fp->f_cred, (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if (error != 0 && job->uiop->uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if (error == EPIPE && (opcode & LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= job->uiop->uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) { error = aio_fsync_vnode(td, fp->f_vnode, job->uaiocb.aio_lio_opcode); } td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? 
-1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi, true); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. 
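 *
 * A sketch of the pattern the backends in this file follow, mirroring
 * aio_schedule() and aio_cancel_daemon_job() below (backend_lock,
 * backend_queue and backend_cancel are illustrative names):
 *
 *	mtx_lock(&backend_lock);
 *	if (!aio_set_cancel_function(job, backend_cancel)) {
 *		mtx_unlock(&backend_lock);
 *		aio_cancel(job);
 *		return;
 *	}
 *	TAILQ_INSERT_TAIL(&backend_queue, job, list);
 *	mtx_unlock(&backend_lock);
 *
 * and, in backend_cancel():
 *
 *	mtx_lock(&backend_lock);
 *	if (!aio_cancel_cleared(job))
 *		TAILQ_REMOVE(&backend_queue, job, list);
 *	mtx_unlock(&backend_lock);
 *	aio_cancel(job);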
*/ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = malloc(sizeof(*aiop), M_AIO, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. 
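 *
 * Note that aiod_lifetime is expressed in scheduler ticks: it is passed
 * straight to msleep() as the timeout below and is seeded with
 * AIOD_LIFETIME_DEFAULT in aio_onceonly(), so the vfs.aio.aiod_lifetime
 * sysctl is tick-valued as well.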
*/ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); free(aiop, M_AIO); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(refcount_load(&myvm->vm_refcnt) > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", refcount_load(&myvm->vm_refcnt))); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead bio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qbio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; struct bio **bios = NULL; off_t offset; int bio_cmd, error, i, iovcnt, opcode, poff, ref; vm_prot_t prot; bool use_unmapped; cb = &job->uaiocb; fp = job->fd_file; opcode = cb->aio_lio_opcode; if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV || opcode == LIO_READ || opcode == LIO_READV)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); bio_cmd = (opcode & LIO_WRITE) ? 
BIO_WRITE : BIO_READ; iovcnt = job->uiop->uio_iovcnt; if (iovcnt > max_buf_aio) return (-1); for (i = 0; i < iovcnt; i++) { if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0) return (-1); if (job->uiop->uio_iov[i].iov_len > maxphys) { error = -1; return (-1); } } offset = cb->aio_offset; ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (job->uiop->uio_resid > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; job->error = 0; use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed; if (!use_unmapped) { AIO_LOCK(ki); if (ki->kaio_buffer_count + iovcnt > max_buf_aio) { AIO_UNLOCK(ki); error = EAGAIN; goto unref; } ki->kaio_buffer_count += iovcnt; AIO_UNLOCK(ki); } bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK); refcount_init(&job->nbio, iovcnt); for (i = 0; i < iovcnt; i++) { struct vm_page** pages; struct bio *bp; void *buf; size_t nbytes; int npages; buf = job->uiop->uio_iov[i].iov_base; nbytes = job->uiop->uio_iov[i].iov_len; bios[i] = g_alloc_bio(); bp = bios[i]; poff = (vm_offset_t)buf & PAGE_MASK; if (use_unmapped) { pbuf = NULL; pages = malloc(sizeof(vm_page_t) * (atop(round_page( nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); } else { pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); pages = pbuf->b_pages; } bp->bio_length = nbytes; bp->bio_bcount = nbytes; bp->bio_done = aio_biowakeup; bp->bio_offset = offset; bp->bio_cmd = bio_cmd; bp->bio_dev = dev; bp->bio_caller1 = job; bp->bio_caller2 = pbuf; prot = VM_PROT_READ; if (opcode == LIO_READ || opcode == LIO_READV) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)buf, bp->bio_length, prot, pages, atop(maxphys) + 1); if (npages < 0) { if (pbuf != NULL) uma_zfree(pbuf_zone, pbuf); else free(pages, M_TEMP); error = EFAULT; g_destroy_bio(bp); i--; goto destroy_bios; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); bp->bio_data = pbuf->b_data + poff; pbuf->b_npages = npages; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } offset += nbytes; } /* Perform transfer. */ for (i = 0; i < iovcnt; i++) csw->d_strategy(bios[i]); free(bios, M_TEMP); dev_relthread(dev, ref); return (0); destroy_bios: for (; i >= 0; i--) aio_biocleanup(bios[i]); free(bios, M_TEMP); unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
*/ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb *ojob; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, kcb, sizeof(struct oaiocb)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ ojob = (struct oaiocb *)kcb; return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb *kcb = &kjob->uaiocb; int error; error = copyin(ujob, kcb, sizeof(struct aiocb)); if (error) return (error); if (type == LIO_NOP) type = kcb->aio_lio_opcode; if (type & LIO_VECTORED) { /* malloc a uio and copy in the iovec */ error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov), kcb->aio_iovcnt, &kjob->uiop); } return (error); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .aio_copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .aio_copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. 
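 *
 * Completion can also be collected through kqueue(2); an illustrative
 * userland pairing for the SIGEV_KEVENT path registered below (error
 * handling omitted, cb is the caller's aiocb, <sys/event.h> assumed):
 *
 *	int kq = kqueue();
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	aio_write(&cb);
 *
 *	struct kevent ev;
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	ssize_t done = aio_return((struct aiocb *)ev.ident);
 *
 * where ev.ident is the userland aiocb pointer registered below and
 * kn_data of the EVFILT_AIO knote carries the request's error value.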
*/ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct file *fp = NULL; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { error = EAGAIN; goto err1; } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->aio_copyin(ujob, job, type); if (error) goto err2; if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { error = EINVAL; goto err2; } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { error = EINVAL; goto err2; } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { error = EINVAL; goto err2; } /* Get the opcode. */ if (type == LIO_NOP) { switch (job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET) { case LIO_WRITE: case LIO_WRITEV: case LIO_NOP: case LIO_READ: case LIO_READV: opcode = job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET; if ((job->uaiocb.aio_lio_opcode & LIO_FOFFSET) != 0) job->ioflags |= KAIOCB_IO_FOFFSET; break; default: error = EINVAL; goto err2; } } else opcode = job->uaiocb.aio_lio_opcode = type; ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. 
*/ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: case LIO_WRITEV: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: case LIO_READV: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: case LIO_DSYNC: error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: break; case LIO_NOP: error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) goto err3; if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) { error = EINVAL; goto err3; } if ((opcode == LIO_READ || opcode == LIO_READV || opcode == LIO_WRITE || opcode == LIO_WRITEV) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto err3; } if (fp != NULL && fp->f_ops == &path_fileops) { error = EBADF; goto err3; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto err3; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); MPASS(job->uiop == &job->uio || job->uiop == NULL); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto err3; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; memset(&kev, 0, sizeof(kev)); kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, M_WAITOK); if (error) goto err3; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode & LIO_VECTORED) { /* Use the uio copied in by aio_copyin */ MPASS(job->uiop != &job->uio && job->uiop != NULL); } else { /* Setup the inline uio */ job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf; job->iov[0].iov_len = job->uaiocb.aio_nbytes; job->uio.uio_iov = job->iov; job->uio.uio_iovcnt = 1; job->uio.uio_resid = job->uaiocb.aio_nbytes; job->uio.uio_segflg = UIO_USERSPACE; job->uiop = &job->uio; } switch (opcode & (LIO_READ | LIO_WRITE)) { case LIO_READ: job->uiop->uio_rw = UIO_READ; break; case LIO_WRITE: job->uiop->uio_rw = UIO_WRITE; break; } job->uiop->uio_offset = job->uaiocb.aio_offset; job->uiop->uio_td = td; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto err4; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. 
*/ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); err4: crfree(job->cred); err3: if (fp) fdrop(fp, td); knlist_delete(&job->klist, curthread, 0); err2: if (job->uiop != &job->uio) freeuio(job->uiop); uma_zfree(aiocb_zone, job); err1: ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } if (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { aio_schedule(job, aio_process_rw); error = 0; } else if (job->uaiocb.aio_lio_opcode & LIO_SYNC) { AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && ((job2->uaiocb.aio_lio_opcode & LIO_SYNC) == 0) && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; } else { error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); 
} static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIO); return (error); } /* * aio_cancel cancels any non-bio aio operations not currently in progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. 
*/ error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } int sys_aio_readv(struct thread *td, struct aio_readv_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_writev(struct thread *td, struct aio_writev_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) 
aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; lj->lioj_signal.sigev_notify = SIGEV_NONE; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ memset(&kev, 0, sizeof(kev)); kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, M_WAITOK); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi, lj->lioj_count != 1); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = 
copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_biocleanup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct kaioinfo *ki; struct buf *pbuf = (struct buf *)bp->bio_caller2; /* Release mapping into kernel space. */ if (pbuf != NULL) { MPASS(pbuf->b_npages <= atop(maxphys) + 1); pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages); vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages); uma_zfree(pbuf_zone, pbuf); atomic_subtract_int(&num_buf_aio, 1); ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else { MPASS(bp->bio_ma_n <= atop(maxphys) + 1); vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n); free(bp->bio_ma, M_TEMP); atomic_subtract_int(&num_unmapped_aio, 1); } g_destroy_bio(bp); } static void aio_biowakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; size_t nbytes; long bcount = bp->bio_bcount; long resid = bp->bio_resid; int opcode, nblks; int bio_error = bp->bio_error; uint16_t flags = bp->bio_flags; opcode = job->uaiocb.aio_lio_opcode; aio_biocleanup(bp); nbytes = bcount - resid; atomic_add_acq_long(&job->nbytes, nbytes); nblks = btodb(nbytes); /* * If multiple bios experienced an error, the job will reflect the * error of whichever failed bio completed last. 
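 *
 * The block accounting below is in DEV_BSIZE units: a completed
 * 4096-byte bio, for example, adds btodb(4096) == 8 to job->inblock or
 * job->outblock, which kern_aio_return() later folds into the caller's
 * ru_inblock/ru_oublock.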
*/ if (flags & BIO_ERROR) atomic_store_int(&job->error, bio_error); if (opcode & LIO_WRITE) atomic_add_int(&job->outblock, nblks); else atomic_add_int(&job->inblock, nblks); if (refcount_release(&job->nbio)) { bio_error = atomic_load_int(&job->error); if (bio_error != 0) aio_complete(job, -1, bio_error); else aio_complete(job, atomic_load_long(&job->nbytes), 0); } } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { int listop; switch (op) { case O_SYNC: listop = LIO_SYNC; break; case O_DSYNC: listop = LIO_DSYNC; break; default: return (EINVAL); } return (aio_aqueue(td, ujob, NULL, listop, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. 
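 *
 * For EVFILT_AIO that registration happens in aio_aqueue(), which builds
 * the kevent with EV_ADD | EV_ENABLE | EV_FLAG1 and carries the kaiocb
 * pointer in the kevent data, which is how kn_sdata gets its value here.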
*/ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
*/ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); CP(job32, *kcb, aio_lio_opcode); CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; struct iovec32 *iov32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); CP(job32, *kcb, aio_lio_opcode); if (type == LIO_NOP) type = kcb->aio_lio_opcode; if (type & LIO_VECTORED) { iov32 = PTRIN(job32.aio_iov); CP(job32, *kcb, aio_iovcnt); /* malloc a uio and copy in the iovec */ error = freebsd32_copyinuio(iov32, kcb->aio_iovcnt, &kjob->uiop); if (error) return (error); } else { PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); } CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent); return (error); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .aio_copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .aio_copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error 
= aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIO); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } int freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
*/ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 747cdf722cc9..404c51b7db77 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,7331 +1,7331 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS #endif #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp, bool isvnlru); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vdropl_recycle(struct vnode *vp); static void vdrop_recycle(struct vnode *vp); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode configuration and statistics"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode configuration"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode statistics"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode recycling"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
 */
static u_long __exclusive_cache_line numvnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target. Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle. Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them. This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files. The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive. The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes. In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states. The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher. These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
*/ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; static long freevnodes_old; static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, "Number of vnodes recycled to meet vnode cache targets (legacy)"); SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, "Number of vnodes recycled to meet vnode cache targets"); static u_long recycles_free_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, &recycles_free_count, 0, "Number of free vnodes recycled to meet vnode cache targets (legacy)"); SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, &recycles_free_count, 0, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t direct_recycles_free_count; SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD, &direct_recycles_free_count, "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets"); static counter_u64_t vnode_skipped_requeues; SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues, "Number of times LRU requeue was skipped due to lock contention"); static __read_mostly bool vnode_can_skip_requeue; SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW, &vnode_can_skip_requeue, 0, "Is LRU requeue skippable"); static u_long deferred_inact; SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 0, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. 
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static bool vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_freevnodes(SYSCTL_HANDLER_ARGS) { u_long rfreevnodes; rfreevnodes = vnlru_read_freevnodes(); return (sysctl_handle_long(oidp, &rfreevnodes, 0, req)); } SYSCTL_PROC(_vfs, OID_AUTO, freevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, "LU", "Number of \"free\" vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, "LU", "Number of \"free\" vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); static int vnlru_nowhere; SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. 
Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } vgone(vp); putvnode: vput(vp); NDFREE_PNBUF(&nd); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ #define vnsz2log 8 #ifndef DEBUG_LOCKS _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && sizeof(struct vnode) < 1UL << (vnsz2log + 1), "vnsz2log needs to be updated"); #endif /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } #ifdef KASAN static int vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) { kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); return (0); } static void vnode_dtor(void *mem, int size, void *arg __unused) { size_t end1, end2, off1, off2; _Static_assert(offsetof(struct vnode, v_vnodelist) < offsetof(struct vnode, v_dbatchcpu), "KASAN marks require updating"); off1 = offsetof(struct vnode, v_vnodelist); off2 = offsetof(struct vnode, v_dbatchcpu); end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); /* * Access to the v_vnodelist and v_dbatchcpu fields are permitted even * after the vnode has been freed. Try to get some KASAN coverage by * marking everything except those two fields as invalid. Because * KASAN's tracking is not byte-granular, any preceding fields sharing * the same 8-byte aligned word must also be marked valid. */ /* Handle the area from the start until v_vnodelist... */ off1 = rounddown2(off1, KASAN_SHADOW_SCALE); kasan_mark(mem, off1, off1, KASAN_UMA_FREED); /* ... 
then the area between v_vnodelist and v_dbatchcpu ... */ off1 = roundup2(end1, KASAN_SHADOW_SCALE); off2 = rounddown2(off2, KASAN_SHADOW_SCALE); if (off2 > off1) kasan_mark((void *)((char *)mem + off1), off2 - off1, off2 - off1, KASAN_UMA_FREED); /* ... and finally the area from v_dbatchcpu to the end. */ off2 = roundup2(end2, KASAN_SHADOW_SCALE); kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, KASAN_UMA_FREED); } #endif /* KASAN */ /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; vp->v_state = VSTATE_DEAD; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); kasan_mark(mem, size, size, 0); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; uma_ctor ctor; uma_dtor dtor; int cpu, physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. 
*/ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); #ifdef KASAN ctor = vnode_ctor; dtor = vnode_dtor; #else ctor = NULL; dtor = NULL; #endif vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); direct_recycles_free_count = counter_u64_alloc(M_WAITOK); vnode_skipped_requeues = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * 1. VOP_LOOKUP() obtains B while A is held * 2. vfs_busy() obtains a shared lock on F while A and B are held * 3. vput() releases lock on B * 4. vput() releases lock on A * 5. VFS_ROOT() obtains lock on D while shared lock on F is held * 6. vfs_unbusy() releases shared lock on F * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. 
Note that for stacked * filesystems, D and B in the example above may be the same lock, * which introdues potential lock order reversal deadlock between * dounmount() and step 5 above. These filesystems may avoid the LOR * by setting VV_CROSSLOCK on the covered vnode so that lock B will * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller the mount it tried to busy is no longer valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("%s: non-empty upper mount list with pending unmount", __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. 
* * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. 
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes. It attempts
 * to free vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this). Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use. It is not
 * desirable to reuse such vnodes. These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *        entries if this argument is true
 * @param trigger Only reclaim vnodes with fewer than this many resident
 *        pages.
 * @param target How many vnodes to reclaim.
 * @return The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;
		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
* Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. */ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop_recycle(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->handle == vp && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } recycles_count++; vgonel(vp); VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); done++; next_iter_unlocked: maybe_yield(); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_free_per_call = 10000; SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0, "limit on vnode free requests per call to the vnlru_free routine (legacy)"); SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW, &max_free_per_call, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru) { struct vnode *vp; struct mount *mp; int ocount; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_free_per_call) count = max_free_per_call; if (count == 0) { mtx_unlock(&vnode_list_mtx); return (0); } ocount = count; retried = false; vp = mvp; for (;;) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { /* * The free vnode marker can be past eligible vnodes: * 1. if vdbatch_process trylock failed * 2. if vtryrecycle failed * * If so, start the scan from scratch. 
*/ if (!retried && vnlru_read_freevnodes() > 0) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); vp = mvp; retried = true; continue; } /* * Give up */ TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); /* * FIXME: ignores the return value, meaning it may be nothing * got recycled but it claims otherwise to the caller. * * Originally the value started being ignored in 2005 with * 114a1006a8204aa156e1f9ad6476cdff89cada7f . * * Respecting the value can run into significant stalls if most * vnodes belong to one file system and it has writes * suspended. In presence of many threads and millions of * vnodes they keep contending on the vnode_list_mtx lock only * to find vnodes they can't recycle. * * The solution would be to pre-check if the vnode is likely to * be recycle-able, but it needs to happen with the * vnode_list_mtx lock held. This runs into a problem where * VOP_GETWRITEMOUNT (currently needed to find out about if * writes are frozen) can take locks which LOR against it. * * Check nullfs for one example (null_getwritemount). */ vtryrecycle(vp, isvnlru); count--; if (count == 0) { break; } mtx_lock(&vnode_list_mtx); vp = mvp; } mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ocount - count); } /* * XXX: returns without vnode_list_mtx locked! */ static int vnlru_free_locked_direct(int count) { int ret; mtx_assert(&vnode_list_mtx, MA_OWNED); ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ret); } static int vnlru_free_locked_vnlru(int count) { int ret; mtx_assert(&vnode_list_mtx, MA_OWNED); ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ret); } static int vnlru_free_vnlru(int count) { mtx_lock(&vnode_list_mtx); return (vnlru_free_locked_vnlru(count)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp, true); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. 
* Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static u_long vnlruproc_kicks; SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0, "Number of times vnlru got woken up due to vnode shortage"); #define VNLRU_COUNT_SLOP 100 /* * The main freevnodes counter is only updated when a counter local to CPU * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally * walked to compute a more accurate total. * * Note: the actual value at any given moment can still exceed slop, but it * should not be by significant margin in practice. */ #define VNLRU_FREEVNODES_SLOP 126 static void __noinline vfs_freevnodes_rollup(int8_t *lfreevnodes) { atomic_add_long(&freevnodes, *lfreevnodes); *lfreevnodes = 0; critical_exit(); } static __inline void vfs_freevnodes_inc(void) { int8_t *lfreevnodes; critical_enter(); lfreevnodes = PCPU_PTR(vfs_freevnodes); (*lfreevnodes)++; if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) vfs_freevnodes_rollup(lfreevnodes); else critical_exit(); } static __inline void vfs_freevnodes_dec(void) { int8_t *lfreevnodes; critical_enter(); lfreevnodes = PCPU_PTR(vfs_freevnodes); (*lfreevnodes)--; if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) vfs_freevnodes_rollup(lfreevnodes); else critical_exit(); } static u_long vnlru_read_freevnodes(void) { long slop, rfreevnodes, rfreevnodes_old; int cpu; rfreevnodes = atomic_load_long(&freevnodes); rfreevnodes_old = atomic_load_long(&freevnodes_old); if (rfreevnodes > rfreevnodes_old) slop = rfreevnodes - rfreevnodes_old; else slop = rfreevnodes_old - rfreevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (rfreevnodes >= 0 ? rfreevnodes : 0); CPU_FOREACH(cpu) { rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; } atomic_store_long(&freevnodes_old, rfreevnodes); return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick_locked(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; vnlruproc_kicks++; wakeup(vnlruproc); } } static void vnlru_kick_cond(void) { if (vnlru_read_freevnodes() > wantfreevnodes) return; if (vnlruproc_sig) return; mtx_lock(&vnode_list_mtx); vnlru_kick_locked(); mtx_unlock(&vnode_list_mtx); } static void vnlru_proc_sleep(void) { if (vnlruproc_sig) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); } msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); } /* * A lighter version of the machinery below. * * Tries to reach goals only by recycling free vnodes and does not invoke * uma_reclaim(UMA_RECLAIM_DRAIN). * * This works around pathological behavior in vnlru in presence of tons of free * vnodes, but without having to rewrite the machinery at this time. Said * behavior boils down to continuously trying to reclaim all kinds of vnodes * (cycling through all levels of "force") when the count is transiently above * limit. This happens a lot when all vnodes are used up and vn_alloc * speculatively increments the counter. 
* * Sample testcase: vnode limit 8388608, 20 separate directory trees each with * 1 million files in total and 20 find(1) processes stating them in parallel * (one per each tree). * * On a kernel with only stock machinery this needs anywhere between 60 and 120 * seconds to execute (time varies *wildly* between runs). With the workaround * it consistently stays around 20 seconds [it got further down with later * changes]. * * That is to say the entire thing needs a fundamental redesign (most notably * to accommodate faster recycling), the above only tries to get it ouf the way. * * Return values are: * -1 -- fallback to regular vnlru loop * 0 -- do nothing, go to sleep * >0 -- recycle this many vnodes */ static long vnlru_proc_light_pick(void) { u_long rnumvnodes, rfreevnodes; if (vstir || vnlruproc_sig == 1) return (-1); rnumvnodes = atomic_load_long(&numvnodes); rfreevnodes = vnlru_read_freevnodes(); /* * vnode limit might have changed and now we may be at a significant * excess. Bail if we can't sort it out with free vnodes. * * Due to atomic updates the count can legitimately go above * the limit for a short period, don't bother doing anything in * that case. */ if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) { if (rnumvnodes - rfreevnodes >= desiredvnodes || rfreevnodes <= wantfreevnodes) { return (-1); } return (rnumvnodes - desiredvnodes); } /* * Don't try to reach wantfreevnodes target if there are too few vnodes * to begin with. */ if (rnumvnodes < wantfreevnodes) { return (0); } if (rfreevnodes < wantfreevnodes) { return (-1); } return (0); } static bool vnlru_proc_light(void) { long freecount; mtx_assert(&vnode_list_mtx, MA_NOTOWNED); freecount = vnlru_proc_light_pick(); if (freecount == -1) return (false); if (freecount != 0) { vnlru_free_vnlru(freecount); } mtx_lock(&vnode_list_mtx); vnlru_proc_sleep(); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (true); } static u_long uma_reclaim_calls; SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS, &uma_reclaim_calls, 0, "Number of calls to uma_reclaim"); static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); if (force == 0 && vnlru_proc_light()) continue; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes + 10) { vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = false; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlru_proc_sleep(); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. 
* The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); /* * Total number of vnodes can transiently go slightly above the * limit (see vn_alloc_hard), no need to call uma_reclaim if * this happens. */ if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && numvnodes <= desiredvnodes) { uma_reclaim_calls++; uma_reclaim(UMA_RECLAIM_DRAIN); } if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp, bool isvnlru) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop_recycle(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop_recycle(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { if (isvnlru) recycles_free_count++; else counter_u64_add(direct_recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. 
* * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static u_long vn_alloc_sleeps; SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, "Number of times vnode allocation blocked waiting on vnlru"); static struct vnode * __noinline vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) { u_long rfreevnodes; if (bumped) { if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { atomic_subtract_long(&numvnodes, 1); bumped = false; } } mtx_lock(&vnode_list_mtx); rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = true; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked_direct(1) > 0) goto alloc; mtx_assert(&vnode_list_mtx, MA_NOTOWNED); if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ if (bumped) { atomic_subtract_long(&numvnodes, 1); bumped = false; } mtx_lock(&vnode_list_mtx); vnlru_kick_locked(); vn_alloc_sleeps++; msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked_direct(1); else mtx_unlock(&vnode_list_mtx); } alloc: mtx_assert(&vnode_list_mtx, MA_NOTOWNED); if (!bumped) atomic_add_long(&numvnodes, 1); vnlru_kick_cond(); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp, 0, false)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { return (vn_alloc_hard(mp, rnumvnodes, true)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); cache_validate_vop_vector(mp, vops); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); vn_set_state(vp, VSTATE_UNINITIALIZED); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. 
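 * As a hypothetical illustration: a vnode last used under a "ufs"
 * mount that is now handed out to a "zfs" mount gets its lock renamed
 * from "ufs" to "zfs" and re-registered with witness under the new
 * name (the tags themselves are whatever the filesystems pass in).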
* * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); cache_assert_no_entries(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { /* * Use LK_NOWAIT to shut up witness about the lock. We may get * here while having another vnode locked when trying to * satisfy a lookup and needing to recycle. 
*/ VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); /* * The caller expects the interlock to be still held. */ ASSERT_VI_LOCKED(vp, __func__); } static int insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); } else { KASSERT(!dtr, ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", __func__)); } /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } /* * Insert into list of vnodes for the new mount point, if available. * insmntque() reclaims the vnode on insertion failure, insmntque1() * leaves handling of the vnode to the caller. */ int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, true)); } int insmntque1(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, false)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); return (EBUSY); } } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. 
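 * In particular, flushbuflist() returns EAGAIN when it had to drop
 * and reacquire the bufobj lock and the buffer list may have changed
 * underneath it; the loop below simply restarts until a full pass
 * completes without error.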
*/ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? 
*/ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { /* * Write out vnode metadata, e.g. indirect blocks. */ restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno >= 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). 
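 * A sketch of the expected usage (hypothetical caller, not a
 * reference to any particular filesystem): code about to rewrite
 * blocks [a, b) of a locked vnode out of band would issue
 *
 *	v_inval_buf_range(vp, a, b, bo->bo_bsize);
 *
 * so that stale buffers and cached pages do not shadow the new
 * contents.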
*/ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. 
*/ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, ("suspended mp syncing vp %p", vp)); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. 
*/ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. 
*/ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. 
*/ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { int old __diagused; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = refcount_acquire(&vp->v_usecount); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. 
*/ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNPASS(vp->v_holdcnt > 1, vp); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); atomic_add_long(&deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. 
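 * Leaving error set to EAGAIN below means the vdefer_inactive()
 * path is taken at the end of this function: unless the vnode got
 * doomed or re-referenced in the meantime it is tagged VI_DEFINACT
 * and the inactive processing is left for the syncer.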
*/ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. 
This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); /* * Attempt to requeue the passed batch, but give up easily. * * Despite batching the mechanism is prone to transient *significant* * lock contention, where vnode_list_mtx becomes the primary bottleneck * if multiple CPUs get here (one real-world example is highly parallel * do-nothing make , which will stat *tons* of vnodes). Since it is * quasi-LRU (read: not that great even if fully honoured) provide an * option to just dodge the problem. Parties which don't like it are * welcome to implement something better. */ if (vnode_can_skip_requeue) { if (!mtx_trylock(&vnode_list_mtx)) { counter_u64_add(vnode_skipped_requeues, 1); critical_enter(); for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; vd->tab[i] = NULL; MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } vd->index = 0; critical_exit(); return; } /* fallthrough to locked processing */ } else { mtx_lock(&vnode_list_mtx); } mtx_assert(&vnode_list_mtx, MA_OWNED); critical_enter(); for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; vd->tab[i] = NULL; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. 
If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } static void __always_inline vdropl_impl(struct vnode *vp, bool enqueue) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } if (!enqueue) { VI_UNLOCK(vp); return; } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } void vdropl(struct vnode *vp) { vdropl_impl(vp, true); } /* * vdrop a vnode when recycling * * This is a special case routine only to be used when recycling, differs from * regular vdrop by not requeieing the vnode on LRU. * * Consider a case where vtryrecycle continuously fails with all vnodes (due to * e.g., frozen writes on the filesystem), filling the batch and causing it to * be requeued. Then vnlru will end up revisiting the same vnodes. This is a * loop which can last for as long as writes are frozen. */ static void vdropl_recycle(struct vnode *vp) { vdropl_impl(vp, false); } static void vdrop_recycle(struct vnode *vp) { VI_LOCK(vp); vdropl_recycle(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. 
*/ if ((vp->v_vflag & VV_NOSYNC) == 0) vnode_pager_clean_async(vp); error = VOP_INACTIVE(vp); VI_LOCK(vp); VNPASS(vp->v_iflag & VI_DOINGINACT, vp); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { vnode_pager_clean_async(vp); do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. 
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) { struct mount *mp; struct mount_upper_node *ump; mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_notify)) return; MNT_ILOCK(mp); mp->mnt_upper_pending++; KASSERT(mp->mnt_upper_pending > 0, ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump->mp, vp); break; } MNT_ILOCK(mp); } mp->mnt_upper_pending--; if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && mp->mnt_upper_pending == 0) { mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) { VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ vn_get_state(vp) == VSTATE_DEAD, vp); return; } /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); vn_set_state(vp, VSTATE_DESTROYING); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. 
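 * The order below is: VOP_CLOSE() if the vnode was in use, inactive
 * processing unless it is already in progress, buffer and VM object
 * teardown, and finally VOP_RECLAIM().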
*/ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ if (vp->v_lockf != NULL) { (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; } /* * Delete from old mount point vnode list. */ if (vp->v_mount == NULL) { VI_LOCK(vp); } else { delmntque(vp); ASSERT_VI_LOCKED(vp, "vgonel 2"); } /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; vn_set_state(vp, VSTATE_DEAD); } /* * Print out a description of a vnode. */ static const char *const vtypename[] = { [VNON] = "VNON", [VREG] = "VREG", [VDIR] = "VDIR", [VBLK] = "VBLK", [VCHR] = "VCHR", [VLNK] = "VLNK", [VSOCK] = "VSOCK", [VFIFO] = "VFIFO", [VBAD] = "VBAD", [VMARKER] = "VMARKER", }; _Static_assert(nitems(vtypename) == VLASTTYPE + 1, "vnode type name not added to vtypename"); static const char *const vstatename[] = { [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", [VSTATE_DESTROYING] = "VSTATE_DESTROYING", [VSTATE_DEAD] = "VSTATE_DEAD", }; _Static_assert(nitems(vstatename) == VLASTSTATE + 1, "vnode state name not added to vstatename"); _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) 
{ va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s state %s op %p\n", vtypename[vp->v_type], vstatename[vp->v_state], vp->v_op); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); if (irflag & VIRF_TEXT_REF) strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if 
(vp->v_object != NULL)
		printf(" v_object %p ref %d pages %d "
		    "cleanbuf %d dirtybuf %d\n",
		    vp->v_object, vp->v_object->ref_count,
		    vp->v_object->resident_page_count,
		    vp->v_bufobj.bo_clean.bv_cnt,
		    vp->v_bufobj.bo_dirty.bv_cnt);
	printf(" ");
	lockmgr_printinfo(vp->v_vnlock);
	if (vp->v_data != NULL)
		VOP_PRINT(vp);
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
{
	struct mount *mp;
	struct vnode *vp;

	/*
	 * Note: because this is DDB, we can't obey the locking semantics
	 * for these structures, which means we could catch an inconsistent
	 * state and dereference a nasty pointer. Not much to be done
	 * about that.
	 */
	db_printf("Locked vnodes\n");
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
				vn_printf(vp, "vnode ");
		}
	}
}

/*
 * Show details about the given vnode.
 */
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
	struct vnode *vp;

	if (!have_addr)
		return;
	vp = (struct vnode *)addr;
	vn_printf(vp, "vnode ");
}

/*
 * Show details about the given mount point.
 */
DB_SHOW_COMMAND(mount, db_show_mount)
{
	struct mount *mp;
	struct vfsopt *opt;
	struct statfs *sp;
	struct vnode *vp;
	char buf[512];
	uint64_t mflags;
	u_int flags;

	if (!have_addr) {
		/* No address given, print short info about all mount points. */
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			db_printf("%p %s on %s (%s)\n", mp,
			    mp->mnt_stat.f_mntfromname,
			    mp->mnt_stat.f_mntonname,
			    mp->mnt_stat.f_fstypename);
			if (db_pager_quit)
				break;
		}
		db_printf("\nMore info: show mount <addr>\n");
		return;
	}

	mp = (struct mount *)addr;
	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);

	buf[0] = '\0';
	mflags = mp->mnt_flag;
#define MNT_FLAG(flag) do { \
	if (mflags & (flag)) { \
		if (buf[0] != '\0') \
			strlcat(buf, ", ", sizeof(buf)); \
		strlcat(buf, (#flag) + 4, sizeof(buf)); \
		mflags &= ~(flag); \
	} \
} while (0)
	MNT_FLAG(MNT_RDONLY);
	MNT_FLAG(MNT_SYNCHRONOUS);
	MNT_FLAG(MNT_NOEXEC);
	MNT_FLAG(MNT_NOSUID);
	MNT_FLAG(MNT_NFS4ACLS);
	MNT_FLAG(MNT_UNION);
	MNT_FLAG(MNT_ASYNC);
	MNT_FLAG(MNT_SUIDDIR);
	MNT_FLAG(MNT_SOFTDEP);
	MNT_FLAG(MNT_NOSYMFOLLOW);
	MNT_FLAG(MNT_GJOURNAL);
	MNT_FLAG(MNT_MULTILABEL);
	MNT_FLAG(MNT_ACLS);
	MNT_FLAG(MNT_NOATIME);
	MNT_FLAG(MNT_NOCLUSTERR);
	MNT_FLAG(MNT_NOCLUSTERW);
	MNT_FLAG(MNT_SUJ);
	MNT_FLAG(MNT_EXRDONLY);
	MNT_FLAG(MNT_EXPORTED);
	MNT_FLAG(MNT_DEFEXPORTED);
	MNT_FLAG(MNT_EXPORTANON);
	MNT_FLAG(MNT_EXKERB);
	MNT_FLAG(MNT_EXPUBLIC);
	MNT_FLAG(MNT_LOCAL);
	MNT_FLAG(MNT_QUOTA);
	MNT_FLAG(MNT_ROOTFS);
	MNT_FLAG(MNT_USER);
	MNT_FLAG(MNT_IGNORE);
	MNT_FLAG(MNT_UPDATE);
	MNT_FLAG(MNT_DELEXPORT);
	MNT_FLAG(MNT_RELOAD);
	MNT_FLAG(MNT_FORCE);
	MNT_FLAG(MNT_SNAPSHOT);
	MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
	if (mflags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%016jx", mflags);
	}
	db_printf(" mnt_flag = %s\n", buf);

	buf[0] = '\0';
	flags = mp->mnt_kern_flag;
#define MNT_KERN_FLAG(flag) do { \
	if (flags & (flag)) { \
		if (buf[0] != '\0') \
			strlcat(buf, ", ", sizeof(buf)); \
		strlcat(buf, (#flag) + 5, sizeof(buf)); \
		flags &= ~(flag); \
	} \
} while (0)
	MNT_KERN_FLAG(MNTK_UNMOUNTF);
	MNT_KERN_FLAG(MNTK_ASYNC);
	MNT_KERN_FLAG(MNTK_SOFTDEP);
	MNT_KERN_FLAG(MNTK_NOMSYNC);
	MNT_KERN_FLAG(MNTK_DRAINING);
	MNT_KERN_FLAG(MNTK_REFEXPIRE);
	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
	MNT_KERN_FLAG(MNTK_NO_IOPF);
	MNT_KERN_FLAG(MNTK_RECURSE);
MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_NULL_NOCACHE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. 
*/ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. 
 */
	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	obj = vp->v_object;
	return (obj != NULL && vm_object_mightbedirty(obj));
}

static int
vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
{
	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	if (vp->v_iflag & VI_DEFINACT)
		return (true);
	return (vfs_want_msync(vp));
}

static void __noinline
vfs_periodic_msync_inactive(struct mount *mp, int flags)
{
	struct vnode *vp, *mvp;
	int lkflags;
	bool seen_defer;

	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
	if (flags != MNT_WAIT)
		lkflags |= LK_NOWAIT;

	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
		seen_defer = false;
		if (vp->v_iflag & VI_DEFINACT) {
			vp->v_iflag &= ~VI_DEFINACT;
			seen_defer = true;
		}
		if (!vfs_want_msync(vp)) {
			if (seen_defer)
				vfs_deferred_inactive(vp, lkflags);
			else
				VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, lkflags) == 0) {
			if ((vp->v_vflag & VV_NOSYNC) == 0) {
				if (flags == MNT_WAIT)
					vnode_pager_clean_sync(vp);
				else
					vnode_pager_clean_async(vp);
			}
			vput(vp);
			if (seen_defer)
				vdrop(vp);
		} else {
			if (seen_defer)
				vdefer_inactive_unlocked(vp);
		}
	}
}

void
vfs_periodic(struct mount *mp, int flags)
{
	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
		vfs_periodic_inactive(mp, flags);
	else
		vfs_periodic_msync_inactive(mp, flags);
}

static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{
	knlist_destroy(&vi->vpi_selinfo.si_note);
	mtx_destroy(&vi->vpi_lock);
	free(vi, M_VNODEPOLL);
}

static void
destroy_vpollinfo(struct vpollinfo *vi)
{
	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
}

/*
 * Initialize per-vnode helper structure to hold poll-related state.
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (vp->v_pollinfo != NULL)
		return;
	vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_lock);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode. Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions. (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;
		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass = VOP_EOPNOTSUPP,
	.vop_close = sync_close,
	.vop_fsync = sync_fsync,
	.vop_getwritemount = vop_stdgetwritemount,
	.vop_inactive = sync_inactive,
	.vop_need_inactive = vop_stdneed_inactive,
	.vop_reclaim = sync_reclaim,
	.vop_lock1 = vop_stdlock,
	.vop_unlock = vop_stdunlock,
	.vop_islocked = vop_stdislocked,
	.vop_fplookup_vexec = VOP_EAGAIN,
	.vop_fplookup_symlink = VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque1(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	vn_set_state(vp, VSTATE_CONSTRUCTED);
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	VOP_UNLOCK(syncvp);
	save = curthread_pflags_set(TDP_SYNCIO);
	/*
	 * The filesystem at hand may be idle with free vnodes stored in the
	 * batch. Return them instead of letting them stay there indefinitely.
	 */
	vfs_periodic(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{
	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. 
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; #ifdef WITNESS if ((vp->v_irflag & VIRF_CROSSMP) == 0 && witness_is_owned(&vp->v_vnlock->lock_object) == -1) #else int locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) #endif vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; #ifdef WITNESS if ((vp->v_irflag & VIRF_CROSSMP) == 0 && witness_is_owned(&vp->v_vnlock->lock_object) == 1) #else if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) #endif vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. 
*/ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap, int rc) { struct vop_fplookup_vexec_args *a; struct vnode *vp; a = ap; vp = a->a_vp; VFS_SMR_ASSERT_ENTERED(); if (rc == EOPNOTSUPP) VNPASS(VN_IS_DOOMED(vp), vp); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } static void vop_fsync_debugprepost(struct vnode *vp, const char *name) { if (vp->v_type == VCHR) ; /* * The shared vs. exclusive locking policy for fsync() * is actually determined by vp's write mount as indicated * by VOP_GETWRITEMOUNT(), which for stacked filesystems * may not be the same as vp->v_mount. However, if the * underlying filesystem which really handles the fsync() * supports shared locking, the stacked filesystem must also * be prepared for its VOP_FSYNC() operation to be called * with only a shared lock. On the other hand, if the * stacked filesystem claims support for shared write * locking but the underlying filesystem does not, and the * caller incorrectly uses a shared lock, this condition * should still be caught when the stacked filesystem * invokes VOP_FSYNC() on the underlying filesystem. */ else if (MNT_SHARED_WRITES(vp->v_mount)) ASSERT_VOP_LOCKED(vp, name); else ASSERT_VOP_ELOCKED(vp, name); } void vop_fsync_debugpre(void *a) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fsync_debugpost(void *a, int rc __unused) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpre(void *a) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpost(void *a, int rc __unused) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; struct vnode *vp = a->a_vp; VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; 
vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); -struct filterops fs_filtops = { +const struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); -static struct filterops vfsread_filtops = { +static const struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; -static struct filterops vfswrite_filtops = { +static const struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; -static struct filterops vfsvnode_filtops = { +static const struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && kn->kn_filter != EVFILT_WRITE), ("READ/WRITE filter on a FIFO leaked through")); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; off_t size; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) return (0); VI_LOCK(vp); kn->kn_data = size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
*/ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. 
*/ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; maybe_yield(); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. 
*/ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. 
*/ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } int vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) { struct vattr vattr; int error; ASSERT_VOP_LOCKED(vp, __func__); error = VOP_GETATTR(vp, &vattr, cred); if (__predict_true(error == 0)) { if (vattr.va_size <= OFF_MAX) *size = vattr.va_size; else error = EFBIG; } return (error); } int vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) { int error; VOP_LOCK(vp, LK_SHARED); error = vn_getsize_locked(vp, size, cred); VOP_UNLOCK(vp); return (error); } #ifdef INVARIANTS void vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) { switch (vp->v_state) { case VSTATE_UNINITIALIZED: switch (state) { case VSTATE_CONSTRUCTED: case VSTATE_DESTROYING: return; default: break; } break; case VSTATE_CONSTRUCTED: ASSERT_VOP_ELOCKED(vp, __func__); switch (state) { case VSTATE_DESTROYING: return; default: break; } break; case VSTATE_DESTROYING: ASSERT_VOP_ELOCKED(vp, __func__); switch (state) { case VSTATE_DEAD: return; default: break; } break; case VSTATE_DEAD: switch (state) { case VSTATE_UNINITIALIZED: return; default: break; } break; } vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); panic("invalid state transition %d -> %d\n", vp->v_state, state); } #endif diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 53a2ddf94862..c28d6e66853f 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4272 +1,4272 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; static fo_fspacectl_t vn_fspacectl; -struct fileops vnops = { +const struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, .fo_fspacectl = vn_fspacectl, .fo_cmp = vn_cmp, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; const u_int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static int vn_io_pgcache_read_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN, &vn_io_pgcache_read_enable, 0, "Enable copying from page cache for reads, avoiding fs"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); static int 
vfs_allow_read_dir = 0; SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW, &vfs_allow_read_dir, 0, "Enable read(2) of directory by root for filesystems that support it"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = curthread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } static uint64_t open2nameif(int fmode, u_int vn_open_flags) { uint64_t res; res = ISOPEN | LOCKLEAF; if ((fmode & O_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((fmode & O_EMPTY_PATH) != 0) res |= EMPTYPATH; if ((fmode & FREAD) != 0) res |= OPENREAD; if ((fmode & FWRITE) != 0) res |= OPENWRITE; if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0) res |= AUDITVNODE1; if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0) res |= NOCAPCHECK; if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0) res |= WANTIOCTLCAPS; return (res); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct vattr vat; struct vattr *vap = &vat; int fmode, error; bool first_open; restart: first_open = false; fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY) || (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. * * Set NC_KEEPPOSENTRY to keep positive entries if they already * exist despite NOCACHE. 
*/ ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if ((vn_open_flags & VN_OPEN_INVFS) == 0) bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(ndp); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0) return (error); NDREINIT(ndp); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vp = ndp->ni_vp; if (error == 0 && (fmode & O_EXCL) != 0 && (fmode & (O_EXLOCK | O_SHLOCK)) != 0) { VI_LOCK(vp); vp->v_iflag |= VI_FOPENING; VI_UNLOCK(vp); first_open = true; } VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL, false); vn_finished_write(mp); if (error) { NDFREE_PNBUF(ndp); if (error == ERELOOKUP) { NDREINIT(ndp); goto restart; } return (error); } fmode &= ~O_TRUNC; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } if (vp->v_type == VDIR) { error = EISDIR; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; if ((fmode & FWRITE) == 0) ndp->ni_cnd.cn_flags |= LOCKSHARED; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, curthread, fp); if (first_open) { VI_LOCK(vp); vp->v_iflag &= ~VI_FOPENING; wakeup(vp); VI_UNLOCK(vp); } if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE_PNBUF(ndp); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } static int vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) { struct flock lf; int error, lock_flags, type; ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) return (0); KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) type |= F_FIRSTOPEN; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. 
*/ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; int error; if (vp->v_type == VLNK) { if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) return (EMLINK); } if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if ((fmode & O_PATH) == 0) { if (vp->v_type == VSOCK) return (EOPNOTSUPP); if ((fmode & (FWRITE | O_TRUNC)) != 0) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if ((fmode & FREAD) != 0) accmode |= VREAD; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if ((fmode & O_CREAT) != 0) accmode |= VCREAT; #endif } if ((fmode & FEXEC) != 0) accmode |= VEXEC; #ifdef MAC if ((fmode & O_VERIFY) != 0) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error != 0) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0 && accmode != 0) { error = VOP_ACCESS(vp, accmode, cred, td); if (error != 0) return (error); } if ((fmode & O_PATH) != 0) { if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; return (0); } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); if (error != 0) return (error); error = vn_open_vnode_advlock(vp, fmode, fp); if (error == 0 && (fmode & FWRITE) != 0) { error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } } /* * Error from advlock or VOP_ADD_WRITECOUNT() still requires * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). */ if (error != 0) { if (fp != NULL) { /* * Arrange the call by having fdrop() to use * vn_closefile(). This is to satisfy * filesystems like devfs or tmpfs, which * override fo_close(). */ fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } else { /* * If there is no fp, due to kernel-mode open, * we can call VOP_CLOSE() now. */ if ((vp->v_type == VFIFO || !MNT_EXTENDED_SHARED(vp->v_mount)) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC), cred, td); } } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. * It is racy. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; lock_flags = vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount) ? 
LK_SHARED : LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { enum uio_rw rw; ASSERT_VOP_LOCKED(fp->f_vnode, __func__); rw = uio->uio_rw; if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount[rw] << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) || uio->uio_offset == fp->f_nextoff[rw]) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ if (uio->uio_resid >= IO_SEQMAX * 16384) fp->f_seqcount[rw] = IO_SEQMAX; else { fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384); if (fp->f_seqcount[rw] > IO_SEQMAX) fp->f_seqcount[rw] = IO_SEQMAX; } return (fp->f_seqcount[rw] << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount[rw] > 1) fp->f_seqcount[rw] = 1; else fp->f_seqcount[rw] = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. 
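 * Illustrative sketch (not part of the original source): a kernel
 * caller reading the first 512 bytes of an unlocked vnode could use
 *
 *	char buf[512];
 *	ssize_t resid;
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
 *	    0, td->td_ucred, NOCRED, &resid, td);
 *
 * where the zero ioflg lets vn_rdwr() take the range lock and the
 * vnode lock itself.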
*/ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; if (offset < 0 && vp->v_type != VCHR) return (EINVAL); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else if ((ioflg & IO_APPEND) != 0) { rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) goto out; lock_flags = vn_lktype_write(mp, vp); } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. 
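 * For example (illustrative numbers, assuming MAXBSIZE is 65536):
 * with a starting offset of 70000 the first chunk is
 * 65536 - (70000 % 65536) = 61072 bytes, which brings the next offset
 * to 131072, a MAXBSIZE boundary; subsequent chunks are then full
 * MAXBSIZE blocks until the final, possibly short, one.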
*/ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } #if OFF_MAX <= LONG_MAX off_t foffset_lock(struct file *fp, int flags) { volatile short *flagsp; off_t res; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOLOCK) != 0) return (atomic_load_long(&fp->f_offset)); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ flagsp = &fp->f_vnread_flags; if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) return (atomic_load_long(&fp->f_offset)); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); for (;;) { if ((state & FOFFSET_LOCKED) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, FOFFSET_LOCKED)) continue; break; } if ((state & FOFFSET_LOCK_WAITING) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, state | FOFFSET_LOCK_WAITING)) continue; } DROP_GIANT(); sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); sleepq_wait(&fp->f_vnread_flags, PUSER -1); PICKUP_GIANT(); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); } res = atomic_load_long(&fp->f_offset); sleepq_release(&fp->f_vnread_flags); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { volatile short *flagsp; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOUPDATE) == 0) atomic_store_long(&fp->f_offset, val); if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) != 0) return; flagsp = &fp->f_vnread_flags; state = atomic_load_16(flagsp); if ((state & FOFFSET_LOCK_WAITING) == 0 && atomic_cmpset_rel_16(flagsp, state, 0)) return; sleepq_lock(&fp->f_vnread_flags); MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); fp->f_vnread_flags = 0; sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); sleepq_release(&fp->f_vnread_flags); } static off_t foffset_read(struct file *fp) { return (atomic_load_long(&fp->f_offset)); } #else off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } 
mtx_unlock(mtxp); } static off_t foffset_read(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } #endif void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } static int get_write_ioflag(struct file *fp) { int ioflag; struct mount *mp; struct vnode *vp; ioflag = 0; vp = fp->f_vnode; mp = atomic_load_ptr(&vp->v_mount); if ((fp->f_flag & O_DIRECT) != 0) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) != 0 || (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0)) ioflag |= IO_SYNC; /* * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE() * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC * fall back to full O_SYNC behavior. */ if ((fp->f_flag & O_DSYNC) != 0) ioflag |= IO_SYNC | IO_DATASYNC; return (ioflag); } int vn_read_from_obj(struct vnode *vp, struct uio *uio) { vm_object_t obj; vm_page_t ma[io_hold_cnt + 2]; off_t off, vsz; ssize_t resid; int error, i, j; MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2)); obj = atomic_load_ptr(&vp->v_object); if (obj == NULL) return (EJUSTRETURN); /* * Depends on type stability of vm_objects. */ vm_object_pip_add(obj, 1); if ((obj->flags & OBJ_DEAD) != 0) { /* * Note that object might be already reused from the * vnode, and the OBJ_DEAD flag cleared. This is fine, * we recheck for DOOMED vnode state after all pages * are busied, and retract then. * * But we check for OBJ_DEAD to ensure that we do not * busy pages while vm_object_terminate_pages() * processes the queue. */ error = EJUSTRETURN; goto out_pip; } resid = uio->uio_resid; off = uio->uio_offset; for (i = 0; resid > 0; i++) { MPASS(i < io_hold_cnt + 2); ma[i] = vm_page_grab_unlocked(obj, atop(off), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOWAIT); if (ma[i] == NULL) break; /* * Skip invalid pages. Valid mask can be partial only * at EOF, and we clip later. */ if (vm_page_none_valid(ma[i])) { vm_page_sunbusy(ma[i]); break; } resid -= PAGE_SIZE; off += PAGE_SIZE; } if (i == 0) { error = EJUSTRETURN; goto out_pip; } /* * Check VIRF_DOOMED after we busied our pages. Since * vgonel() terminates the vnode' vm_object, it cannot * process past pages busied by us. */ if (VN_IS_DOOMED(vp)) { error = EJUSTRETURN; goto out; } resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1); if (resid > uio->uio_resid) resid = uio->uio_resid; /* * Unlocked read of vnp_size is safe because truncation cannot * pass busied page. But we load vnp_size into a local * variable so that possible concurrent extension does not * break calculation. 
*/ #if defined(__powerpc__) && !defined(__powerpc64__) vsz = obj->un_pager.vnp.vnp_size; #else vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size); #endif if (uio->uio_offset >= vsz) { error = EJUSTRETURN; goto out; } if (uio->uio_offset + resid > vsz) resid = vsz - uio->uio_offset; error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio); out: for (j = 0; j < i; j++) { if (error == 0) vm_page_reference(ma[j]); vm_page_sunbusy(ma[j]); } out_pip: vm_object_pip_wakeup(obj); if (error != 0) return (error); return (uio->uio_resid == 0 ? 0 : EJUSTRETURN); } /* * File table vnode read routine. */ static int vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; /* * Try to read from page cache. VIRF_DOOMED check is racy but * allows us to avoid unneeded work outright. */ if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() && (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) { error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred); if (error == 0) { fp->f_nextoff[UIO_READ] = uio->uio_offset; return (0); } if (error != EJUSTRETURN) return (error); } advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_READ] = uio->uio_offset; VOP_UNLOCK(vp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag; int advice; bool need_finished_write; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0) ioflag |= IO_APPEND; if ((fp->f_flag & FNONBLOCK) != 0) ioflag |= IO_NDELAY; ioflag |= get_write_ioflag(fp); mp = NULL; need_finished_write = false; if (vp->v_type != VCHR) { error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) goto unlock; need_finished_write = true; } advice = get_advice(fp, uio); vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? 
*/ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_WRITE] = uio->uio_offset; VOP_UNLOCK(vp); if (need_finished_write) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. 
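 * (Illustrative, not original code: per the vn_io_fault() comment
 * above, a filesystem that sets MNTK_NO_IOPF moves data in its
 * VOP_READ()/VOP_WRITE() with something like
 *
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoff,
 *	    xfersize, uio);
 *
 * instead of uiomove(), so the pages held in td_ma are used while
 * pagefaults are disabled; bp, blkoff and xfersize stand for
 * hypothetical locals of such an implementation.)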
*/ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: switch (uio->uio_rw) { case UIO_READ: error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); break; case UIO_WRITE: error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); break; } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. 
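 * Replaying the clone with uio_segflg temporarily set to UIO_NOCOPY
 * (the uiomove(NULL, ...) call below) advances its iovec and offset
 * without touching user memory.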
*/ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > ptoa(io_hold_cnt)) len = ptoa(io_hold_cnt); addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: freeuio(uio_clone); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; bool do_io_fault, do_rangelock; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; /* * The ability to read(2) on a directory has historically been * allowed for all users, but this can and has been the source of * at least one security issue in the past. As such, it is now hidden * away behind a sysctl for those that actually need it to use it, and * restricted to root when it's turned on to make it relatively safe to * leave on for longer sessions of need. */ if (vp->v_type == VDIR) { KASSERT(uio->uio_rw == UIO_READ, ("illegal write attempted on a directory")); if (!vfs_allow_read_dir) return (EISDIR); if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0) return (EISDIR); } do_io_fault = do_vn_io_fault(vp, uio); do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0; foffset_lock_uio(fp, uio, flags); if (do_rangelock) { if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. 
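 * With O_APPEND the offset actually written to is the file size at
 * the time the vnode lock is taken, which is not known yet here, so
 * the whole file is write-locked rather than a byte range.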
*/ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } } if (do_io_fault) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; error = vn_io_fault1(vp, uio, &args, td); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } if (do_rangelock) vn_rangelock_unlock(vp, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? 
uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; retry: /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, fp->f_cred); out: VOP_UNLOCK(vp); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); if (error == ERELOOKUP) goto retry; return (error); } /* * Truncate a file that is already locked. */ int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred) { struct vattr vattr; int error; error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if (sync) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); } return (error); } /* * File table vnode stat routine. */ int vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_STAT(vp, sb, active_cred, fp->f_cred); VOP_UNLOCK(vp); return (error); } /* * File table vnode ioctl routine. */ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vnode *vp; struct fiobmap2_arg *bmarg; off_t size; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: error = vn_getsize(vp, &size, active_cred); if (error == 0) *(int *)data = size - fp->f_offset; return (error); case FIOBMAP2: bmarg = (struct fiobmap2_arg *)data; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_BMAP(vp, bmarg->bn, NULL, &bmarg->bn, &bmarg->runp, &bmarg->runb); VOP_UNLOCK(vp); return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. 
*/ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #if defined(MAC) || defined(AUDIT) if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp); if (error != 0) return (error); } #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ static int __noinline _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line, int error) { KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if (error == 0) VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed")); if ((flags & LK_RETRY) == 0) { if (error == 0) { VOP_UNLOCK(vp); error = ENOENT; } return (error); } /* * LK_RETRY case. * * Nothing to do if we got the lock. */ if (error == 0) return (0); /* * Interlock was dropped by the call in _vn_lock. */ flags &= ~LK_INTERLOCK; do { error = VOP_LOCK1(vp, flags, file, line); } while (error != 0); return (0); } int _vn_lock(struct vnode *vp, int flags, const char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype (%d passed)", flags)); VNPASS(vp->v_holdcnt > 0, vp); error = VOP_LOCK1(vp, flags, file, line); if (__predict_false(error != 0 || VN_IS_DOOMED(vp))) return (_vn_lock_fallback(vp, flags, file, line, error)); return (0); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref = (fp->f_flag & FHASLOCK) != 0; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_refed(struct mount *mp, int flags, bool mplocked) { struct mount_pcpu *mpcpu; int error, mflags; if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1); vfs_op_thread_exit(mp, mpcpu); return (0); } if (mplocked) mtx_assert(MNT_MTX(mp), MA_OWNED); else MNT_ILOCK(mp); error = 0; /* * Check on status of suspension. 
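 * The owner of the suspension, marked with TDP_IGNSUSP, is exempted
 * from the wait so that it can still start writes (e.g. to sync the
 * filesystem) while the suspension is in force.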
*/ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = 0; if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { if (flags & V_PCATCH) mflags |= PCATCH; } mflags |= (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if ((flags & V_NOWAIT) != 0) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error != 0) goto unlock; } } if ((flags & V_XSLEEP) != 0) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & ~V_VALID_FLAGS) == 0, ("%s: invalid flags passed %d\n", __func__, flags)); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ if (vp == NULL) vfs_ref(mp); error = vn_start_write_refed(mp, flags, false); if (error != 0 && (flags & V_NOWAIT) == 0) *mpp = NULL; return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error, mflags; KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0, ("%s: invalid flags passed %d\n", __func__, flags)); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if ((flags & V_NOWAIT) != 0) { MNT_REL(mp); MNT_IUNLOCK(mp); *mpp = NULL; return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ mflags = 0; if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { if ((flags & V_PCATCH) != 0) mflags |= PCATCH; } mflags |= (PUSER - 1) | PDROP; error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; *mpp = NULL; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. 
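 * Illustrative sketch (not part of the original source) of the usual
 * pairing in a caller that modifies a vnode, with vattr and cred
 * prepared by the caller:
 *
 *	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_SETATTR(vp, &vattr, cred);
 *	VOP_UNLOCK(vp);
 *	vn_finished_write(mp);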
*/ void vn_finished_write(struct mount *mp) { struct mount_pcpu *mpcpu; int c; if (mp == NULL) return; if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_writeopcount; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; vfs_op_enter(mp); MNT_ILOCK(mp); vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { vfs_write_resume(mp, 0); /* vfs_write_resume does vfs_op_exit() for us */ } return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); vfs_op_exit(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_refed(mp, 0, true); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). 
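 * That write count has to be dropped before each suspension attempt
 * (otherwise vfs_write_suspend() would wait for our own write to
 * drain) and re-acquired whenever the attempt fails or has to be
 * retried, which is what the loop below does.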
*/ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } int vn_kqfilter_opath(struct file *fp, struct knote *kn) { if ((fp->f_flag & FKQALLOWED) == 0) return (EBADF); return (vn_kqfilter(fp, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, 
vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (VN_IS_DOOMED(vp)) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (error != 0 || *rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } static void vn_send_sigxfsz(struct proc *p) { PROC_LOCK(p); kern_psignal(p, SIGXFSZ); PROC_UNLOCK(p); } int vn_rlimit_trunc(u_quad_t size, struct thread *td) { if (size <= lim_cur(td, RLIMIT_FSIZE)) return (0); vn_send_sigxfsz(td->td_proc); return (EFBIG); } static int vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz, bool adj, struct thread *td) { off_t lim; bool ktr_write; if (vp->v_type != VREG) return (0); /* * Handle file system maximum file size. */ if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) { if (!adj || uio->uio_offset >= maxfsz) return (EFBIG); uio->uio_resid = maxfsz - uio->uio_offset; } /* * This is a kernel write (e.g. vnode_pager) or an accounting * write; ignore the limit. */ if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0) return (0); /* * Calculate file size limit. */ ktr_write = (td->td_pflags & TDP_INKTRACE) != 0; lim = __predict_false(ktr_write) ? td->td_ktr_io_lim : lim_cur(td, RLIMIT_FSIZE); /* * Is the limit reached? */ if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim)) return (0); /* * Prepared filesystems can handle writes truncated to the * file size limit. */ if (adj && (uoff_t)uio->uio_offset < lim) { uio->uio_resid = lim - (uoff_t)uio->uio_offset; return (0); } if (!ktr_write || ktr_filesize_limit_signal) vn_send_sigxfsz(td->td_proc); return (EFBIG); } /* * Helper for VOP_WRITE() implementations, the common code to * handle maximum supported file size on the filesystem, and * RLIMIT_FSIZE, except for special writes from accounting subsystem * and ktrace. * * For maximum file size (maxfsz argument): * - return EFBIG if uio_offset is beyond it * - otherwise, clamp uio_resid if write would extend file beyond maxfsz. * * For RLIMIT_FSIZE: * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit * - otherwise, clamp uio_resid if write would extend file beyond limit. * * If clamping occurred, the adjustment for uio_resid is stored in * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return * from the VOP.
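 *
 * As an illustrative sketch only (not something this interface requires),
 * a VOP_WRITE() implementation would typically bracket its write with the
 * pair of calls, so that a clamped uio_resid is restored for the caller:
 *
 *	error = vn_rlimit_fsizex(vp, uio, fs_maxfilesize, &resid_adj, td);
 *	if (error != 0)
 *		return (error);
 *	(do the actual write with the possibly clamped uio)
 *	vn_rlimit_fsizex_res(uio, resid_adj);
 *
 * Here fs_maxfilesize is a placeholder for whatever per-filesystem limit
 * the caller has, not a name defined in this file.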
*/ int vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz, ssize_t *resid_adj, struct thread *td) { ssize_t resid_orig; int error; bool adj; resid_orig = uio->uio_resid; adj = resid_adj != NULL; error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td); if (adj) *resid_adj = resid_orig - uio->uio_resid; return (error); } void vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj) { uio->uio_resid += resid_adj; } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL, td)); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfown(td, active_cred, vp, uid, gid)); } /* * Remove pages in the range ["start", "end") from the vnode's VM object. If * "end" is 0, then the range extends to the end of the object. */ void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } /* * Like vn_pages_remove(), but skips invalid pages, which by definition are not * mapped into any process' address space. Filesystems may use this in * preference to vn_pages_remove() to avoid blocking on pages busied in * preparation for a VOP_GETPAGES. */ void vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, OBJPR_VALIDONLY); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { off_t size; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked"); if (vp->v_type != VREG) { error = ENOTTY; goto out; } error = vn_getsize_locked(vp, &size, cred); if (error != 0) goto out; noff = *off; if (noff < 0 || noff >= size) { error = ENXIO; goto out; } /* See the comment in ufs_bmap_seekdata(). */ vnode_pager_clean_sync(vp); bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < size; bn++, noff += bsize - noff % bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto out; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto out; } } if (noff > size) noff = size; /* noff == size. There is an implicit hole at the end of file. 
*/ if (cmd == FIOSEEKDATA) error = ENXIO; out: if (error == 0) *off = noff; return (error); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); if (vn_lock(vp, LK_EXCLUSIVE) != 0) return (EBADF); error = vn_bmap_seekhole_locked(vp, cmd, off, cred); VOP_UNLOCK(vp); return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; off_t foffset, fsize, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; noneg = (vp->v_type != VCHR); /* * Try to dodge locking for common case of querying the offset. */ if (whence == L_INCR && offset == 0) { foffset = foffset_read(fp); if (__predict_false(foffset < 0 && noneg)) { return (EOVERFLOW); } td->td_uretoff.tdu_off = foffset; return (0); } foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: error = vn_getsize(vp, &fsize, cred); if (error != 0) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (fsize == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) fsize = size; if (noneg && offset > 0 && fsize > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += fsize; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. 
*/ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ kif->kf_un.kf_file.kf_file_nlink = va.va_nlink; return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. 
* Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; } int vn_fsync_buf(struct vnode *vp, int waitfor) { struct buf *bp, *nbp; struct bufobj *bo; struct mount *mp; int error, maxretry; error = 0; maxretry = 10000; /* large, arbitrarily chosen */ mp = NULL; if (vp->v_type == VCHR) { VI_LOCK(vp); mp = vp->v_rdev->si_mountpt; VI_UNLOCK(vp); } bo = &vp->v_bufobj; BO_LOCK(bo); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); goto loop1; } BO_LOCK(bo); } BO_UNLOCK(bo); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } if (maxretry < 1000) pause("dirty", hz < 1000 ? 1 : hz / 1000); BO_LOCK(bo); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (waitfor == MNT_WAIT) { bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) != 0) break; if ((mp != NULL && mp->mnt_secondary_writes > 0) || (error == 0 && --maxretry >= 0)) goto loop1; if (error == 0) error = EAGAIN; } } BO_UNLOCK(bo); if (error != 0) vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); return (error); } /* * Copies a byte range from invp to outvp. 
Calls VOP_COPY_FILE_RANGE() * or vn_generic_copy_file_range() after rangelocking the byte ranges, * to do the actual copy. * vn_generic_copy_file_range() is factored out, so it can be called * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from * different file systems. */ int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct mount *inmp, *outmp; struct vnode *invpl, *outvpl; int error; size_t len; uint64_t uval; invpl = outvpl = NULL; len = *lenp; *lenp = 0; /* For error returns. */ error = 0; /* Do some sanity checks on the arguments. */ if (invp->v_type == VDIR || outvp->v_type == VDIR) error = EISDIR; else if (*inoffp < 0 || *outoffp < 0 || invp->v_type != VREG || outvp->v_type != VREG) error = EINVAL; if (error != 0) goto out; /* Ensure offset + len does not wrap around. */ uval = *inoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *inoffp; uval = *outoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *outoffp; if (len == 0) goto out; error = VOP_GETLOWVNODE(invp, &invpl, FREAD); if (error != 0) goto out; error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE); if (error != 0) goto out1; inmp = invpl->v_mount; outmp = outvpl->v_mount; if (inmp == NULL || outmp == NULL) goto out2; for (;;) { error = vfs_busy(inmp, 0); if (error != 0) goto out2; if (inmp == outmp) break; error = vfs_busy(outmp, MBF_NOWAIT); if (error != 0) { vfs_unbusy(inmp); error = vfs_busy(outmp, 0); if (error == 0) { vfs_unbusy(outmp); continue; } goto out2; } break; } /* * If the two vnodes are for the same file system type, call * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() * which can handle copies across multiple file system types. */ *lenp = len; if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc) error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp, lenp, flags, incred, outcred, fsize_td); else error = ENOSYS; if (error == ENOSYS) error = vn_generic_copy_file_range(invpl, inoffp, outvpl, outoffp, lenp, flags, incred, outcred, fsize_td); vfs_unbusy(outmp); if (inmp != outmp) vfs_unbusy(inmp); out2: if (outvpl != NULL) vrele(outvpl); out1: if (invpl != NULL) vrele(invpl); out: return (error); } /* * Test len bytes of data starting at dat for all bytes == 0. * Return true if all bytes are zero, false otherwise. * Expects dat to be well aligned. */ static bool mem_iszero(void *dat, int len) { int i; const u_int *p; const char *cp; for (p = dat; len > 0; len -= sizeof(*p), p++) { if (len >= sizeof(*p)) { if (*p != 0) return (false); } else { cp = (const char *)p; for (i = 0; i < len; i++, cp++) if (*cp != '\0') return (false); } } return (true); } /* * Look for a hole in the output file and, if found, adjust *outoffp * and *xferp to skip past the hole. * *xferp is the entire hole length to be written and xfer2 is how many bytes * to be written as 0's upon return. 
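 *
 * A worked example with made-up numbers may help: suppose *outoffp is 100K,
 * *xferp is 1M and xfer2 is 64K.  If the output file already has a hole
 * covering [100K, 228K), this routine advances *outoffp to 228K and trims
 * *xferp to 896K, since nothing has to be written over an existing hole.
 * If the next hole then starts at 260K, xfer2 is cut down to 32K so that
 * the pending write of zero bytes ends where that hole begins.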
*/ static off_t vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, off_t *dataoffp, off_t *holeoffp, struct ucred *cred) { int error; off_t delta; if (*holeoffp == 0 || *holeoffp <= *outoffp) { *dataoffp = *outoffp; error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, curthread); if (error == 0) { *holeoffp = *dataoffp; error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, curthread); } if (error != 0 || *holeoffp == *dataoffp) { /* * Since outvp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, *holeoffp == *dataoffp and finding * the hole has failed, so disable vn_skip_hole(). */ *holeoffp = -1; /* Disable use of vn_skip_hole(). */ return (xfer2); } KASSERT(*dataoffp >= *outoffp, ("vn_skip_hole: dataoff=%jd < outoff=%jd", (intmax_t)*dataoffp, (intmax_t)*outoffp)); KASSERT(*holeoffp > *dataoffp, ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", (intmax_t)*holeoffp, (intmax_t)*dataoffp)); } /* * If there is a hole before the data starts, advance *outoffp and * *xferp past the hole. */ if (*dataoffp > *outoffp) { delta = *dataoffp - *outoffp; if (delta >= *xferp) { /* Entire *xferp is a hole. */ *outoffp += *xferp; *xferp = 0; return (0); } *xferp -= delta; *outoffp += delta; xfer2 = MIN(xfer2, *xferp); } /* * If a hole starts before the end of this xfer2, reduce this xfer2 so * that the write ends at the start of the hole. * *holeoffp should always be greater than *outoffp, but for the * non-INVARIANTS case, check this to make sure xfer2 remains a sane * value. */ if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) xfer2 = *holeoffp - *outoffp; return (xfer2); } /* * Write an xfer sized chunk to outvp in blksize blocks from dat. * dat is a maximum of blksize in length and can be written repeatedly in * the chunk. * If growfile == true, just grow the file via vn_truncate_locked() instead * of doing actual writes. * If checkhole == true, a hole is being punched, so skip over any hole * already in the output file. */ static int vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, u_long blksize, bool growfile, bool checkhole, struct ucred *cred) { struct mount *mp; off_t dataoff, holeoff, xfer2; int error; /* * Loop around doing writes of blksize until write has been completed. * Lock/unlock on each loop iteration so that a bwillwrite() can be * done for each iteration, since the xfer argument can be very * large if there is a large hole to punch in the output file. */ error = 0; holeoff = 0; do { xfer2 = MIN(xfer, blksize); if (checkhole) { /* * Punching a hole. Skip writing if there is * already a hole in the output file. 
*/ xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, &dataoff, &holeoff, cred); if (xfer == 0) break; if (holeoff < 0) checkhole = false; KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", (intmax_t)xfer2)); } bwillwrite(); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error != 0) break; if (growfile) { error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { error = vn_truncate_locked(outvp, outoff + xfer, false, cred); VOP_UNLOCK(outvp); } } else { error = vn_lock(outvp, vn_lktype_write(mp, outvp)); if (error == 0) { error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, cred, NULL, curthread); outoff += xfer2; xfer -= xfer2; VOP_UNLOCK(outvp); } } if (mp != NULL) vn_finished_write(mp); } while (!growfile && xfer > 0 && error == 0); return (error); } /* * Copy a byte range of one file to another. This function can handle the * case where invp and outvp are on different file systems. * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there * is no better file system specific way to do it. */ int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr inva; struct mount *mp; off_t startoff, endoff, xfer, xfer2; u_long blksize; int error, interrupted; bool cantseek, readzeros, eof, first, lastblock, holetoeof, sparse; ssize_t aresid, r = 0; size_t copylen, len, savlen; off_t outsize; char *dat; long holein, holeout; struct timespec curts, endts; holein = holeout = 0; savlen = len = *lenp; error = 0; interrupted = 0; dat = NULL; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) holein = 0; error = VOP_GETATTR(invp, &inva, incred); if (error == 0 && inva.va_size > OFF_MAX) error = EFBIG; VOP_UNLOCK(invp); if (error != 0) goto out; /* * Use va_bytes >= va_size as a hint that the file does not have * sufficient holes to justify the overhead of doing FIOSEEKHOLE. * This hint does not work well for file systems doing compression * and may fail when allocations for extended attributes increases * the value of va_bytes to >= va_size. */ sparse = true; if (holein != 0 && inva.va_bytes >= inva.va_size) { holein = 0; sparse = false; } mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { /* * If fsize_td != NULL, do a vn_rlimit_fsizex() call, * now that outvp is locked. */ if (fsize_td != NULL) { struct uio io; io.uio_offset = *outoffp; io.uio_resid = len; error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td); len = savlen = io.uio_resid; /* * No need to call vn_rlimit_fsizex_res before return, * since the uio is local. */ } if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) holeout = 0; /* * Holes that are past EOF do not need to be written as a block * of zero bytes. So, truncate the output file as far as * possible and then use size to decide if writing 0 * bytes is necessary in the loop below. 
*/ if (error == 0) error = vn_getsize_locked(outvp, &outsize, outcred); if (error == 0 && outsize > *outoffp && *outoffp <= OFF_MAX - len && outsize <= *outoffp + len && *inoffp < inva.va_size && *outoffp <= OFF_MAX - (inva.va_size - *inoffp) && outsize <= *outoffp + (inva.va_size - *inoffp)) { #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, outcred, outvp); if (error == 0) #endif error = vn_truncate_locked(outvp, *outoffp, false, outcred); if (error == 0) outsize = *outoffp; } VOP_UNLOCK(outvp); } if (mp != NULL) vn_finished_write(mp); if (error != 0) goto out; if (sparse && holein == 0 && holeout > 0) { /* * For this special case, the input data will be scanned * for blocks of all 0 bytes. For these blocks, the * write can be skipped for the output file to create * an unallocated region. * Therefore, use the appropriate size for the output file. */ blksize = holeout; if (blksize <= 512) { /* * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE * of 512, although it actually only creates * unallocated regions for blocks >= f_iosize. */ blksize = outvp->v_mount->mnt_stat.f_iosize; } } else { /* * Use the larger of the two f_iosize values. If they are * not the same size, one will normally be an exact multiple of * the other, since they are both likely to be a power of 2. */ blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize); } /* Clip to sane limits. */ if (blksize < 4096) blksize = 4096; else if (blksize > maxphys) blksize = maxphys; dat = malloc(blksize, M_TEMP, M_WAITOK); /* * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA * to find holes. Otherwise, just scan the read block for all 0s * in the inner loop where the data copying is done. * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may * support holes on the server, but do not support FIOSEEKHOLE. * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate * that this function should return after 1second with a partial * completion. */ if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) { getnanouptime(&endts); endts.tv_sec++; } else timespecclear(&endts); first = true; holetoeof = eof = false; while (len > 0 && error == 0 && !eof && interrupted == 0) { endoff = 0; /* To shut up compilers. */ cantseek = true; startoff = *inoffp; copylen = len; /* * Find the next data area. If there is just a hole to EOF, * FIOSEEKDATA should fail with ENXIO. * (I do not know if any file system will report a hole to * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA * will fail for those file systems.) * * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, * the code just falls through to the inner copy loop. */ error = EINVAL; if (holein > 0) { error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, incred, curthread); if (error == ENXIO) { startoff = endoff = inva.va_size; eof = holetoeof = true; error = 0; } } if (error == 0 && !holetoeof) { endoff = startoff; error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, incred, curthread); /* * Since invp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, startoff == endoff and finding * the hole has failed, so set an error. */ if (error == 0 && startoff == endoff) error = EINVAL; /* Any error. Reset to 0. */ } if (error == 0) { if (startoff > *inoffp) { /* Found hole before data block. 
*/ xfer = MIN(startoff - *inoffp, len); if (*outoffp < outsize) { /* Must write 0s to punch hole. */ xfer2 = MIN(outsize - *outoffp, xfer); memset(dat, 0, MIN(xfer2, blksize)); error = vn_write_outvp(outvp, dat, *outoffp, xfer2, blksize, false, holeout > 0, outcred); } if (error == 0 && *outoffp + xfer > outsize && (xfer == len || holetoeof)) { /* Grow output file (hole at end). */ error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, true, false, outcred); } if (error == 0) { *inoffp += xfer; *outoffp += xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } copylen = MIN(len, endoff - startoff); cantseek = false; } else { cantseek = true; if (!sparse) cantseek = false; startoff = *inoffp; copylen = len; error = 0; } xfer = blksize; if (cantseek) { /* * Set first xfer to end at a block boundary, so that * holes are more likely detected in the loop below via * the for all bytes 0 method. */ xfer -= (*inoffp % blksize); } /* * Loop copying the data block. If this was our first attempt * to copy anything, allow a zero-length block so that the VOPs * get a chance to update metadata, specifically the atime. */ while (error == 0 && ((copylen > 0 && !eof) || first) && interrupted == 0) { if (copylen < xfer) xfer = copylen; first = false; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, incred, &aresid, curthread); VOP_UNLOCK(invp); lastblock = false; if (error == 0 && (xfer == 0 || aresid > 0)) { /* Stop the copy at EOF on the input file. */ xfer -= aresid; eof = true; lastblock = true; } if (error == 0) { /* * Skip the write for holes past the initial EOF * of the output file, unless this is the last * write of the output file at EOF. */ readzeros = cantseek ? mem_iszero(dat, xfer) : false; if (xfer == len) lastblock = true; if (!cantseek || *outoffp < outsize || lastblock || !readzeros) error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, readzeros && lastblock && *outoffp >= outsize, false, outcred); if (error == 0) { *inoffp += xfer; startoff += xfer; *outoffp += xfer; copylen -= xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } xfer = blksize; } } out: *lenp = savlen - len; free(dat, M_TEMP); return (error); } static int vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { struct mount *mp; struct vnode *vp; off_t olen, ooffset; int error; #ifdef AUDIT int audited_vnode1 = 0; #endif vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); /* Allocating blocks may take a long time, so iterate. 
for (;;) { olen = len; ooffset = offset; bwillwrite(); mp = NULL; error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) break; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { vn_finished_write(mp); break; } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = 1; } #endif #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_ALLOCATE(vp, &offset, &len, 0, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); if (olen + ooffset != offset + len) { panic("offset + len changed from %jx/%jx to %jx/%jx", ooffset, olen, offset, len); } if (error != 0 || len == 0) break; KASSERT(olen > len, ("Iteration did not make progress?")); maybe_yield(); } return (error); } static int vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *cred, struct ucred *active_cred, struct ucred *file_cred) { struct mount *mp; void *rl_cookie; off_t off, len; int error; #ifdef AUDIT bool audited_vnode1 = false; #endif rl_cookie = NULL; error = 0; mp = NULL; off = *offset; len = *length; if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0) rl_cookie = vn_rangelock_wlock(vp, off, off + len); while (len > 0 && error == 0) { /* * Try to deallocate the longest range in one pass. * If a pass takes too long to execute, it returns a * partial result. The residue will be processed in the next * pass. */ if ((ioflag & IO_NODELOCKED) == 0) { bwillwrite(); if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) goto out; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = true; } #endif #ifdef MAC if ((ioflag & IO_NOMACCHECK) == 0) error = mac_vnode_check_write(active_cred, file_cred, vp); #endif if (error == 0) error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag, cred); if ((ioflag & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) { vn_finished_write(mp); mp = NULL; } } if (error == 0 && len != 0) maybe_yield(); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); *offset = off; *length = len; return (error); } /* * This function is intended for situations where the deallocation * is not triggered by a user request. */ int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *active_cred, struct ucred *file_cred) { struct ucred *cred; if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || flags != 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); cred = file_cred != NOCRED ?
file_cred : active_cred; return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred, active_cred, file_cred)); } static int vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { int error; struct vnode *vp; int ioflag; KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd")); KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, ("vn_fspacectl: non-zero flags")); KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, ("vn_fspacectl: offset/length overflow or underflow")); vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); ioflag = get_write_ioflag(fp); switch (cmd) { case SPACECTL_DEALLOC: error = vn_deallocate_impl(vp, offset, length, flags, ioflag, active_cred, active_cred, fp->f_cred); break; default: panic("vn_fspacectl: unknown cmd %d", cmd); } return (error); } /* * Keep this assert as long as sizeof(struct dirent) is used as the maximum * entry size. */ _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent), "'struct dirent' size must be a multiple of its alignment " "(see _GENERIC_DIRLEN())"); /* * Returns successive directory entries through some caller's provided buffer. * * This function automatically refills the provided buffer with calls to * VOP_READDIR() (after MAC permission checks). * * 'td' is used for credentials and passed to uiomove(). 'dirbuf' is the * caller's buffer to fill and 'dirbuflen' its allocated size. 'dirbuf' must * be properly aligned to access 'struct dirent' structures and 'dirbuflen' * must be greater than GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning * EINVAL (the latter is not a strong guarantee (yet); but EINVAL will always * be returned if this requirement is not verified). '*dpp' points to the * current directory entry in the buffer and '*len' contains the remaining * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry). * * At first call (or when restarting the read), '*len' must have been set to 0, * '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are no * more entries as soon as '*len' is 0 after a call that returned 0. Calling * again this function after such a condition is considered an error and EINVAL * will be returned. Other possible error codes are those of VOP_READDIR(), * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL * (bad call). All errors are unrecoverable, i.e., the state ('*len', '*off' * and '*eofflag') must be re-initialized before a subsequent call. On error * or at end of directory, '*dpp' is reset to NULL. * * '*len', '*off' and '*eofflag' are internal state the caller should not * tamper with except as explained above. '*off' is the next directory offset * to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the last * internal call to VOP_READDIR() that returned without error, indicating * whether it reached the end of the directory, and to 2 by this function after * all entries have been read. 
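 *
 * For illustration only, a typical caller therefore loops along these lines
 * (vn_dir_check_empty() below is the in-tree example of this pattern):
 *
 *	len = 0; off = 0; eofflag = 0;
 *	for (;;) {
 *		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
 *		    &dp, &len, &off, &eofflag);
 *		if (error != 0 || len == 0)
 *			break;	(len == 0 means end of directory)
 *		(examine *dp)
 *	}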
*/ int vn_dir_next_dirent(struct vnode *vp, struct thread *td, char *dirbuf, size_t dirbuflen, struct dirent **dpp, size_t *len, off_t *off, int *eofflag) { struct dirent *dp = NULL; int reclen; int error; struct uio uio; struct iovec iov; ASSERT_VOP_LOCKED(vp, "vnode not locked"); VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory")); MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen, "Address space overflow"); if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) { /* Don't take any chances in this case */ error = EINVAL; goto out; } if (*len != 0) { dp = *dpp; /* * The caller continued to call us after an error (we set dp to * NULL in a previous iteration). Bail out right now. */ if (__predict_false(dp == NULL)) return (EINVAL); MPASS(*len <= dirbuflen); MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp && (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen, "Filled range not inside buffer"); reclen = dp->d_reclen; if (reclen >= *len) { /* End of buffer reached */ *len = 0; } else { dp = (struct dirent *)((char *)dp + reclen); *len -= reclen; } } if (*len == 0) { dp = NULL; /* Have to refill. */ switch (*eofflag) { case 0: break; case 1: /* Nothing more to read. */ *eofflag = 2; /* Remember the caller reached EOF. */ goto success; default: /* The caller didn't test for EOF. */ error = EINVAL; goto out; } iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = *off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, NULL, NULL); if (error != 0) goto out; *len = dirbuflen - uio.uio_resid; *off = uio.uio_offset; if (*len == 0) { /* Sanity check on INVARIANTS. */ MPASS(*eofflag != 0); *eofflag = 1; goto success; } /* * Normalize the flag returned by VOP_READDIR(), since we use 2 * as a sentinel value. */ if (*eofflag != 0) *eofflag = 1; dp = (struct dirent *)dirbuf; } if (__predict_false(*len < GENERIC_MINDIRSIZ || dp->d_reclen < GENERIC_MINDIRSIZ)) { error = EINTEGRITY; dp = NULL; goto out; } success: error = 0; out: *dpp = dp; return (error); } /* * Checks whether a directory is empty or not. * * If the directory is empty, returns 0, and if it is not, ENOTEMPTY. Other * values are genuine errors preventing the check. */ int vn_dir_check_empty(struct vnode *vp) { struct thread *const td = curthread; char *dirbuf; size_t dirbuflen, len; off_t off; int eofflag, error; struct dirent *dp; struct vattr va; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNPASS(vp->v_type == VDIR, vp); error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) return (error); dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ); if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); len = 0; off = 0; eofflag = 0; for (;;) { error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen, &dp, &len, &off, &eofflag); if (error != 0) goto end; if (len == 0) { /* EOF */ error = 0; goto end; } /* * Skip whiteouts. Unionfs operates on filesystems only and * not on hierarchies, so these whiteouts would be shadowed on * the system hierarchy but not for a union using the * filesystem of their directories as the upper layer. * Additionally, unionfs currently transparently exposes * union-specific metadata of its upper layer, meaning that * whiteouts can be seen through the union view in empty * directories. 
Taking into account these whiteouts would then * prevent mounting another filesystem on such effectively * empty directories. */ if (dp->d_type == DT_WHT) continue; /* * Any file in the directory which is not '.' or '..' indicates * the directory is not empty. */ switch (dp->d_namlen) { case 2: if (dp->d_name[1] != '.') { /* Can't be '..' (nor '.') */ error = ENOTEMPTY; goto end; } /* FALLTHROUGH */ case 1: if (dp->d_name[0] != '.') { /* Can't be '..' nor '.' */ error = ENOTEMPTY; goto end; } break; default: error = ENOTEMPTY; goto end; } } end: free(dirbuf, M_TEMP); return (error); } static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, "Count of vn_lock_pair deadlocks"); u_int vn_lock_pair_pause_max; SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW, &vn_lock_pair_pause_max, 0, "Max ticks for vn_lock_pair deadlock avoidance sleep"); static void vn_lock_pair_pause(const char *wmesg) { atomic_add_long(&vn_lock_pair_pause_cnt, 1); pause(wmesg, prng32_bounded(vn_lock_pair_pause_max)); } /* * Lock pair of (possibly same) vnodes vp1, vp2, avoiding lock order * reversal. vp1_locked indicates whether vp1 is locked; if not, vp1 * must be unlocked. Same for vp2 and vp2_locked. One of the vnodes * can be NULL. * * The function returns with both vnodes exclusively or shared locked, * according to corresponding lkflags, and guarantees that it does not * create lock order reversal with other threads during its execution. * Both vnodes could be unlocked temporary (and reclaimed). * * If requesting shared locking, locked vnode lock must not be recursed. * * Only one of LK_SHARED and LK_EXCLUSIVE must be specified. * LK_NODDLKTREAT can be optionally passed. * * If vp1 == vp2, only one, most exclusive, lock is obtained on it. */ void vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1, struct vnode *vp2, bool vp2_locked, int lkflags2) { int error, locked1; MPASS((((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0)) || (vp1 == NULL && lkflags1 == 0)); MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0); MPASS((((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0)) || (vp2 == NULL && lkflags2 == 0)); MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0); if (vp1 == NULL && vp2 == NULL) return; if (vp1 == vp2) { MPASS(vp1_locked == vp2_locked); /* Select the most exclusive mode for lock. */ if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK)) lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp1_locked) { ASSERT_VOP_LOCKED(vp1, "vp1"); /* No need to relock if any lock is exclusive. 
*/ if ((vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) return; locked1 = VOP_ISLOCKED(vp1); if (((lkflags1 & LK_SHARED) != 0 && locked1 != LK_EXCLUSIVE) || ((lkflags1 & LK_EXCLUSIVE) != 0 && locked1 == LK_EXCLUSIVE)) return; VOP_UNLOCK(vp1); } ASSERT_VOP_UNLOCKED(vp1, "vp1"); vn_lock(vp1, lkflags1 | LK_RETRY); return; } if (vp1 != NULL) { if ((lkflags1 & LK_SHARED) != 0 && (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) { ASSERT_VOP_LOCKED(vp1, "vp1"); if ((lkflags1 & LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp1); ASSERT_VOP_UNLOCKED(vp1, "vp1 shared recursed"); vp1_locked = false; } } else if (!vp1_locked) ASSERT_VOP_UNLOCKED(vp1, "vp1"); } else { vp1_locked = true; } if (vp2 != NULL) { if ((lkflags2 & LK_SHARED) != 0 && (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) { ASSERT_VOP_LOCKED(vp2, "vp2"); if ((lkflags2 & LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp2); ASSERT_VOP_UNLOCKED(vp2, "vp2 shared recursed"); vp2_locked = false; } } else if (!vp2_locked) ASSERT_VOP_UNLOCKED(vp2, "vp2"); } else { vp2_locked = true; } if (!vp1_locked && !vp2_locked) { vn_lock(vp1, lkflags1 | LK_RETRY); vp1_locked = true; } while (!vp1_locked || !vp2_locked) { if (vp1_locked && vp2 != NULL) { if (vp1 != NULL) { error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp1); vp1_locked = false; vn_lock_pair_pause("vlp1"); } vn_lock(vp2, lkflags2 | LK_RETRY); vp2_locked = true; } if (vp2_locked && vp1 != NULL) { if (vp2 != NULL) { error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp2); vp2_locked = false; vn_lock_pair_pause("vlp2"); } vn_lock(vp1, lkflags1 | LK_RETRY); vp1_locked = true; } } if (vp1 != NULL) { if (lkflags1 == LK_EXCLUSIVE) ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); else ASSERT_VOP_LOCKED(vp1, "vp1 ret"); } if (vp2 != NULL) { if (lkflags2 == LK_EXCLUSIVE) ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); else ASSERT_VOP_LOCKED(vp2, "vp2 ret"); } } int vn_lktype_write(struct mount *mp, struct vnode *vp) { if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) return (LK_SHARED); return (LK_EXCLUSIVE); } int vn_cmp(struct file *fp1, struct file *fp2, struct thread *td) { if (fp2->f_type != DTYPE_VNODE) return (3); return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode)); } diff --git a/sys/net/bpf.c b/sys/net/bpf.c index c0631591a10e..8a68f65a80f7 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -1,3221 +1,3221 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2019 Andrey V. Elsukov * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include #include "opt_bpf.h" #include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static const struct bpf_if_ext dead_bpf_if = { .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ volatile u_int bif_refcnt; struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); struct bpf_program_buffer { struct epoch_context epoch_ctx; #ifdef BPF_JITTER bpf_jit_filter *func; #endif void *buffer[0]; }; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define BPF_PRIO_MAX 7 #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. 
*/ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif #define BPF_LOCK() sx_xlock(&bpf_sx) #define BPF_UNLOCK() sx_xunlock(&bpf_sx) #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ CK_LIST_HEAD(bpf_iflist, bpf_if); static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpfif_ref(struct bpf_if *); static void bpfif_rele(struct bpf_if *); static void bpfd_ref(struct bpf_d *); static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *, bool); static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static int filt_bpfwrite(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = 
bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; -static struct filterops bpfread_filtops = { +static const struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; -static struct filterops bpfwrite_filtops = { +static const struct filterops bpfwrite_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfwrite, }; /* * LOCKING MODEL USED BY BPF * * Locks: * 1) global lock (BPF_LOCK). An sx lock used to protect some global counters, * all bpf_iflist changes, and to serialize ioctl access to bpf descriptors. * 2) Descriptor lock. A mutex used to protect BPF buffers and various * structure fields used by bpf_*tap* code. * * Lock order: global lock, then descriptor lock. * * There are several possible consumers: * * 1. The kernel registers an interface pointer with bpfattach(). * Each call allocates a new bpf_if structure, references the ifnet pointer * and links the bpf_if into the bpf_iflist chain. This is protected with the * global lock. * * 2. A userland application issues ioctl() calls on a bpf_d descriptor. * All such calls are serialized with the global lock. BPF filters can be * changed, but the pointer to the old filter will be freed using * NET_EPOCH_CALL(). Thus it should be safe for bpf_tap/bpf_mtap* code to * access filter pointers, even if a change happens during bpf_tap execution. * Destroying a bpf_d descriptor is also done using NET_EPOCH_CALL(). * * 3. A userland application can write packets into a bpf_d descriptor. * There we need to be sure that the ifnet won't disappear during bpfwrite(). * * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. The access to * bif_dlist is protected by a net_epoch_preempt section, so it should * be safe to access a bpf_d descriptor inside the section. * * 5. The kernel invokes bpfdetach() on interface destruction. All lists * are modified with the global lock held and the actual free() is done using * NET_EPOCH_CALL(). */ static void bpfif_free(epoch_context_t ctx) { struct bpf_if *bp; bp = __containerof(ctx, struct bpf_if, epoch_ctx); if_rele(bp->bif_ifp); free(bp, M_BPF); } static void bpfif_ref(struct bpf_if *bp) { refcount_acquire(&bp->bif_refcnt); } static void bpfif_rele(struct bpf_if *bp) { if (!refcount_release(&bp->bif_refcnt)) return; NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx); } static void bpfd_ref(struct bpf_d *d) { refcount_acquire(&d->bd_refcnt); } static void bpfd_rele(struct bpf_d *d) { if (!refcount_release(&d->bd_refcnt)) return; NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx); } static struct bpf_program_buffer* bpf_program_buffer_alloc(size_t size, int flags) { return (malloc(sizeof(struct bpf_program_buffer) + size, M_BPF, flags)); } static void bpf_program_buffer_free(epoch_context_t ctx) { struct bpf_program_buffer *ptr; ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); #ifdef BPF_JITTER if (ptr->func != NULL) bpf_destroy_jit_filter(ptr->func); #endif free(ptr, M_BPF); } /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, etc.
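 *
 * Purely as a hypothetical sketch (none of the names below exist in this
 * file today), such a table could look like:
 *
 *	struct bpf_bufops {
 *		void	(*bo_append_bytes)(struct bpf_d *, caddr_t, u_int,
 *			    void *, u_int);
 *		void	(*bo_append_mbuf)(struct bpf_d *, caddr_t, u_int,
 *			    void *, u_int);
 *		int	(*bo_canfreebuf)(struct bpf_d *);
 *		void	(*bo_free)(struct bpf_d *);
 *	};
 *
 * Each buffer mode would then supply one instance instead of adding a case
 * to every switch statement below.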
*/ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. 
*/ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); /* Allocate a mbuf, up to MJUM16BYTES bytes, for our write. */ m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach descriptor to the bpf interface, i.e. make d listen on bp, * then reset its buffers and counters with reset_d(). 
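On the write side, bpf_movein() expects the caller to prepend the link-level header itself for DLT_EN10MB, enforces the MTU-plus-header size limit, and, unless "header complete" mode is enabled, overwrites the source MAC with the interface address. A hedged userland sketch of injecting one broadcast frame; the interface name "em0", the zeroed source MAC and the experimental EtherType are made-up examples:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	unsigned char frame[ETHER_HDR_LEN + 64];
	struct ether_header *eh = (struct ether_header *)frame;
	struct ifreq ifr;
	u_int one = 1;
	int fd;

	fd = open("/dev/bpf", O_RDWR);

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* example NIC */
	ioctl(fd, BIOCSETIF, &ifr);

	/* Keep our hand-built source MAC instead of the interface's. */
	ioctl(fd, BIOCSHDRCMPLT, &one);

	memset(frame, 0, sizeof(frame));
	memset(eh->ether_dhost, 0xff, ETHER_ADDR_LEN);	/* broadcast destination */
	eh->ether_type = htons(0x88b5);			/* example EtherType */

	if (write(fd, frame, sizeof(frame)) < 0)
		perror("write");	/* EMSGSIZE if larger than MTU + header */

	close(fd);
	return (0);
}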
*/ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFD_LOCK(d); /* * Hold reference to bpif while descriptor uses this interface. */ bpfif_ref(bp); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that appliation usually sets its own * filter. */ d->bd_writer = 2; } else CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); reset_d(d); /* Trigger EVFILT_WRITE events. */ bpf_wakeup(d); BPFD_UNLOCK(d); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { struct bpf_if *bp; struct ifnet *ifp; int error; BPF_LOCK_ASSERT(); CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFD_LOCK(d); /* Remove d from the interface's descriptor list. */ CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; ifp = bp->bif_ifp; d->bd_bif = NULL; if (detached_ifp) { /* * Notify descriptor as it's detached, so that any * sleepers wake up and get ENXIO. */ bpf_wakeup(d); } BPFD_UNLOCK(d); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. 
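The heuristic in bpf_check_upgrade() keys on the very first filter libpcap installs: a single "return snaplen" instruction set by pcap_open_live() before the real filter arrives. Expressed with the classic macros (the 65535 snaplen is an arbitrary example), that program is just:

#include <net/bpf.h>

/* The one-instruction program bpf_check_upgrade() treats as "snaplen only". */
static struct bpf_insn snap_only[] = {
	BPF_STMT(BPF_RET | BPF_K, 65535),	/* accept, capture up to 65535 bytes */
};

static struct bpf_program snap_prog = {
	.bf_len = 1,			/* flen == 1 ...                        */
	.bf_insns = snap_only,		/* ... and code == BPF_RET | BPF_K      */
};

A write-capable descriptor stays on the writers-only list after this first BIOCSETF; installing any other filter, or a second one, upgrades it to the active readers list and fires the bpf_track event handler.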
* If so and ifnet is not detached, turn it off. */ if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } bpfif_rele(bp); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpfd_rele(d); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* Setup counters */ d->bd_rcount = counter_u64_alloc(M_WAITOK); d->bd_dcount = counter_u64_alloc(M_WAITOK); d->bd_fcount = counter_u64_alloc(M_WAITOK); d->bd_wcount = counter_u64_alloc(M_WAITOK); d->bd_wfcount = counter_u64_alloc(M_WAITOK); d->bd_wdcount = counter_u64_alloc(M_WAITOK); d->bd_zcopy = counter_u64_alloc(M_WAITOK); /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; refcount_init(&d->bd_refcnt, 1); BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); /* Disable VLAN pcp tagging. */ d->bd_pcp = 0; return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET | PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. 
*/ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET | PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct route ro; struct sockaddr dst; struct epoch_tracker et; struct bpf_if *bp; struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); NET_EPOCH_ENTER(et); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); if ((bp = d->bd_bif) == NULL) { error = ENXIO; goto out_locked; } ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto out_locked; } if (uio->uio_resid == 0) goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* * Take extra reference, unlock d and exit from epoch section, * since bpf_movein() can sleep. 
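Because bpfread() insists the user buffer match the kernel buffer size exactly, and a single read() can return many packets, the usual consumer queries BIOCGBLEN first and then walks bpf headers through the returned block. A minimal, hedged read loop ("em0" is an example interface; the default timestamp format, which yields struct bpf_hdr records, is assumed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	u_int blen, imm = 1;
	char *buf;
	ssize_t n;
	int fd;

	fd = open("/dev/bpf", O_RDONLY);

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* example NIC */
	ioctl(fd, BIOCSETIF, &ifr);
	ioctl(fd, BIOCIMMEDIATE, &imm);		/* don't wait for a full buffer */
	ioctl(fd, BIOCGBLEN, &blen);		/* read() length must equal this */

	buf = malloc(blen);
	while ((n = read(fd, buf, blen)) > 0) {
		char *p = buf;

		while (p < buf + n) {
			struct bpf_hdr *bh = (struct bpf_hdr *)p;

			printf("captured %u of %u bytes\n",
			    bh->bh_caplen, bh->bh_datalen);
			/* Packet data starts at p + bh->bh_hdrlen. */
			p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
		}
	}
	return (0);
}

A read() of zero bytes indicates a timeout with nothing buffered; ENXIO means the descriptor lost its interface and must be rebound with BIOCSETIF.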
*/ bpfd_ref(d); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); if (error != 0) { counter_u64_add(d->bd_wdcount, 1); bpfd_rele(d); return (error); } BPFD_LOCK(d); /* * Check that descriptor is still attached to the interface. * This can happen on bpfdetach(). To avoid access to detached * ifnet, free mbuf and return ENXIO. */ if (d->bd_bif == NULL) { counter_u64_add(d->bd_wdcount, 1); BPFD_UNLOCK(d); bpfd_rele(d); m_freem(m); return (ENXIO); } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } if (d->bd_pcp != 0) vlan_set_pcp(m, d->bd_pcp); /* Avoid possible recursion on BPFD_LOCK(). */ NET_EPOCH_ENTER(et); BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); bpfd_rele(d); return (error); out_locked: counter_u64_add(d->bd_wdcount, 1); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; counter_u64_zero(d->bd_rcount); counter_u64_zero(d->bd_dcount); counter_u64_zero(d->bd_fcount); counter_u64_zero(d->bd_wcount); counter_u64_zero(d->bd_wfcount); counter_u64_zero(d->bd_wdcount); counter_u64_zero(d->bd_zcopy); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. 
* BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. * BIOCSETVLANPCP Set VLAN PCP tag. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: BPF_LOCK(); if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; } else if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } BPF_UNLOCK(); break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. 
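BIOCSETF, BIOCSETFNR and BIOCSETWF all funnel into bpf_setf(); they differ only in whether the descriptor is flushed and whether the read or write filter is replaced. A small, hedged example of a read filter that accepts only IPv4 Ethernet frames (offset 12 assumes DLT_EN10MB; 65535 is an arbitrary snapshot length):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <net/ethernet.h>

/* Accept IPv4 frames (up to 65535 bytes), drop everything else. */
static struct bpf_insn ipv4_only[] = {
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),			/* load EtherType */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETHERTYPE_IP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 65535),			/* match: accept  */
	BPF_STMT(BPF_RET | BPF_K, 0),				/* no match: drop */
};

static struct bpf_program ipv4_prog = {
	.bf_len = sizeof(ipv4_only) / sizeof(ipv4_only[0]),
	.bf_insns = ipv4_only,
};

/* After BIOCSETIF:  ioctl(fd, BIOCSETF, &ipv4_prog);                      */
/* BIOCSETFNR installs the same filter without resetting buffers/counters. */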
*/ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount); bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount); break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 
1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCSETVLANPCP: { u_int pcp; pcp = *(u_int *)addr; if (pcp > BPF_PRIO_MAX || pcp < 0) { error = EINVAL; break; } d->bd_pcp = pcp; break; } } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we use global lock here to serialize bpf_setf() and bpf_setif() * calls. 
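Several of these knobs only shape the capture path: the read timeout is stored as ticks (tvtohz() minus one), BIOCSDIRECTION decides which way packets must travel to be seen, BIOCSTSTAMP selects the timestamp format and clock that catchpacket() later encodes, and BIOCGSTATS truncates the 64-bit counters to u_int. A hedged sketch combining them (the values are examples, and fd is assumed to be an already bound descriptor):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdio.h>

static void
tune_capture(int fd)
{
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
	u_int dir = BPF_D_IN;				/* inbound packets only   */
	u_int ts = BPF_T_NANOTIME | BPF_T_MONOTONIC;	/* ns on the uptime clock */
	struct bpf_stat bs;

	ioctl(fd, BIOCSRTIMEOUT, &tv);	/* wake readers at least once a second */
	ioctl(fd, BIOCSDIRECTION, &dir);
	if (ioctl(fd, BIOCSTSTAMP, &ts) < 0)
		perror("BIOCSTSTAMP");	/* EINVAL if BPF_T_VALID() rejects it */

	/* Later, from a monitoring loop: */
	if (ioctl(fd, BIOCGSTATS, &bs) == 0)
		printf("received %u, dropped %u\n", bs.bs_recv, bs.bs_drop);
}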
*/ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_program_buffer *fcode; struct bpf_insn *filter; #ifdef BPF_JITTER bpf_jit_filter *jfunc; #endif size_t size; u_int flen; bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif filter = NULL; #ifdef BPF_JITTER jfunc = NULL; #endif /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. */ fcode = bpf_program_buffer_alloc(size, M_WAITOK); filter = (struct bpf_insn *)fcode->buffer; if (copyin(fp->bf_insns, filter, size) != 0 || !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER if (cmd != BIOCSETWF) { /* * Filter is copied inside fcode and is * perfectly valid. */ jfunc = bpf_jitter(filter, flen); } #endif } track_event = false; fcode = NULL; BPF_LOCK(); BPFD_LOCK(d); /* Set up new filter. */ if (cmd == BIOCSETWF) { if (d->bd_wfilter != NULL) { fcode = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = NULL; #endif } d->bd_wfilter = filter; } else { if (d->bd_rfilter != NULL) { fcode = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = d->bd_bfilter; #endif } d->bd_rfilter = filter; #ifdef BPF_JITTER d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { /* * Filter can be set several times without * specifying interface. In this case just mark d * as reader. */ d->bd_writer = 0; if (d->bd_bif != NULL) { /* * Remove descriptor from writers-only list * and add it to active readers list. */ CK_LIST_REMOVE(d, bd_next); CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, d, bd_next); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); track_event = true; } } } BPFD_UNLOCK(d); if (fcode != NULL) NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx); if (track_event) EVENTHANDLER_INVOKE(bpf_track, d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * At this point, we expect the buffer is already allocated. If not, * return an error. 
*/ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); else { BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); } return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0) return (1); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &bpfread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &bpfwrite_filtops; break; default: return (1); } /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; /* * Ignore the hold buffer if it is being copied to user space. 
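bpfpoll() and bpfkqfilter() expose the same readiness test as bpf_ready(): data in the hold buffer, or store-buffer data when immediate mode is on or the read timeout has fired. A hedged kqueue sketch that waits for readable data on an already-configured descriptor (EVFILT_WRITE is also accepted and reports the interface MTU in kn_data):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

static int
wait_readable(int fd)
{
	struct kevent kev, ev;
	int kq, n;

	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	n = kevent(kq, &kev, 1, &ev, 1, NULL);	/* blocks until bpf_ready() */
	if (n == 1)
		printf("%jd bytes buffered\n", (intmax_t)ev.data);
	return (n);
}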
*/ if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } static int filt_bpfwrite(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { kn->kn_data = 0; return (0); } else { kn->kn_data = d->bd_bif->bif_ifp->if_mtu; return (1); } } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct timespec ts; struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); timespec2bintime(&ts, bt); return (BPF_TSTAMP_EXTERN); } tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* * NB: We dont call BPF_CHECK_DIRECTION() here since there * is no way for the caller to indiciate to us whether this * packet is inbound or outbound. In the bpf_mtap() routines, * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { if (bpf_peers_present(ifp->if_bpf)) bpf_tap(ifp->if_bpf, pkt, pktlen); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. 
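Drivers normally reach these entry points through the bpf_*_if() wrappers, which return cheaply when no peers are attached. A schematic, non-runnable sketch of where such a call conventionally sits in a hypothetical driver's transmit path (mydrv_transmit and mydrv_enqueue_to_hw are placeholders, not real KPIs):

/* Sketch only: hypothetical driver transmit routine. */
static int
mydrv_transmit(if_t ifp, struct mbuf *m)
{
	/*
	 * Give listeners a copy before the frame is handed to hardware;
	 * bpf_mtap_if() is a no-op unless bpf_peers_present() is true.
	 */
	bpf_mtap_if(ifp, m);

	return (mydrv_enqueue_to_hw(ifp, m));	/* placeholder for the real path */
}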
*/ if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap_if(if_t ifp, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap(ifp->if_bpf, m); } } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap2(ifp->if_bpf, data, dlen, m); } } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2, boottimebin; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; getboottimebin(&boottimebin); bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case 
BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { static char zeroes[BPF_ALIGNMENT]; struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, pad, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { /* Descriptor was detached in concurrent thread */ counter_u64_add(d->bd_dcount, 1); return; } /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); counter_u64_add(d->bd_dcount, 1); return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else { if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; } pad = curlen - d->bd_slen; KASSERT(pad >= 0 && pad <= sizeof(zeroes), ("%s: invalid pad byte count %d", __func__, pad)); if (pad > 0) { /* Zero pad bytes. 
*/ bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes, pad); } } caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpfd_free(epoch_context_t ctx) { struct bpf_d *d; struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { p = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = d->bd_bfilter; #endif bpf_program_buffer_free(&p->epoch_ctx); } if (d->bd_wfilter != NULL) { p = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = NULL; #endif bpf_program_buffer_free(&p->epoch_ctx); } mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); counter_u64_free(d->bd_wcount); counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); free(d, M_BPF); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). 
*/ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); CK_LIST_INIT(&bp->bif_dlist); CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; refcount_init(&bp->bif_refcnt, 1); *driverp = bp; /* * Reference ifnet pointer, so it won't freed until * we release it. */ if_ref(ifp); BPF_LOCK(); CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } #ifdef VIMAGE /* * When moving interfaces between vnet instances we need a way to * query the dlt and hdrlen before detach so we can re-attch the if_bpf * after the vmove. We unfortunately have no device driver infrastructure * to query the interface for these values after creation/attach, thus * add this as a workaround. */ int bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) { if (bp == NULL) return (ENXIO); if (bif_dlt == NULL && bif_hdrlen == NULL) return (0); if (bif_dlt != NULL) *bif_dlt = bp->bif_dlt; if (bif_hdrlen != NULL) *bif_hdrlen = bp->bif_hdrlen; return (0); } #endif /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = __DECONST(struct bpf_if *, &dead_bpf_if); CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Detach common descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d, true); } /* Detach writer-only descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d, true); } bpfif_rele(bp); } BPF_UNLOCK(); } bool bpf_peers_present_if(struct ifnet *ifp) { return (bpf_peers_present(ifp->if_bpf)); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n1 = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; lst[n++] = bp->bif_dlt; } error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); MPASS(d->bd_bif != NULL); /* * It is safe to check bd_bif without BPFD_LOCK, it can not be * changed while we hold global lock. 
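bpf_getdltlist() implements the usual two-call protocol: a NULL bfl_list returns only the count, and a second call with a large enough array fills it (ENOMEM if the array is too small). A hedged userland sketch that enumerates the DLTs and then switches to one of them with BIOCSDLT (fd is assumed to be bound with BIOCSETIF; error handling trimmed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdio.h>
#include <stdlib.h>

static void
pick_dlt(int fd)
{
	struct bpf_dltlist dl = { .bfl_len = 0, .bfl_list = NULL };
	u_int i;

	ioctl(fd, BIOCGDLTLIST, &dl);			/* first call: count only  */
	dl.bfl_list = calloc(dl.bfl_len, sizeof(u_int));
	ioctl(fd, BIOCGDLTLIST, &dl);			/* second call: fill list  */

	for (i = 0; i < dl.bfl_len; i++)
		printf("supported DLT %u\n", dl.bfl_list[i]);

	if (dl.bfl_len > 0)
		ioctl(fd, BIOCSDLT, &dl.bfl_list[0]);	/* switch to the first one */
	free(dl.bfl_list);
}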
*/ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return (EINVAL); opromisc = d->bd_promisc; bpf_attachd(d, bp); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", __func__, error); else d->bd_promisc = 1; } return (0); } static void bpf_drvinit(void *unused) { struct cdev *dev; sx_init(&bpf_sx, "bpf global lock"); CK_LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); /* * We are protected by global lock here, interfaces and * descriptors can not be deleted while we hold it. */ CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); } } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = counter_u64_fetch(bd->bd_rcount); d->bd_dcount = counter_u64_fetch(bd->bd_dcount); d->bd_fcount = counter_u64_fetch(bd->bd_fcount); d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = counter_u64_fetch(bd->bd_wcount); d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount); d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount); d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy); d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. 
*/ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap_if(if_t ifp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = __DECONST(struct bpf_if *, &dead_bpf_if); } void bpfdetach(struct ifnet *ifp) { } bool bpf_peers_present_if(struct ifnet *ifp) { return (false); } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return (-1); /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return (0); /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ #ifdef DDB static void bpf_show_bpf_if(struct bpf_if *bpf_if) { if (bpf_if == NULL) return; db_printf("%p:\n", bpf_if); #define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); #define BPF_DB_PRINTF_RAW(f, e) db_printf(" %s = " f "\n", #e, e); /* bif_ext.bif_next */ /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); /* bif_wlist */ BPF_DB_PRINTF("%p", bif_ifp); BPF_DB_PRINTF("%p", bif_bpf); BPF_DB_PRINTF_RAW("%u", refcount_load(&bpf_if->bif_refcnt)); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) { if (!have_addr) { db_printf("usage: show bpf_if \n"); return; } bpf_show_bpf_if((struct bpf_if *)addr); } #endif diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 0dee2260973d..a0275a7471e5 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,2071 +1,2071 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. 
*/ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_UNUSED1 0x0008 #define TUN_UNUSED2 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ int tun_vhdrlen; /* virtio-net header length */ struct lro_ctrl tun_lro; /* for TCP LRO */ bool tun_lro_ready; /* TCP LRO initialized */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * Interface capabilities of a tap device that supports the virtio-net * header. */ #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ | IFCAP_VLAN_HWCSUM \ | IFCAP_TSO | IFCAP_LRO \ | IFCAP_VLAN_HWTSO) #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. 
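 *
 * Illustrative only: membership of the global tunhead list is manipulated
 * under tunmtx, exactly as tun_create_device() and tun_clone_destroy() do
 * further down:
 *
 *	mtx_lock(&tunmtx);
 *	TAILQ_REMOVE(&tunhead, tp, tun_list);
 *	mtx_unlock(&tunmtx);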
*/ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user devfs cloning */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Enable legacy devfs interface creation for all users"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev); static void tundtor(void *data); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, struct ifnet **); static int tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); -static struct filterops tun_read_filterops = { +static const struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; -static struct filterops tun_write_filterops = { +static const struct 
filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_f *clone_match_fn; ifc_create_f *clone_create_fn; ifc_destroy_f *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. 
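 *
 * To illustrate the resulting mapping (examples only): a bare driver name
 * such as "tap" hits the strcmp() case below and yields unit -1 ("pick any
 * unit") plus that driver's ident_flags, while a unit-qualified name such
 * as "tap3" is parsed by dev_stdclone() and yields unit 3 with the same
 * flags; any other name falls through and the function returns ENXIO.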
*/ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. 
*/ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ dev = NULL; i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); /* No preexisting struct cdev *, create one */ if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); if (i == 0) { dev_ref(dev); tuncreate(dev); struct tuntap_softc *tp = dev->si_drv1; *ifpp = tp->tun_ifp; } return (i); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } i = tun_create_device(drv, u, cred, dev, name); } if (i == 0) { dev_ref(*dev); if_clone_create(name, namelen, NULL); } out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. 
*/ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; struct if_clone_addreq req = { .match_f = drv->clone_match_fn, .create_f = drv->clone_create_fn, .destroy_f = drv->clone_destroy_fn, }; drvc->cloner = ifc_attach_cloner(drv->cdevsw.d_name, &req); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; /* We'll only ever have these two, so no need for a macro. 
*/ static moduledata_t tun_mod = { "if_tun", NULL, 0 }; static moduledata_t tap_mod = { "if_tap", NULL, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tun, 1); DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tap, 1); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name) { struct make_dev_args args; struct tuntap_softc *tp; int error; tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&tp->tun_cv, "tun_condvar"); tp->tun_flags = drv->ident_flags; tp->tun_drv = drv; make_dev_args_init(&args); if (cr != NULL) args.mda_flags = MAKEDEV_REF | MAKEDEV_CHECKNAME; args.mda_devsw = &drv->cdevsw; args.mda_cr = cr; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0600; args.mda_unit = unit; args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { free(tp, M_TUN); return (error); } KASSERT((*dev)->si_drv1 != NULL, ("Failed to set si_drv1 at %s creation", name)); tp->tun_dev = *dev; knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); return (0); } static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ static int tap_transmit(struct ifnet *ifp, struct mbuf *m) { int error; BPF_MTAP(ifp, m); IFQ_HANDOFF(ifp, m, error); return (error); } /* XXX: should return an error code so it can fail. 
*/ static void tuncreate(struct cdev *dev) { struct tuntap_driver *drv; struct tuntap_softc *tp; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); drv = tp->tun_drv; iflags = IFF_MULTICAST; if ((tp->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = tp->tun_ifp = if_alloc(type); ifp->if_softc = tp; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_MEXTPG; if ((tp->tun_flags & TUN_L2) != 0) ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO; ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_MEXTPG; if ((tp->tun_flags & TUN_L2) != 0) { ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ifp->if_transmit = tap_transmit; ifp->if_qflush = if_qflush; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } TUN_LOCK(tp); tp->tun_flags |= TUN_INITED; TUN_UNLOCK(tp); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. 
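 *
 * For example (interface name purely hypothetical): renaming tun0 to "vpn0"
 * leaves the original /dev/tun0 node untouched and, on success, adds a
 * /dev/vpn0 alias pointing at it; a subsequent rename destroys that alias
 * first, as handled above.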
*/ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_softc *tp; int error __diagused, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); TUN_LOCK(tp); if ((tp->tun_flags & TUN_INITED) == 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (ENXIO); } if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); /* * This can fail with either ENOENT or EBUSY. This is in the middle of * d_open, so ENOENT should not be possible. EBUSY is possible, but * the only cdevpriv dtor being set will be tundtor and the softc being * passed is constant for a given cdev. We ignore the possible error * because of this as either "unlikely" or "not actually a problem." */ (void)devfs_set_cdevpriv(tp, tundtor); CURVNET_RESTORE(); return (0); } /* * tundtor - tear down the device - mark i/f down & delete * routing info */ static void tundtor(void *data) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = data; p = curproc; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, tp->tun_dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; #if defined(INET) || defined(INET6) if (l2tun && tp->tun_lro_ready) { TUNDEBUG (ifp, "LRO disabled\n"); tcp_lro_free(&tp->tun_lro); tp->tun_lro_ready = false; } #endif if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. 
*/ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; tun_vnethdr_set(ifp, 0); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); TUN_UNLOCK(tp); } else { #if defined(INET) || defined(INET6) if (tcp_lro_init(&tp->tun_lro) == 0) { TUNDEBUG(ifp, "LRO enabled\n"); tp->tun_lro.ifp = ifp; tp->tun_lro_ready = true; } else { TUNDEBUG(ifp, "Could not enable LRO\n"); tp->tun_lro_ready = false; } #endif ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * To be called under TUN_LOCK. Update ifp->if_hwassist according to the * current value of ifp->if_capenable. */ static void tun_caps_changed(struct ifnet *ifp) { uint64_t hwassist = 0; TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc); if (ifp->if_capenable & IFCAP_TXCSUM) hwassist |= CSUM_TCP | CSUM_UDP; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; if (ifp->if_capenable & IFCAP_TSO4) hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) hwassist |= CSUM_IP6_TSO; ifp->if_hwassist = hwassist; } /* * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust * if_capabilities and if_capenable as needed. */ static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen) { struct tuntap_softc *tp = ifp->if_softc; TUN_LOCK_ASSERT(tp); if (tp->tun_vhdrlen == vhdrlen) return; /* * Update if_capabilities to reflect the * functionalities offered by the virtio-net * header. */ if (vhdrlen != 0) ifp->if_capabilities |= TAP_VNET_HDR_CAPS; else ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS; /* * Disable any capabilities that we don't * support anymore. */ ifp->if_capenable &= ifp->if_capabilities; tun_caps_changed(ifp); tp->tun_vhdrlen = vhdrlen; TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n", vhdrlen, ifp->if_capabilities); } /* * Process an ioctl request. 
*/ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX | IFM_1000_T; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; case SIOCSIFCAP: TUN_LOCK(tp); ifp->if_capenable = ifr->ifr_reqcap; tun_caps_changed(ifp); TUN_UNLOCK(tp); VLAN_CAPABILITIES(ifp); break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); BPF_MTAP2(ifp, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. 
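 *
 * Userspace sketch (illustrative, not part of this driver; the device path
 * is an assumption and error checking is omitted) exercising two of the
 * ioctls handled below: TUNSIFHEAD requests a 4-byte address-family prefix
 * on every packet, TUNGIFNAME recovers the backing interface name.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/if_tun.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/tun0", O_RDWR);
 *	int one = 1;
 *	struct ifreq ifr;
 *
 *	(void)ioctl(fd, TUNSIFHEAD, &one);
 *	(void)ioctl(fd, TUNGIFNAME, &ifr);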
*/ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct tuninfo *tunp; int error, iflags, ival; bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); ifp->if_flags = iflags | (ifp->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case TAPSVNETHDR: ival = *(int *)data; if (ival != 0 && ival != sizeof(struct virtio_net_hdr) && ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) { return (EINVAL); } TUN_LOCK(tp); tun_vnethdr_set(ifp, ival); TUN_UNLOCK(tp); return (0); case TAPGVNETHDR: TUN_LOCK(tp); *(int *)data = tp->tun_vhdrlen; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. 
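 *
 * Illustrative consumer fragment (not part of this file; buffer size is an
 * assumption, headers and error handling omitted).  With TUNSIFHEAD enabled,
 * the first four bytes of each packet are the address family in network byte
 * order, as written by tunoutput():
 *
 *	char buf[2048];
 *	uint32_t af;
 *	ssize_t n;
 *
 *	n = read(fd, buf, sizeof(buf));
 *	if (n >= (ssize_t)sizeof(af)) {
 *		memcpy(&af, buf, sizeof(af));
 *		af = ntohl(af);
 *	}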
*/ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; size_t len; int error = 0; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); len = min(tp->tun_vhdrlen, uio->uio_resid); if (len > 0) { struct virtio_net_hdr_mrg_rxbuf vhdr; bzero(&vhdr, sizeof(vhdr)); if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) { m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr); } TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); error = uiomove(&vhdr, len, uio); } if (error == 0) error = m_mbuftouio(uio, m, 0); m_freem(m); return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m, struct virtio_net_hdr_mrg_rxbuf *vhdr) { struct epoch_tracker et; struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if ((ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } if (vhdr != NULL) { if (virtio_net_rx_csum(m, &vhdr->hdr)) { m_freem(m); return (0); } } else { switch (ntohs(eh->ether_type)) { #ifdef INET case ETHERTYPE_IP: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif } } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); #if defined(INET) || defined(INET6) if (tp->tun_lro_ready && ifp->if_capenable & IFCAP_LRO && tcp_lro_rx(&tp->tun_lro, m, 0) == 0) tcp_lro_flush_all(&tp->tun_lro); else #endif (*ifp->if_input)(ifp, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? 
*/ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct virtio_net_hdr_mrg_rxbuf vhdr; struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align, vhdrlen, error; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; mru = l2tun ? TAPMRU : TUNMRU; vhdrlen = tp->tun_vhdrlen; align = 0; if (l2tun) { align = ETHER_ALIGN; mru += vhdrlen; } else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if (vhdrlen > 0) { error = uiomove(&vhdr, vhdrlen, uio); if (error != 0) return (error); TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. 
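 *
 * Illustrative consumer usage (not part of this file; headers and error
 * handling omitted): register for read readiness on the descriptor and
 * block until a packet is queued on the interface.
 *
 *	struct kevent kev, ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *	(void)kevent(kq, NULL, 0, &ev, 1, NULL);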
*/ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/security/audit/audit_pipe.c b/sys/security/audit/audit_pipe.c index 926865b499d1..c50287321cbd 100644 --- a/sys/security/audit/audit_pipe.c +++ b/sys/security/audit/audit_pipe.c @@ -1,1077 +1,1077 @@ /*- * Copyright (c) 2006 Robert N. M. Watson * Copyright (c) 2008-2009 Apple, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Implementation of a clonable special device providing a live stream of BSM * audit data. 
Consumers receive a "tee" of the system audit trail by * default, but may also define alternative event selections using ioctls. * This interface provides unreliable but timely access to audit events. * Consumers should be very careful to avoid introducing event cycles. */ /* * Memory types. */ static MALLOC_DEFINE(M_AUDIT_PIPE, "audit_pipe", "Audit pipes"); static MALLOC_DEFINE(M_AUDIT_PIPE_ENTRY, "audit_pipeent", "Audit pipe entries and buffers"); static MALLOC_DEFINE(M_AUDIT_PIPE_PRESELECT, "audit_pipe_presel", "Audit pipe preselection structure"); /* * Audit pipe buffer parameters. */ #define AUDIT_PIPE_QLIMIT_DEFAULT (128) #define AUDIT_PIPE_QLIMIT_MIN (1) #define AUDIT_PIPE_QLIMIT_MAX (1024) /* * Description of an entry in an audit_pipe. */ struct audit_pipe_entry { void *ape_record; u_int ape_record_len; TAILQ_ENTRY(audit_pipe_entry) ape_queue; }; /* * Audit pipes allow processes to express "interest" in the set of records * that are delivered via the pipe. They do this in a similar manner to the * mechanism for audit trail configuration, by expressing two global masks, * and optionally expressing per-auid masks. The following data structure is * the per-auid mask description. The global state is stored in the audit * pipe data structure. * * We may want to consider a more space/time-efficient data structure once * usage patterns for per-auid specifications are clear. */ struct audit_pipe_preselect { au_id_t app_auid; au_mask_t app_mask; TAILQ_ENTRY(audit_pipe_preselect) app_list; }; /* * Description of an individual audit_pipe. Consists largely of a bounded * length queue. */ #define AUDIT_PIPE_ASYNC 0x00000001 #define AUDIT_PIPE_NBIO 0x00000002 struct audit_pipe { u_int ap_flags; struct selinfo ap_selinfo; struct sigio *ap_sigio; /* * Per-pipe mutex protecting most fields in this data structure. */ struct mtx ap_mtx; /* * Per-pipe sleep lock serializing user-generated reads and flushes. * uiomove() is called to copy out the current head record's data * while the record remains in the queue, so we prevent other threads * from removing it using this lock. */ struct sx ap_sx; /* * Condition variable to signal when data has been delivered to a * pipe. */ struct cv ap_cv; /* * Various queue-reated variables: qlen and qlimit are a count of * records in the queue; qbyteslen is the number of bytes of data * across all records, and qoffset is the amount read so far of the * first record in the queue. The number of bytes available for * reading in the queue is qbyteslen - qoffset. */ u_int ap_qlen; u_int ap_qlimit; u_int ap_qbyteslen; u_int ap_qoffset; /* * Per-pipe operation statistics. */ u_int64_t ap_inserts; /* Records added. */ u_int64_t ap_reads; /* Records read. */ u_int64_t ap_drops; /* Records dropped. */ /* * Fields relating to pipe interest: global masks for unmatched * processes (attributable, non-attributable), and a list of specific * interest specifications by auid. */ int ap_preselect_mode; au_mask_t ap_preselect_flags; au_mask_t ap_preselect_naflags; TAILQ_HEAD(, audit_pipe_preselect) ap_preselect_list; /* * Current pending record list. Protected by a combination of ap_mtx * and ap_sx. Note particularly that *both* locks are required to * remove a record from the head of the queue, as an in-progress read * may sleep while copying and therefore cannot hold ap_mtx. */ TAILQ_HEAD(, audit_pipe_entry) ap_queue; /* * Global pipe list. 
*/ TAILQ_ENTRY(audit_pipe) ap_list; }; #define AUDIT_PIPE_LOCK(ap) mtx_lock(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_ASSERT(ap) mtx_assert(&(ap)->ap_mtx, MA_OWNED) #define AUDIT_PIPE_LOCK_DESTROY(ap) mtx_destroy(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_INIT(ap) mtx_init(&(ap)->ap_mtx, \ "audit_pipe_mtx", NULL, MTX_DEF) #define AUDIT_PIPE_UNLOCK(ap) mtx_unlock(&(ap)->ap_mtx) #define AUDIT_PIPE_MTX(ap) (&(ap)->ap_mtx) #define AUDIT_PIPE_SX_LOCK_DESTROY(ap) sx_destroy(&(ap)->ap_sx) #define AUDIT_PIPE_SX_LOCK_INIT(ap) sx_init(&(ap)->ap_sx, "audit_pipe_sx") #define AUDIT_PIPE_SX_XLOCK_ASSERT(ap) sx_assert(&(ap)->ap_sx, SA_XLOCKED) #define AUDIT_PIPE_SX_XLOCK_SIG(ap) sx_xlock_sig(&(ap)->ap_sx) #define AUDIT_PIPE_SX_XUNLOCK(ap) sx_xunlock(&(ap)->ap_sx) /* * Global list of audit pipes, rwlock to protect it. Individual record * queues on pipes are protected by per-pipe locks; these locks synchronize * between threads walking the list to deliver to individual pipes and add/ * remove of pipes, and are mostly acquired for read. */ static TAILQ_HEAD(, audit_pipe) audit_pipe_list; static struct rwlock audit_pipe_lock; #define AUDIT_PIPE_LIST_LOCK_INIT() rw_init(&audit_pipe_lock, \ "audit_pipe_list_lock") #define AUDIT_PIPE_LIST_LOCK_DESTROY() rw_destroy(&audit_pipe_lock) #define AUDIT_PIPE_LIST_RLOCK() rw_rlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_RUNLOCK() rw_runlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK() rw_wlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK_ASSERT() rw_assert(&audit_pipe_lock, \ RA_WLOCKED) #define AUDIT_PIPE_LIST_WUNLOCK() rw_wunlock(&audit_pipe_lock) /* * Audit pipe device. */ static struct cdev *audit_pipe_dev; #define AUDIT_PIPE_NAME "auditpipe" /* * Special device methods and definition. */ static d_open_t audit_pipe_open; static d_read_t audit_pipe_read; static d_ioctl_t audit_pipe_ioctl; static d_poll_t audit_pipe_poll; static d_kqfilter_t audit_pipe_kqfilter; static struct cdevsw audit_pipe_cdevsw = { .d_version = D_VERSION, .d_open = audit_pipe_open, .d_read = audit_pipe_read, .d_ioctl = audit_pipe_ioctl, .d_poll = audit_pipe_poll, .d_kqfilter = audit_pipe_kqfilter, .d_name = AUDIT_PIPE_NAME, }; static int audit_pipe_kqread(struct knote *note, long hint); static void audit_pipe_kqdetach(struct knote *note); -static struct filterops audit_pipe_read_filterops = { +static const struct filterops audit_pipe_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = audit_pipe_kqdetach, .f_event = audit_pipe_kqread, }; /* * Some global statistics on audit pipes. */ static int audit_pipe_count; /* Current number of pipes. */ static u_int64_t audit_pipe_ever; /* Pipes ever allocated. */ static u_int64_t audit_pipe_records; /* Records seen. */ static u_int64_t audit_pipe_drops; /* Global record drop count. */ /* * Free an audit pipe entry. */ static void audit_pipe_entry_free(struct audit_pipe_entry *ape) { free(ape->ape_record, M_AUDIT_PIPE_ENTRY); free(ape, M_AUDIT_PIPE_ENTRY); } /* * Find an audit pipe preselection specification for an auid, if any. */ static struct audit_pipe_preselect * audit_pipe_preselect_find(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); TAILQ_FOREACH(app, &ap->ap_preselect_list, app_list) { if (app->app_auid == auid) return (app); } return (NULL); } /* * Query the per-pipe mask for a specific auid. 
*/ static int audit_pipe_preselect_get(struct audit_pipe *ap, au_id_t auid, au_mask_t *maskp) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { *maskp = app->app_mask; error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); return (error); } /* * Set the per-pipe mask for a specific auid. Add a new entry if needed; * otherwise, update the current entry. */ static void audit_pipe_preselect_set(struct audit_pipe *ap, au_id_t auid, au_mask_t mask) { struct audit_pipe_preselect *app, *app_new; /* * Pessimistically assume that the auid doesn't already have a mask * set, and allocate. We will free it if it is unneeded. */ app_new = malloc(sizeof(*app_new), M_AUDIT_PIPE_PRESELECT, M_WAITOK); AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { app = app_new; app_new = NULL; app->app_auid = auid; TAILQ_INSERT_TAIL(&ap->ap_preselect_list, app, app_list); } app->app_mask = mask; AUDIT_PIPE_UNLOCK(ap); if (app_new != NULL) free(app_new, M_AUDIT_PIPE_PRESELECT); } /* * Delete a per-auid mask on an audit pipe. */ static int audit_pipe_preselect_delete(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); if (app != NULL) free(app, M_AUDIT_PIPE_PRESELECT); return (error); } /* * Delete all per-auid masks on an audit pipe. */ static void audit_pipe_preselect_flush_locked(struct audit_pipe *ap) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); while ((app = TAILQ_FIRST(&ap->ap_preselect_list)) != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); free(app, M_AUDIT_PIPE_PRESELECT); } } static void audit_pipe_preselect_flush(struct audit_pipe *ap) { AUDIT_PIPE_LOCK(ap); audit_pipe_preselect_flush_locked(ap); AUDIT_PIPE_UNLOCK(ap); } /*- * Determine whether a specific audit pipe matches a record with these * properties. Algorithm is as follows: * * - If the pipe is configured to track the default trail configuration, then * use the results of global preselection matching. * - If not, search for a specifically configured auid entry matching the * event. If an entry is found, use that. * - Otherwise, use the default flags or naflags configured for the pipe. */ static int audit_pipe_preselect_check(struct audit_pipe *ap, au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); switch (ap->ap_preselect_mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: return (trail_preselect); case AUDITPIPE_PRESELECT_MODE_LOCAL: app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { if (auid == AU_DEFAUDITID) return (au_preselect(event, class, &ap->ap_preselect_naflags, sorf)); else return (au_preselect(event, class, &ap->ap_preselect_flags, sorf)); } else return (au_preselect(event, class, &app->app_mask, sorf)); default: panic("audit_pipe_preselect_check: mode %d", ap->ap_preselect_mode); } return (0); } /* * Determine whether there exists a pipe interested in a record with specific * properties. */ int audit_pipe_preselect(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe *ap; /* Lockless read to avoid acquiring the global lock if not needed. 
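 *
 * Missing a pipe that is concurrently being attached is presumed tolerable
 * here: the worst case is that a single record is not offered to the
 * brand-new pipe, which is consistent with the unreliable-delivery
 * semantics described at the top of this file.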
*/ if (TAILQ_EMPTY(&audit_pipe_list)) return (0); AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_preselect)) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_LIST_RUNLOCK(); return (1); } AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); return (0); } /* * Append individual record to a queue -- allocate queue-local buffer, and * add to the queue. If the queue is full or we can't allocate memory, drop * the newest record. */ static void audit_pipe_append(struct audit_pipe *ap, void *record, u_int record_len) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen >= ap->ap_qlimit) { ap->ap_drops++; audit_pipe_drops++; return; } ape = malloc(sizeof(*ape), M_AUDIT_PIPE_ENTRY, M_NOWAIT | M_ZERO); if (ape == NULL) { ap->ap_drops++; audit_pipe_drops++; return; } ape->ape_record = malloc(record_len, M_AUDIT_PIPE_ENTRY, M_NOWAIT); if (ape->ape_record == NULL) { free(ape, M_AUDIT_PIPE_ENTRY); ap->ap_drops++; audit_pipe_drops++; return; } bcopy(record, ape->ape_record, record_len); ape->ape_record_len = record_len; TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue); ap->ap_inserts++; ap->ap_qlen++; ap->ap_qbyteslen += ape->ape_record_len; selwakeuppri(&ap->ap_selinfo, PSOCK); KNOTE_LOCKED(&ap->ap_selinfo.si_note, 0); if (ap->ap_flags & AUDIT_PIPE_ASYNC) pgsigio(&ap->ap_sigio, SIGIO, 0); cv_broadcast(&ap->ap_cv); } /* * audit_pipe_submit(): audit_worker submits audit records via this * interface, which arranges for them to be delivered to pipe queues. */ void audit_pipe_submit(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_select, void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_select)) audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * audit_pipe_submit_user(): the same as audit_pipe_submit(), except that * since we don't currently have selection information available, it is * delivered to the pipe unconditionally. * * XXXRW: This is a bug. The BSM check routine for submitting a user record * should parse that information and return it. */ void audit_pipe_submit_user(void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * Allocate a new audit pipe. Connects the pipe, on success, to the global * list and updates statistics. */ static struct audit_pipe * audit_pipe_alloc(void) { struct audit_pipe *ap; ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO); if (ap == NULL) return (NULL); ap->ap_qlimit = AUDIT_PIPE_QLIMIT_DEFAULT; TAILQ_INIT(&ap->ap_queue); knlist_init_mtx(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap)); AUDIT_PIPE_LOCK_INIT(ap); AUDIT_PIPE_SX_LOCK_INIT(ap); cv_init(&ap->ap_cv, "audit_pipe"); /* * Default flags, naflags, and auid-specific preselection settings to * 0. 
Initialize the mode to the global trail so that if praudit(1) * is run on /dev/auditpipe, it sees events associated with the * default trail. Pipe-aware application can clear the flag, set * custom masks, and flush the pipe as needed. */ bzero(&ap->ap_preselect_flags, sizeof(ap->ap_preselect_flags)); bzero(&ap->ap_preselect_naflags, sizeof(ap->ap_preselect_naflags)); TAILQ_INIT(&ap->ap_preselect_list); ap->ap_preselect_mode = AUDITPIPE_PRESELECT_MODE_TRAIL; /* * Add to global list and update global statistics. */ AUDIT_PIPE_LIST_WLOCK(); TAILQ_INSERT_HEAD(&audit_pipe_list, ap, ap_list); audit_pipe_count++; audit_pipe_ever++; AUDIT_PIPE_LIST_WUNLOCK(); return (ap); } /* * Flush all records currently present in an audit pipe; assume mutex is held. */ static void audit_pipe_flush(struct audit_pipe *ap) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; } ap->ap_qoffset = 0; KASSERT(ap->ap_qlen == 0, ("audit_pipe_free: ap_qbyteslen")); KASSERT(ap->ap_qbyteslen == 0, ("audit_pipe_flush: ap_qbyteslen")); } /* * Free an audit pipe; this means freeing all preselection state and all * records in the pipe. Assumes global write lock and pipe mutex are held to * prevent any new records from being inserted during the free, and that the * audit pipe is still on the global list. */ static void audit_pipe_free(struct audit_pipe *ap) { AUDIT_PIPE_LIST_WLOCK_ASSERT(); AUDIT_PIPE_LOCK_ASSERT(ap); audit_pipe_preselect_flush_locked(ap); audit_pipe_flush(ap); cv_destroy(&ap->ap_cv); AUDIT_PIPE_SX_LOCK_DESTROY(ap); AUDIT_PIPE_LOCK_DESTROY(ap); seldrain(&ap->ap_selinfo); knlist_destroy(&ap->ap_selinfo.si_note); TAILQ_REMOVE(&audit_pipe_list, ap, ap_list); free(ap, M_AUDIT_PIPE); audit_pipe_count--; } static void audit_pipe_dtor(void *arg) { struct audit_pipe *ap; ap = arg; funsetown(&ap->ap_sigio); AUDIT_PIPE_LIST_WLOCK(); AUDIT_PIPE_LOCK(ap); audit_pipe_free(ap); AUDIT_PIPE_LIST_WUNLOCK(); } /* * Audit pipe open method. Explicit privilege check isn't used as this * allows file permissions on the special device to be used to grant audit * review access. Those file permissions should be managed carefully. */ static int audit_pipe_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct audit_pipe *ap; int error; ap = audit_pipe_alloc(); if (ap == NULL) return (ENOMEM); fsetown(td->td_proc->p_pid, &ap->ap_sigio); error = devfs_set_cdevpriv(ap, audit_pipe_dtor); if (error != 0) audit_pipe_dtor(ap); return (error); } /* * Audit pipe ioctl() routine. Handle file descriptor and audit pipe layer * commands. */ static int audit_pipe_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct auditpipe_ioctl_preselect *aip; struct audit_pipe *ap; au_mask_t *maskp; int error, mode; au_id_t auid; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); /* * Audit pipe ioctls: first come standard device node ioctls, then * manipulation of pipe settings, and finally, statistics query * ioctls. 
*/ switch (cmd) { case FIONBIO: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_NBIO; else ap->ap_flags &= ~AUDIT_PIPE_NBIO; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIONREAD: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_qbyteslen - ap->ap_qoffset; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOASYNC: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_ASYNC; else ap->ap_flags &= ~AUDIT_PIPE_ASYNC; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOSETOWN: error = fsetown(*(int *)data, &ap->ap_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&ap->ap_sigio); error = 0; break; case AUDITPIPE_GET_QLEN: *(u_int *)data = ap->ap_qlen; error = 0; break; case AUDITPIPE_GET_QLIMIT: *(u_int *)data = ap->ap_qlimit; error = 0; break; case AUDITPIPE_SET_QLIMIT: /* Lockless integer write. */ if (*(u_int *)data >= AUDIT_PIPE_QLIMIT_MIN && *(u_int *)data <= AUDIT_PIPE_QLIMIT_MAX) { ap->ap_qlimit = *(u_int *)data; error = 0; } else error = EINVAL; break; case AUDITPIPE_GET_QLIMIT_MIN: *(u_int *)data = AUDIT_PIPE_QLIMIT_MIN; error = 0; break; case AUDITPIPE_GET_QLIMIT_MAX: *(u_int *)data = AUDIT_PIPE_QLIMIT_MAX; error = 0; break; case AUDITPIPE_GET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_flags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_flags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_naflags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_naflags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; error = audit_pipe_preselect_get(ap, aip->aip_auid, &aip->aip_mask); break; case AUDITPIPE_SET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; audit_pipe_preselect_set(ap, aip->aip_auid, aip->aip_mask); error = 0; break; case AUDITPIPE_DELETE_PRESELECT_AUID: auid = *(au_id_t *)data; error = audit_pipe_preselect_delete(ap, auid); break; case AUDITPIPE_FLUSH_PRESELECT_AUID: audit_pipe_preselect_flush(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_MODE: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_preselect_mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_MODE: mode = *(int *)data; switch (mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: case AUDITPIPE_PRESELECT_MODE_LOCAL: AUDIT_PIPE_LOCK(ap); ap->ap_preselect_mode = mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; default: error = EINVAL; } break; case AUDITPIPE_FLUSH: if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); audit_pipe_flush(ap); AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); error = 0; break; case AUDITPIPE_GET_MAXAUDITDATA: *(u_int *)data = MAXAUDITDATA; error = 0; break; case AUDITPIPE_GET_INSERTS: *(u_int *)data = ap->ap_inserts; error = 0; break; case AUDITPIPE_GET_READS: *(u_int *)data = ap->ap_reads; error = 0; break; case AUDITPIPE_GET_DROPS: *(u_int *)data = ap->ap_drops; error = 0; break; case AUDITPIPE_GET_TRUNCATES: *(u_int *)data = 0; error = 0; break; default: error = ENOTTY; } return (error); } /* * Audit pipe read. Read one or more partial or complete records to user * memory. 
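For reference, a minimal userspace sketch of driving the auditpipe ioctls handled above: switch the pipe from the default trail mode to local preselection, install a mask, and read whole records. The header paths and the catch-all mask value are assumptions for illustration; a real consumer would normally build the mask with getauditflagsbin(3) and hand the record bytes to a BSM parser, as praudit(1) does.

#include <sys/types.h>
#include <sys/ioctl.h>

#include <bsm/audit.h>
#include <security/audit/audit_ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
    au_mask_t mask;
    u_int maxdata;
    ssize_t len;
    char *buf;
    int fd, mode;

    fd = open("/dev/auditpipe", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/auditpipe)");

    /* Stop tracking the global trail; use pipe-local preselection. */
    mode = AUDITPIPE_PRESELECT_MODE_LOCAL;
    if (ioctl(fd, AUDITPIPE_SET_PRESELECT_MODE, &mode) < 0)
        err(1, "AUDITPIPE_SET_PRESELECT_MODE");

    /* Illustrative catch-all mask; normally built with getauditflagsbin(3). */
    mask.am_success = mask.am_failure = 0xffffffff;
    if (ioctl(fd, AUDITPIPE_SET_PRESELECT_FLAGS, &mask) < 0)
        err(1, "AUDITPIPE_SET_PRESELECT_FLAGS");

    /* Size the buffer from the largest record the pipe will deliver. */
    if (ioctl(fd, AUDITPIPE_GET_MAXAUDITDATA, &maxdata) < 0)
        err(1, "AUDITPIPE_GET_MAXAUDITDATA");
    if ((buf = malloc(maxdata)) == NULL)
        err(1, "malloc");

    /* Each read() drains BSM record bytes queued by audit_pipe_append(). */
    while ((len = read(fd, buf, maxdata)) > 0)
        printf("got %zd bytes of audit record data\n", len);
    if (len < 0)
        err(1, "read");
    return (0);
}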
*/ static int audit_pipe_read(struct cdev *dev, struct uio *uio, int flag) { struct audit_pipe_entry *ape; struct audit_pipe *ap; u_int toread; int error; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); /* * We hold an sx(9) lock over read and flush because we rely on the * stability of a record in the queue during uiomove(9). */ if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); while (TAILQ_EMPTY(&ap->ap_queue)) { if (ap->ap_flags & AUDIT_PIPE_NBIO) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (EAGAIN); } error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap)); if (error) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } } /* * Copy as many remaining bytes from the current record to userspace * as we can. Keep processing records until we run out of records in * the queue, or until the user buffer runs out of space. * * Note: we rely on the SX lock to maintain ape's stability here. */ ap->ap_reads++; while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL && uio->uio_resid > 0) { AUDIT_PIPE_LOCK_ASSERT(ap); KASSERT(ape->ape_record_len > ap->ap_qoffset, ("audit_pipe_read: record_len > qoffset (1)")); toread = MIN(ape->ape_record_len - ap->ap_qoffset, uio->uio_resid); AUDIT_PIPE_UNLOCK(ap); error = uiomove((char *)ape->ape_record + ap->ap_qoffset, toread, uio); if (error) { AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } /* * If the copy succeeded, update book-keeping, and if no * bytes remain in the current record, free it. */ AUDIT_PIPE_LOCK(ap); KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape, ("audit_pipe_read: queue out of sync after uiomove")); ap->ap_qoffset += toread; KASSERT(ape->ape_record_len >= ap->ap_qoffset, ("audit_pipe_read: record_len >= qoffset (2)")); if (ap->ap_qoffset == ape->ape_record_len) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; ap->ap_qoffset = 0; } } AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (0); } /* * Audit pipe poll. */ static int audit_pipe_poll(struct cdev *dev, int events, struct thread *td) { struct audit_pipe *ap; int error, revents; revents = 0; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); if (events & (POLLIN | POLLRDNORM)) { AUDIT_PIPE_LOCK(ap); if (TAILQ_FIRST(&ap->ap_queue) != NULL) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &ap->ap_selinfo); AUDIT_PIPE_UNLOCK(ap); } return (revents); } /* * Audit pipe kqfilter. */ static int audit_pipe_kqfilter(struct cdev *dev, struct knote *kn) { struct audit_pipe *ap; int error; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &audit_pipe_read_filterops; kn->kn_hook = ap; AUDIT_PIPE_LOCK(ap); knlist_add(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); return (0); } /* * Return true if there are records available for reading on the pipe. */ static int audit_pipe_kqread(struct knote *kn, long hint) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen != 0) { kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset; return (1); } else { kn->kn_data = 0; return (0); } } /* * Detach kqueue state from audit pipe. */ static void audit_pipe_kqdetach(struct knote *kn) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; AUDIT_PIPE_LOCK(ap); knlist_remove(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); } /* * Initialize the audit pipe system. 
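The poll/kqueue support above can be exercised from userspace along these lines: register EVFILT_READ on the pipe descriptor and drain it whenever the filter fires, using FIONBIO so the drain loop does not block. Buffer size and output are arbitrary; this is a sketch, not a complete consumer.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/ioctl.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    char buf[4096];
    ssize_t n;
    int fd, kq, on;

    fd = open("/dev/auditpipe", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/auditpipe)");
    on = 1;
    if (ioctl(fd, FIONBIO, &on) < 0)
        err(1, "FIONBIO");

    kq = kqueue();
    if (kq < 0)
        err(1, "kqueue");

    /* Registered via audit_pipe_kqfilter(); only EVFILT_READ is accepted. */
    EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0)
        err(1, "kevent register");

    for (;;) {
        /* ev.data is the pending byte count reported by audit_pipe_kqread(). */
        if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
            err(1, "kevent wait");
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            printf("drained %zd bytes (%jd pending at wakeup)\n",
                n, (intmax_t)ev.data);
        if (n < 0 && errno != EAGAIN)
            err(1, "read");
    }
}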
*/ static void audit_pipe_init(void *unused) { TAILQ_INIT(&audit_pipe_list); AUDIT_PIPE_LIST_LOCK_INIT(); audit_pipe_dev = make_dev(&audit_pipe_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "%s", AUDIT_PIPE_NAME); if (audit_pipe_dev == NULL) { AUDIT_PIPE_LIST_LOCK_DESTROY(); panic("Can't initialize audit pipe subsystem"); } } SYSINIT(audit_pipe_init, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, audit_pipe_init, NULL); diff --git a/sys/sys/file.h b/sys/sys/file.h index c1439b9bbaac..da96f3e332fc 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,510 +1,510 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)file.h 8.3 (Berkeley) 1/9/95 */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include #include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; struct nameidata; #endif /* _KERNEL */ #define DTYPE_NONE 0 /* not yet initialized */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* timerfd */ #ifdef _KERNEL struct file; struct filecaps; struct kaiocb; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF_R 0x04 /* Also update f_nextoff[UIO_READ] */ #define FOF_NEXTOFF_W 0x08 /* Also update f_nextoff[UIO_WRITE] */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); typedef int fo_add_seals_t(struct file *fp, int flags); typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); typedef int fo_fspacectl_t(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td); typedef int fo_cmp_t(struct file *fp, struct file *fp1, struct thread 
*td); typedef int fo_spare_t(struct file *fp); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; fo_fspacectl_t *fo_fspacectl; fo_cmp_t *fo_cmp; fo_spare_t *fo_spares[7]; /* Spare slots */ fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) updated with atomics and blocking on sleepq * (d) cdevpriv_mtx * none not locked */ #if __BSD_VISIBLE struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ }; struct file { volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ void *f_data; /* file descriptor specific data */ const struct fileops *f_ops; /* File operations */ struct vnode *f_vnode; /* NULL or applicable vnode */ struct ucred *f_cred; /* associated credentials. */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ /* * DTYPE_VNODE specific fields. */ union { int16_t f_seqcount[2]; /* (a) Count of seq. reads and writes. */ int f_pipegen; }; off_t f_nextoff[2]; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. 
*/ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #endif /* __BSD_VISIBLE */ #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ #if __BSD_VISIBLE struct xfile { ksize_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ int _xf_int_pad1; kvaddr_t xf_file; /* address of struct file */ short xf_type; /* descriptor type */ short _xf_short_pad1; int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ int _xf_int_pad2; off_t xf_offset; /* file offset */ kvaddr_t xf_data; /* file descriptor specific data */ kvaddr_t xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ int _xf_int_pad3; int64_t _xf_int64_pad[6]; }; #endif /* __BSD_VISIBLE */ #ifdef _KERNEL -extern struct fileops vnops; -extern struct fileops badfileops; -extern struct fileops path_fileops; -extern struct fileops socketops; +extern const struct fileops vnops; +extern const struct fileops badfileops; +extern const struct fileops path_fileops; +extern const struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); int fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_stat_t vn_statfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; fo_kqfilter_t vn_kqfilter_opath; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); int file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td); -void finit(struct file *, u_int, short, void *, struct fileops *); -void finit_vnode(struct file *, u_int, void *, struct fileops *); +void finit(struct file *, u_int, short, void *, const struct fileops *); +void finit_vnode(struct file *, u_int, void *, const struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch); int fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp); static __inline __result_use_check bool fhold(struct file *fp) { return (refcount_acquire_checked(&fp->f_count)); } #define 
fdrop(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_false(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) #define fdrop_close(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_true(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); } static __inline int fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { if (fp->f_ops->fo_mmap == NULL) return (ENODEV); return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, flags, foff, td)); } static __inline int fo_aio_queue(struct file *fp, struct kaiocb *job) { return ((*fp->f_ops->fo_aio_queue)(fp, job)); } static __inline int fo_add_seals(struct file *fp, int seals) { if (fp->f_ops->fo_add_seals == NULL) return 
(EINVAL); return ((*fp->f_ops->fo_add_seals)(fp, seals)); } static __inline int fo_get_seals(struct file *fp, int *seals) { if (fp->f_ops->fo_get_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_get_seals)(fp, seals)); } static __inline int fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { if (fp->f_ops->fo_fallocate == NULL) return (ENODEV); return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { if (fp->f_ops->fo_fspacectl == NULL) return (ENODEV); return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags, active_cred, td)); } static __inline int fo_cmp(struct file *fp1, struct file *fp2, struct thread *td) { if (fp1->f_ops->fo_cmp == NULL) return (ENODEV); return ((*fp1->f_ops->fo_cmp)(fp1, fp2, td)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 36973b941e61..d444f02d3c89 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -1,359 +1,359 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mman.h 8.2 (Berkeley) 1/9/95 */ #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include #include #if __BSD_VISIBLE /* * Inheritance for minherit() */ #define INHERIT_SHARE 0 #define INHERIT_COPY 1 #define INHERIT_NONE 2 #define INHERIT_ZERO 3 #endif /* * Protections are chosen from these bits, or-ed together */ #define PROT_NONE 0x00 /* no permissions */ #define PROT_READ 0x01 /* pages can be read */ #define PROT_WRITE 0x02 /* pages can be written */ #define PROT_EXEC 0x04 /* pages can be executed */ #if __BSD_VISIBLE #define _PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) #define PROT_EXTRACT(prot) ((prot) & _PROT_ALL) #define _PROT_MAX_SHIFT 16 #define PROT_MAX(prot) ((prot) << _PROT_MAX_SHIFT) #define PROT_MAX_EXTRACT(prot) (((prot) >> _PROT_MAX_SHIFT) & _PROT_ALL) #endif /* * Flags contain sharing type and options. 
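To show what the constification in sys/file.h above means for a consumer, here is a rough in-kernel sketch of an operations vector declared under the new prototypes. The example_* handlers, the M_TEMP allocation, and the choice of defaults are illustrative assumptions; only a subset of the handlers is filled in, with the invfo_* stubs declared above covering the rest.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/user.h>

static int
example_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
    bzero(sb, sizeof(*sb));
    return (0);
}

static int
example_close(struct file *fp, struct thread *td)
{
    /* Hypothetical per-open state hung off f_data by the open path. */
    free(fp->f_data, M_TEMP);
    fp->f_data = NULL;
    return (0);
}

static int
example_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
    kif->kf_type = KF_TYPE_UNKNOWN;
    return (0);
}

/*
 * With the const-qualified declarations the whole vector can live in
 * read-only data; unsupported operations fall back to the invfo_* stubs.
 */
static const struct fileops exampleops = {
    .fo_read = invfo_rdwr,
    .fo_write = invfo_rdwr,
    .fo_truncate = invfo_truncate,
    .fo_ioctl = invfo_ioctl,
    .fo_poll = invfo_poll,
    .fo_kqfilter = invfo_kqfilter,
    .fo_stat = example_stat,
    .fo_close = example_close,
    .fo_chmod = invfo_chmod,
    .fo_chown = invfo_chown,
    .fo_sendfile = invfo_sendfile,
    .fo_fill_kinfo = example_fill_kinfo,
    .fo_flags = DFLAG_PASSABLE,
};

An open routine would then attach the vector with finit(), which after this change accepts the const-qualified struct fileops pointer.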
* Sharing types; choose one. */ #define MAP_SHARED 0x0001 /* share changes */ #define MAP_PRIVATE 0x0002 /* changes are private */ #if __BSD_VISIBLE #define MAP_COPY MAP_PRIVATE /* Obsolete */ #endif /* * Other flags */ #define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ #if __BSD_VISIBLE #define MAP_RESERVED0020 0x0020 /* previously unimplemented MAP_RENAME */ #define MAP_RESERVED0040 0x0040 /* previously unimplemented MAP_NORESERVE */ #define MAP_RESERVED0080 0x0080 /* previously misimplemented MAP_INHERIT */ #define MAP_RESERVED0100 0x0100 /* previously unimplemented MAP_NOEXTEND */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ #define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ #ifndef _KERNEL #define MAP_ANONYMOUS MAP_ANON /* For compatibility. */ #endif /* !_KERNEL */ /* * Extended flags */ #define MAP_GUARD 0x00002000 /* reserve but don't map address range */ #define MAP_EXCL 0x00004000 /* for MAP_FIXED, fail if address is used */ #define MAP_NOCORE 0x00020000 /* dont include these pages in a coredump */ #define MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */ #define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */ /* * Request specific alignment (n == log2 of the desired alignment). * * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does * not enforce a specific alignment. */ #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) #define MAP_ALIGNMENT_SHIFT 24 #define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff) #define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */ /* * Flags provided to shm_rename */ /* Don't overwrite dest, if it exists */ #define SHM_RENAME_NOREPLACE (1 << 0) /* Atomically swap src and dest */ #define SHM_RENAME_EXCHANGE (1 << 1) #endif /* __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 199309 /* * Process memory locking */ #define MCL_CURRENT 0x0001 /* Lock only current memory */ #define MCL_FUTURE 0x0002 /* Lock all future memory as well */ #endif /* * Error return from mmap() */ #define MAP_FAILED ((void *)-1) /* * msync() flags */ #define MS_SYNC 0x0000 /* msync synchronously */ #define MS_ASYNC 0x0001 /* return immediately */ #define MS_INVALIDATE 0x0002 /* invalidate all cached data */ /* * Advice to madvise */ #define _MADV_NORMAL 0 /* no further special treatment */ #define _MADV_RANDOM 1 /* expect random page references */ #define _MADV_SEQUENTIAL 2 /* expect sequential page references */ #define _MADV_WILLNEED 3 /* will need these pages */ #define _MADV_DONTNEED 4 /* dont need these pages */ #if __BSD_VISIBLE #define MADV_NORMAL _MADV_NORMAL #define MADV_RANDOM _MADV_RANDOM #define MADV_SEQUENTIAL _MADV_SEQUENTIAL #define MADV_WILLNEED _MADV_WILLNEED #define MADV_DONTNEED _MADV_DONTNEED #define MADV_FREE 5 /* dont need these pages, and junk contents */ #define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ #define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ #define MADV_NOCORE 8 /* do not include these pages in a core file */ #define MADV_CORE 9 /* revert to including pages in a core file */ #define MADV_PROTECT 10 /* protect process from pageout kill */ /* * Return bits from mincore */ #define MINCORE_INCORE 0x1 /* Page is incore */ #define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ #define MINCORE_MODIFIED 0x4 /* 
Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x60 /* Page is a "super" page */ #define MINCORE_PSIND(i) (((i) << 5) & MINCORE_SUPER) /* Page size */ /* * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) /* * shmflags for shm_open2() */ #define SHM_ALLOW_SEALING 0x00000001 #define SHM_GROW_ON_WRITE 0x00000002 #define SHM_LARGEPAGE 0x00000004 #define SHM_LARGEPAGE_ALLOC_DEFAULT 0 #define SHM_LARGEPAGE_ALLOC_NOWAIT 1 #define SHM_LARGEPAGE_ALLOC_HARD 2 struct shm_largepage_conf { int psind; int alloc_policy; int pad[10]; }; /* * Flags for memfd_create(). */ #define MFD_CLOEXEC 0x00000001 #define MFD_ALLOW_SEALING 0x00000002 #define MFD_HUGETLB 0x00000004 #define MFD_HUGE_MASK 0xFC000000 #define MFD_HUGE_SHIFT 26 #define MFD_HUGE_64KB (16 << MFD_HUGE_SHIFT) #define MFD_HUGE_512KB (19 << MFD_HUGE_SHIFT) #define MFD_HUGE_1MB (20 << MFD_HUGE_SHIFT) #define MFD_HUGE_2MB (21 << MFD_HUGE_SHIFT) #define MFD_HUGE_8MB (23 << MFD_HUGE_SHIFT) #define MFD_HUGE_16MB (24 << MFD_HUGE_SHIFT) #define MFD_HUGE_32MB (25 << MFD_HUGE_SHIFT) #define MFD_HUGE_256MB (28 << MFD_HUGE_SHIFT) #define MFD_HUGE_512MB (29 << MFD_HUGE_SHIFT) #define MFD_HUGE_1GB (30 << MFD_HUGE_SHIFT) #define MFD_HUGE_2GB (31 << MFD_HUGE_SHIFT) #define MFD_HUGE_16GB (34 << MFD_HUGE_SHIFT) #endif /* __BSD_VISIBLE */ /* * XXX missing POSIX_TYPED_MEM_* macros and * posix_typed_mem_info structure. */ #if __POSIX_VISIBLE >= 200112 #define POSIX_MADV_NORMAL _MADV_NORMAL #define POSIX_MADV_RANDOM _MADV_RANDOM #define POSIX_MADV_SEQUENTIAL _MADV_SEQUENTIAL #define POSIX_MADV_WILLNEED _MADV_WILLNEED #define POSIX_MADV_DONTNEED _MADV_DONTNEED #endif #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #if defined(_KERNEL) || defined(_WANT_FILE) #include #include #include #include #include struct file; struct shmfd { vm_ooffset_t shm_size; vm_object_t shm_object; vm_pindex_t shm_pages; /* allocated pages */ int shm_refs; uid_t shm_uid; gid_t shm_gid; mode_t shm_mode; int shm_kmappings; /* * Values maintained solely to make this a better-behaved file * descriptor for fstat() to run on. 
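As a small userspace illustration of the PROT_MAX() and MAP_ALIGNED_SUPER flags defined above: the maximum protection of a mapping is fixed at mmap(2) time, and a later mprotect(2) may only move within that ceiling. Sizes and the fill pattern here are arbitrary.

#include <sys/mman.h>

#include <err.h>
#include <string.h>

int
main(void)
{
    size_t len = 2 * 1024 * 1024;
    void *p;

    /* Superpage-aligned anonymous memory, currently RW, at most RWX. */
    p = mmap(NULL, len,
        PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE | PROT_EXEC),
        MAP_ANON | MAP_PRIVATE | MAP_ALIGNED_SUPER, -1, 0);
    if (p == MAP_FAILED)
        err(1, "mmap");

    memset(p, 0xc3, len);        /* arbitrary payload */

    /* Permitted: PROT_EXEC stays within the PROT_MAX ceiling set above. */
    if (mprotect(p, len, PROT_READ | PROT_EXEC) != 0)
        err(1, "mprotect");

    return (munmap(p, len));
}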
*/ struct timespec shm_atime; struct timespec shm_mtime; struct timespec shm_ctime; struct timespec shm_birthtime; ino_t shm_ino; struct label *shm_label; /* MAC label */ const char *shm_path; struct rangelock shm_rl; struct mtx shm_mtx; int shm_flags; int shm_seals; /* largepage config */ int shm_lp_psind; int shm_lp_alloc_policy; }; #endif #ifdef _KERNEL struct prison; int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); int shm_dotruncate(struct shmfd *shmfd, off_t length); bool shm_largepage(struct shmfd *shmfd); void shm_remove_prison(struct prison *pr); int shm_get_path(struct vm_object *obj, char *path, size_t sz); -extern struct fileops shm_ops; +extern const struct fileops shm_ops; #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #else /* !_KERNEL */ __BEGIN_DECLS /* * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(), * posix_typed_mem_open(). */ #if __BSD_VISIBLE int getpagesizes(size_t *, int); int madvise(void *, size_t, int); int mincore(const void *, size_t, char *); int minherit(void *, size_t, int); #endif int mlock(const void *, size_t); #ifndef _MMAP_DECLARED #define _MMAP_DECLARED void * mmap(void *, size_t, int, int, int, off_t); #endif int mprotect(void *, size_t, int); int msync(void *, size_t, int); int munlock(const void *, size_t); int munmap(void *, size_t); #if __POSIX_VISIBLE >= 200112 int posix_madvise(void *, size_t, int); #endif #if __POSIX_VISIBLE >= 199309 int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif #if __BSD_VISIBLE int memfd_create(const char *, unsigned int); int shm_create_largepage(const char *, int, int, int, mode_t); int shm_rename(const char *, const char *, int); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_MMAN_H_ */ diff --git a/sys/sys/pipe.h b/sys/sys/pipe.h index 0f35316432eb..a83ea800c677 100644 --- a/sys/sys/pipe.h +++ b/sys/sys/pipe.h @@ -1,152 +1,152 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. */ #ifndef _SYS_PIPE_H_ #define _SYS_PIPE_H_ /* * Pipe buffer size, keep moderate in value, pipes take kva space. */ #ifndef PIPE_SIZE #define PIPE_SIZE 16384 #endif #ifndef BIG_PIPE_SIZE #define BIG_PIPE_SIZE (64*1024) #endif #ifndef SMALL_PIPE_SIZE #define SMALL_PIPE_SIZE PAGE_SIZE #endif /* * PIPE_MINDIRECT MUST be smaller than PIPE_SIZE and MUST be bigger * than PIPE_BUF. 
*/ #ifndef PIPE_MINDIRECT #define PIPE_MINDIRECT 8192 #endif #define PIPENPAGES (BIG_PIPE_SIZE / PAGE_SIZE + 1) #ifdef _KERNEL /* * See sys_pipe.c for info on what these limits mean. */ extern long maxpipekva; -extern struct fileops pipeops; +extern const struct fileops pipeops; #endif /* * Pipe buffer information. * Separate in, out, cnt are used to simplify calculations. * Buffered write is active when the buffer.cnt field is set. */ struct pipebuf { u_int cnt; /* number of chars currently in buffer */ u_int in; /* in pointer */ u_int out; /* out pointer */ u_int size; /* size of buffer */ caddr_t buffer; /* kva of buffer */ }; /* * Information to support direct transfers between processes for pipes. */ struct pipemapping { u_int cnt; /* number of chars in buffer */ u_int pos; /* current position of transfer */ int npages; /* number of pages */ vm_page_t ms[PIPENPAGES]; /* pages in source process */ }; /* * Bits in pipe_state. */ #define PIPE_ASYNC 0x004 /* Async? I/O. */ #define PIPE_WANTR 0x008 /* Reader wants some characters. */ #define PIPE_WANTW 0x010 /* Writer wants space to put characters. */ #define PIPE_WANT 0x020 /* Pipe is wanted to be run-down. */ #define PIPE_SEL 0x040 /* Pipe has a select active. */ #define PIPE_EOF 0x080 /* Pipe is in EOF condition. */ #define PIPE_LOCKFL 0x100 /* Process has exclusive access to pointers/data. */ #define PIPE_LWANT 0x200 /* Process wants exclusive access to pointers/data. */ #define PIPE_DIRECTW 0x400 /* Pipe direct write active. */ #define PIPE_DIRECTOK 0x800 /* Direct mode ok. */ /* * Bits in pipe_type. */ #define PIPE_TYPE_NAMED 0x001 /* Is a named pipe. */ /* * Per-pipe data structure. * Two of these are linked together to produce bi-directional pipes. */ struct pipe { struct pipebuf pipe_buffer; /* data storage */ struct pipemapping pipe_pages; /* wired pages for direct I/O */ struct selinfo pipe_sel; /* for compat with select */ struct timespec pipe_atime; /* time of last access */ struct timespec pipe_mtime; /* time of last modify */ struct timespec pipe_ctime; /* time of status change */ struct sigio *pipe_sigio; /* information for async I/O */ struct pipe *pipe_peer; /* link with other direction */ struct pipepair *pipe_pair; /* container structure pointer */ u_short pipe_state; /* pipe status info */ u_char pipe_type; /* pipe type info */ u_char pipe_present; /* still present? */ int pipe_waiters; /* pipelock waiters */ int pipe_busy; /* busy flag, mostly to handle rundown sanely */ int pipe_wgen; /* writer generation for named pipe */ ino_t pipe_ino; /* fake inode for stat(2) */ }; /* * Values for the pipe_present. */ #define PIPE_ACTIVE 1 #define PIPE_CLOSING 2 #define PIPE_FINALIZED 3 /* * Container structure to hold the two pipe endpoints, mutex, and label * pointer. 
*/ struct pipepair { struct pipe pp_rpipe; struct pipe pp_wpipe; struct mtx pp_mtx; struct label *pp_label; struct ucred *pp_owner; /* to dec pipe usage count */ }; #define PIPE_MTX(pipe) (&(pipe)->pipe_pair->pp_mtx) #define PIPE_LOCK(pipe) mtx_lock(PIPE_MTX(pipe)) #define PIPE_UNLOCK(pipe) mtx_unlock(PIPE_MTX(pipe)) #define PIPE_LOCK_ASSERT(pipe, type) mtx_assert(PIPE_MTX(pipe), (type)) #ifdef _KERNEL void pipe_dtor(struct pipe *dpipe); int pipe_named_ctor(struct pipe **ppipe, struct thread *td); void pipeselwakeup(struct pipe *cpipe); #endif #endif /* !_SYS_PIPE_H_ */ diff --git a/sys/x86/acpica/acpi_apm.c b/sys/x86/acpica/acpi_apm.c index 4e880c3e5411..e7e4b0f1a546 100644 --- a/sys/x86/acpica/acpi_apm.c +++ b/sys/x86/acpica/acpi_apm.c @@ -1,452 +1,452 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * APM driver emulation */ #define APM_UNKNOWN 0xff static int apm_active; static MALLOC_DEFINE(M_APMDEV, "apmdev", "APM device emulation"); static d_open_t apmopen; static d_write_t apmwrite; static d_ioctl_t apmioctl; static d_poll_t apmpoll; static d_kqfilter_t apmkqfilter; static void apmreadfiltdetach(struct knote *kn); static int apmreadfilt(struct knote *kn, long hint); -static struct filterops apm_readfiltops = { +static const struct filterops apm_readfiltops = { .f_isfd = 1, .f_detach = apmreadfiltdetach, .f_event = apmreadfilt, }; static struct cdevsw apm_cdevsw = { .d_version = D_VERSION, .d_open = apmopen, .d_write = apmwrite, .d_ioctl = apmioctl, .d_poll = apmpoll, .d_name = "apm", .d_kqfilter = apmkqfilter }; static int acpi_capm_convert_battstate(struct acpi_battinfo *battp) { int state; state = APM_UNKNOWN; if (battp->state & ACPI_BATT_STAT_DISCHARG) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } if (battp->state & ACPI_BATT_STAT_CRITICAL) state = 2; /* critical */ if (battp->state & ACPI_BATT_STAT_CHARGING) state = 3; /* charging */ /* If still unknown, determine it based on the battery capacity. 
*/ if (state == APM_UNKNOWN) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } return (state); } static int acpi_capm_convert_battflags(struct acpi_battinfo *battp) { int flags; flags = 0; if (battp->cap >= 50) flags |= APM_BATT_HIGH; else { if (battp->state & ACPI_BATT_STAT_CRITICAL) flags |= APM_BATT_CRITICAL; else flags |= APM_BATT_LOW; } if (battp->state & ACPI_BATT_STAT_CHARGING) flags |= APM_BATT_CHARGING; if (battp->state == ACPI_BATT_STAT_NOT_PRESENT) flags = APM_BATT_NOT_PRESENT; return (flags); } static int acpi_capm_get_info(apm_info_t aip) { int acline; struct acpi_battinfo batt; aip->ai_infoversion = 1; aip->ai_major = 1; aip->ai_minor = 2; aip->ai_status = apm_active; aip->ai_capabilities= 0xff00; /* unknown */ if (acpi_acad_get_acline(&acline)) aip->ai_acline = APM_UNKNOWN; /* unknown */ else aip->ai_acline = acline; /* on/off */ if (acpi_battery_get_battinfo(NULL, &batt) != 0) { aip->ai_batt_stat = APM_UNKNOWN; aip->ai_batt_life = APM_UNKNOWN; aip->ai_batt_time = -1; /* unknown */ aip->ai_batteries = ~0U; /* unknown */ } else { aip->ai_batt_stat = acpi_capm_convert_battstate(&batt); aip->ai_batt_life = batt.cap; aip->ai_batt_time = (batt.min == -1) ? -1 : batt.min * 60; aip->ai_batteries = acpi_battery_get_units(); } return (0); } static int acpi_capm_get_pwstatus(apm_pwstatus_t app) { device_t dev; int acline, unit, error; struct acpi_battinfo batt; if (app->ap_device != PMDV_ALLDEV && (app->ap_device < PMDV_BATT0 || app->ap_device > PMDV_BATT_ALL)) return (1); if (app->ap_device == PMDV_ALLDEV) error = acpi_battery_get_battinfo(NULL, &batt); else { unit = app->ap_device - PMDV_BATT0; dev = devclass_get_device(devclass_find("battery"), unit); if (dev != NULL) error = acpi_battery_get_battinfo(dev, &batt); else error = ENXIO; } if (error) return (1); app->ap_batt_stat = acpi_capm_convert_battstate(&batt); app->ap_batt_flag = acpi_capm_convert_battflags(&batt); app->ap_batt_life = batt.cap; app->ap_batt_time = (batt.min == -1) ? -1 : batt.min * 60; if (acpi_acad_get_acline(&acline)) app->ap_acline = APM_UNKNOWN; else app->ap_acline = acline; /* on/off */ return (0); } /* Create a struct for tracking per-device suspend notification. */ static struct apm_clone_data * apm_create_clone(struct cdev *dev, struct acpi_softc *acpi_sc) { struct apm_clone_data *clone; clone = malloc(sizeof(*clone), M_APMDEV, M_WAITOK); clone->cdev = dev; clone->acpi_sc = acpi_sc; clone->notify_status = APM_EV_NONE; bzero(&clone->sel_read, sizeof(clone->sel_read)); knlist_init_mtx(&clone->sel_read.si_note, &acpi_mutex); /* * The acpi device is always managed by devd(8) and is considered * writable (i.e., ack is required to allow suspend to proceed.) */ if (strcmp("acpi", devtoname(dev)) == 0) clone->flags = ACPI_EVF_DEVD | ACPI_EVF_WRITE; else clone->flags = ACPI_EVF_NONE; ACPI_LOCK(acpi); STAILQ_INSERT_TAIL(&acpi_sc->apm_cdevs, clone, entries); ACPI_UNLOCK(acpi); return (clone); } static void apmdtor(void *data) { struct apm_clone_data *clone; struct acpi_softc *acpi_sc; clone = data; acpi_sc = clone->acpi_sc; /* We are about to lose a reference so check if suspend should occur */ if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status != APM_EV_ACKED) acpi_AckSleepState(clone, 0); /* Remove this clone's data from the list and free it. 
*/ ACPI_LOCK(acpi); STAILQ_REMOVE(&acpi_sc->apm_cdevs, clone, apm_clone_data, entries); seldrain(&clone->sel_read); knlist_destroy(&clone->sel_read.si_note); ACPI_UNLOCK(acpi); free(clone, M_APMDEV); } static int apmopen(struct cdev *dev, int flag, int fmt, struct thread *td) { struct acpi_softc *acpi_sc; struct apm_clone_data *clone; acpi_sc = devclass_get_softc(devclass_find("acpi"), 0); clone = apm_create_clone(dev, acpi_sc); devfs_set_cdevpriv(clone, apmdtor); /* If the device is opened for write, record that. */ if ((flag & FWRITE) != 0) clone->flags |= ACPI_EVF_WRITE; return (0); } static int apmioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; struct apm_clone_data *clone; struct acpi_softc *acpi_sc; struct apm_info info; struct apm_event_info *ev_info; apm_info_old_t aiop; error = 0; devfs_get_cdevpriv((void **)&clone); acpi_sc = clone->acpi_sc; switch (cmd) { case APMIO_SUSPEND: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_suspend_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_suspend_sx); } else { printf( "power off via apm suspend not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_STANDBY: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_standby_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_standby_sx); } else { printf( "power off via apm standby not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_NEXTEVENT: printf("apm nextevent start\n"); ACPI_LOCK(acpi); if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status == APM_EV_NONE) { ev_info = (struct apm_event_info *)addr; if (acpi_sc->acpi_next_sstate <= ACPI_STATE_S3) ev_info->type = PMEV_STANDBYREQ; else ev_info->type = PMEV_SUSPENDREQ; ev_info->index = 0; clone->notify_status = APM_EV_NOTIFIED; printf("apm event returning %d\n", ev_info->type); } else error = EAGAIN; ACPI_UNLOCK(acpi); break; case APMIO_GETINFO_OLD: if (acpi_capm_get_info(&info)) error = ENXIO; aiop = (apm_info_old_t)addr; aiop->ai_major = info.ai_major; aiop->ai_minor = info.ai_minor; aiop->ai_acline = info.ai_acline; aiop->ai_batt_stat = info.ai_batt_stat; aiop->ai_batt_life = info.ai_batt_life; aiop->ai_status = info.ai_status; break; case APMIO_GETINFO: if (acpi_capm_get_info((apm_info_t)addr)) error = ENXIO; break; case APMIO_GETPWSTATUS: if (acpi_capm_get_pwstatus((apm_pwstatus_t)addr)) error = ENXIO; break; case APMIO_ENABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 1; break; case APMIO_DISABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 0; break; case APMIO_HALTCPU: break; case APMIO_NOTHALTCPU: break; case APMIO_DISPLAY: if ((flag & FWRITE) == 0) return (EPERM); break; case APMIO_BIOS: if ((flag & FWRITE) == 0) return (EPERM); bzero(addr, sizeof(struct apm_bios_arg)); break; default: error = EINVAL; break; } return (error); } static int apmwrite(struct cdev *dev, struct uio *uio, int ioflag) { return (uio->uio_resid); } static int apmpoll(struct cdev *dev, int events, struct thread *td) { struct apm_clone_data *clone; int revents; revents = 0; devfs_get_cdevpriv((void **)&clone); ACPI_LOCK(acpi); if (clone->acpi_sc->acpi_next_sstate) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &clone->sel_read); ACPI_UNLOCK(acpi); return (revents); } static int apmkqfilter(struct cdev *dev, struct knote *kn) { struct apm_clone_data 
*clone; devfs_get_cdevpriv((void **)&clone); ACPI_LOCK(acpi); kn->kn_hook = clone; kn->kn_fop = &apm_readfiltops; knlist_add(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); return (0); } static void apmreadfiltdetach(struct knote *kn) { struct apm_clone_data *clone; ACPI_LOCK(acpi); clone = kn->kn_hook; knlist_remove(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); } static int apmreadfilt(struct knote *kn, long hint) { struct apm_clone_data *clone; int sleeping; ACPI_LOCK(acpi); clone = kn->kn_hook; sleeping = clone->acpi_sc->acpi_next_sstate ? 1 : 0; ACPI_UNLOCK(acpi); return (sleeping); } void acpi_apm_init(struct acpi_softc *sc) { /* Create a clone for /dev/acpi also. */ STAILQ_INIT(&sc->apm_cdevs); sc->acpi_clone = apm_create_clone(sc->acpi_dev_t, sc); make_dev(&apm_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0660, "apmctl"); make_dev(&apm_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0664, "apm"); }
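Finally, a short userspace sketch of the APM emulation this file provides: query AC-line and battery state through APMIO_GETINFO on /dev/apm, which is served by acpi_capm_get_info() above. The <machine/apm_bios.h> header location is an assumption here.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/apm_bios.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct apm_info info;
    int fd;

    fd = open("/dev/apm", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/apm)");
    if (ioctl(fd, APMIO_GETINFO, &info) < 0)
        err(1, "APMIO_GETINFO");
    printf("AC line: %u, battery state: %u, battery life: %u%%\n",
        info.ai_acline, info.ai_batt_stat, info.ai_batt_life);
    close(fd);
    return (0);
}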