Index: head/sys/arm/allwinner/aw_wdog.c
===================================================================
--- head/sys/arm/allwinner/aw_wdog.c (revision 327172)
+++ head/sys/arm/allwinner/aw_wdog.c (revision 327173)
@@ -1,275 +1,272 @@
/*-
* Copyright (c) 2013 Oleksandr Tymoshenko <gonzo@freebsd.org>
* Copyright (c) 2016 Emmanuel Vadot <manu@bidouilliste.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/watchdog.h>
#include <sys/reboot.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/bus.h>
#include <machine/machdep.h>
#include <arm/allwinner/aw_wdog.h>
#define READ(_sc, _r) bus_read_4((_sc)->res, (_r))
#define WRITE(_sc, _r, _v) bus_write_4((_sc)->res, (_r), (_v))
#define A10_WDOG_CTRL 0x00
#define A31_WDOG_CTRL 0x10
#define WDOG_CTRL_RESTART (1 << 0)
#define A31_WDOG_CTRL_KEY (0xa57 << 1)
#define A10_WDOG_MODE 0x04
#define A31_WDOG_MODE 0x18
#define A10_WDOG_MODE_INTVL_SHIFT 3
#define A31_WDOG_MODE_INTVL_SHIFT 4
#define A10_WDOG_MODE_RST_EN (1 << 1)
#define WDOG_MODE_EN (1 << 0)
#define A31_WDOG_CONFIG 0x14
#define A31_WDOG_CONFIG_RST_EN_SYSTEM (1 << 0)
#define A31_WDOG_CONFIG_RST_EN_INT (2 << 0)
struct aw_wdog_interval {
uint64_t milliseconds;
unsigned int value;
};
struct aw_wdog_interval wd_intervals[] = {
{ 500, 0 },
{ 1000, 1 },
{ 2000, 2 },
{ 3000, 3 },
{ 4000, 4 },
{ 5000, 5 },
{ 6000, 6 },
{ 8000, 7 },
{ 10000, 8 },
{ 12000, 9 },
{ 14000, 10 },
{ 16000, 11 },
{ 0, 0 } /* sentinel */
};
static struct aw_wdog_softc *aw_wdog_sc = NULL;
struct aw_wdog_softc {
device_t dev;
struct resource * res;
struct mtx mtx;
uint8_t wdog_ctrl;
uint32_t wdog_ctrl_key;
uint8_t wdog_mode;
uint8_t wdog_mode_intvl_shift;
uint8_t wdog_mode_en;
uint8_t wdog_config;
uint8_t wdog_config_value;
};
#define A10_WATCHDOG 1
#define A31_WATCHDOG 2
static struct ofw_compat_data compat_data[] = {
{"allwinner,sun4i-a10-wdt", A10_WATCHDOG},
{"allwinner,sun6i-a31-wdt", A31_WATCHDOG},
{NULL, 0}
};
static void aw_wdog_watchdog_fn(void *, u_int, int *);
static void aw_wdog_shutdown_fn(void *, int);
static int
aw_wdog_probe(device_t dev)
{
- struct aw_wdog_softc *sc;
-
- sc = device_get_softc(dev);
if (!ofw_bus_status_okay(dev))
return (ENXIO);
switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
case A10_WATCHDOG:
device_set_desc(dev, "Allwinner A10 Watchdog");
return (BUS_PROBE_DEFAULT);
case A31_WATCHDOG:
device_set_desc(dev, "Allwinner A31 Watchdog");
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
aw_wdog_attach(device_t dev)
{
struct aw_wdog_softc *sc;
int rid;
if (aw_wdog_sc != NULL)
return (ENXIO);
sc = device_get_softc(dev);
sc->dev = dev;
rid = 0;
sc->res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (sc->res == NULL) {
device_printf(dev, "could not allocate memory resource\n");
return (ENXIO);
}
aw_wdog_sc = sc;
switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) {
case A10_WATCHDOG:
sc->wdog_ctrl = A10_WDOG_CTRL;
sc->wdog_mode = A10_WDOG_MODE;
sc->wdog_mode_intvl_shift = A10_WDOG_MODE_INTVL_SHIFT;
sc->wdog_mode_en = A10_WDOG_MODE_RST_EN | WDOG_MODE_EN;
break;
case A31_WATCHDOG:
sc->wdog_ctrl = A31_WDOG_CTRL;
sc->wdog_ctrl_key = A31_WDOG_CTRL_KEY;
sc->wdog_mode = A31_WDOG_MODE;
sc->wdog_mode_intvl_shift = A31_WDOG_MODE_INTVL_SHIFT;
sc->wdog_mode_en = WDOG_MODE_EN;
sc->wdog_config = A31_WDOG_CONFIG;
sc->wdog_config_value = A31_WDOG_CONFIG_RST_EN_SYSTEM;
break;
default:
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->res);
return (ENXIO);
}
mtx_init(&sc->mtx, "AW Watchdog", "aw_wdog", MTX_DEF);
EVENTHANDLER_REGISTER(watchdog_list, aw_wdog_watchdog_fn, sc, 0);
EVENTHANDLER_REGISTER(shutdown_final, aw_wdog_shutdown_fn, sc,
SHUTDOWN_PRI_LAST - 1);
return (0);
}
static void
aw_wdog_watchdog_fn(void *private, u_int cmd, int *error)
{
struct aw_wdog_softc *sc;
uint64_t ms;
int i;
sc = private;
mtx_lock(&sc->mtx);
cmd &= WD_INTERVAL;
if (cmd > 0) {
ms = ((uint64_t)1 << (cmd & WD_INTERVAL)) / 1000000;
i = 0;
while (wd_intervals[i].milliseconds &&
(ms > wd_intervals[i].milliseconds))
i++;
if (wd_intervals[i].milliseconds) {
WRITE(sc, sc->wdog_mode,
(wd_intervals[i].value << sc->wdog_mode_intvl_shift) |
sc->wdog_mode_en);
WRITE(sc, sc->wdog_ctrl,
WDOG_CTRL_RESTART | sc->wdog_ctrl_key);
if (sc->wdog_config)
WRITE(sc, sc->wdog_config,
sc->wdog_config_value);
*error = 0;
}
else {
/*
* Can't arm
* disable watchdog as watchdog(9) requires
*/
device_printf(sc->dev,
"Can't arm, timeout is more than 16 sec\n");
mtx_unlock(&sc->mtx);
WRITE(sc, sc->wdog_mode, 0);
return;
}
}
else
WRITE(sc, sc->wdog_mode, 0);
mtx_unlock(&sc->mtx);
}
static void
aw_wdog_shutdown_fn(void *private, int howto)
{
if ((howto & (RB_POWEROFF|RB_HALT)) == 0)
aw_wdog_watchdog_reset();
}
void
aw_wdog_watchdog_reset(void)
{
if (aw_wdog_sc == NULL) {
printf("Reset: watchdog device has not been initialized\n");
return;
}
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_mode,
(wd_intervals[0].value << aw_wdog_sc->wdog_mode_intvl_shift) |
aw_wdog_sc->wdog_mode_en);
if (aw_wdog_sc->wdog_config)
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_config,
aw_wdog_sc->wdog_config_value);
WRITE(aw_wdog_sc, aw_wdog_sc->wdog_ctrl,
WDOG_CTRL_RESTART | aw_wdog_sc->wdog_ctrl_key);
while(1)
;
}
static device_method_t aw_wdog_methods[] = {
DEVMETHOD(device_probe, aw_wdog_probe),
DEVMETHOD(device_attach, aw_wdog_attach),
DEVMETHOD_END
};
static driver_t aw_wdog_driver = {
"aw_wdog",
aw_wdog_methods,
sizeof(struct aw_wdog_softc),
};
static devclass_t aw_wdog_devclass;
DRIVER_MODULE(aw_wdog, simplebus, aw_wdog_driver, aw_wdog_devclass, 0, 0);
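
For context on the driver above: aw_wdog_watchdog_fn() maps the watchdog(9) command onto one of the hardware interval codes in wd_intervals[] — the low bits of cmd give the timeout as a power of two in nanoseconds, which the driver converts to milliseconds and rounds up to the next table entry. A minimal standalone sketch of that lookup follows (illustration only, not driver code; WD_TO_4SEC == 32 is assumed from sys/watchdog.h):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the driver's interval table: timeout in ms -> register code. */
static const struct { uint64_t ms; unsigned val; } wd_intervals[] = {
	{ 500, 0 }, { 1000, 1 }, { 2000, 2 }, { 3000, 3 }, { 4000, 4 },
	{ 5000, 5 }, { 6000, 6 }, { 8000, 7 }, { 10000, 8 }, { 12000, 9 },
	{ 14000, 10 }, { 16000, 11 }, { 0, 0 } /* sentinel */
};

int
main(void)
{
	unsigned cmd = 32;	/* WD_TO_4SEC: 2^32 ns, per watchdog(9) */
	uint64_t ms = ((uint64_t)1 << cmd) / 1000000;	/* ~4295 ms */
	int i = 0;

	/* Find the first interval at least as long as the request. */
	while (wd_intervals[i].ms && ms > wd_intervals[i].ms)
		i++;
	if (wd_intervals[i].ms)
		printf("%llu ms -> code %u (%llu ms)\n",
		    (unsigned long long)ms, wd_intervals[i].val,
		    (unsigned long long)wd_intervals[i].ms);
	else
		printf("timeout over 16 s: watchdog left disabled\n");
	return (0);
}

Anything beyond the 16-second table maximum leaves the watchdog disabled, matching the "Can't arm" branch in the driver.
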
Index: head/sys/arm/allwinner/axp81x.c
===================================================================
--- head/sys/arm/allwinner/axp81x.c (revision 327172)
+++ head/sys/arm/allwinner/axp81x.c (revision 327173)
@@ -1,785 +1,783 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* X-Powers AXP813/818 PMU for Allwinner SoCs
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/reboot.h>
#include <sys/gpio.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/iicbus/iicbus.h>
#include <dev/iicbus/iiconf.h>
#include <dev/gpio/gpiobusvar.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/extres/regulator/regulator.h>
#include "gpio_if.h"
#include "iicbus_if.h"
#include "regdev_if.h"
MALLOC_DEFINE(M_AXP81X_REG, "AXP81x regulator", "AXP81x power regulator");
#define AXP_ICTYPE 0x03
#define AXP_POWERCTL1 0x10
#define AXP_POWERCTL1_DCDC2 (1 << 1)
#define AXP_POWERCTL2 0x12
#define AXP_POWERCTL2_DC1SW (1 << 7)
#define AXP_VOLTCTL_DCDC2 0x21
#define AXP_VOLTCTL_STATUS (1 << 7)
#define AXP_VOLTCTL_MASK 0x7f
#define AXP_POWERBAT 0x32
#define AXP_POWERBAT_SHUTDOWN (1 << 7)
#define AXP_IRQEN1 0x40
#define AXP_IRQEN2 0x41
#define AXP_IRQEN3 0x42
#define AXP_IRQEN4 0x43
#define AXP_IRQEN5 0x44
#define AXP_IRQEN5_POKSIRQ (1 << 4)
#define AXP_IRQEN6 0x45
#define AXP_IRQSTAT5 0x4c
#define AXP_IRQSTAT5_POKSIRQ (1 << 4)
#define AXP_GPIO0_CTRL 0x90
#define AXP_GPIO1_CTRL 0x92
#define AXP_GPIO_FUNC (0x7 << 0)
#define AXP_GPIO_FUNC_SHIFT 0
#define AXP_GPIO_FUNC_DRVLO 0
#define AXP_GPIO_FUNC_DRVHI 1
#define AXP_GPIO_FUNC_INPUT 2
#define AXP_GPIO_SIGBIT 0x94
#define AXP_GPIO_PD 0x97
static const struct {
const char *name;
uint8_t ctrl_reg;
} axp81x_pins[] = {
{ "GPIO0", AXP_GPIO0_CTRL },
{ "GPIO1", AXP_GPIO1_CTRL },
};
static struct ofw_compat_data compat_data[] = {
{ "x-powers,axp813", 1 },
{ "x-powers,axp818", 1 },
{ NULL, 0 }
};
static struct resource_spec axp81x_spec[] = {
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ -1, 0 }
};
struct axp81x_regdef {
intptr_t id;
char *name;
char *supply_name;
uint8_t enable_reg;
uint8_t enable_mask;
uint8_t voltage_reg;
int voltage_min;
int voltage_max;
int voltage_step1;
int voltage_nstep1;
int voltage_step2;
int voltage_nstep2;
};
enum axp81x_reg_id {
AXP81X_REG_ID_DC1SW,
AXP81X_REG_ID_DCDC2,
};
static struct axp81x_regdef axp81x_regdefs[] = {
{
.id = AXP81X_REG_ID_DC1SW,
.name = "dc1sw",
.enable_reg = AXP_POWERCTL2,
.enable_mask = AXP_POWERCTL2_DC1SW,
},
{
.id = AXP81X_REG_ID_DCDC2,
.name = "dcdc2",
.enable_reg = AXP_POWERCTL1,
.enable_mask = AXP_POWERCTL1_DCDC2,
.voltage_reg = AXP_VOLTCTL_DCDC2,
.voltage_min = 500,
.voltage_max = 1300,
.voltage_step1 = 10,
.voltage_nstep1 = 70,
.voltage_step2 = 20,
.voltage_nstep2 = 5,
},
};
struct axp81x_softc;
struct axp81x_reg_sc {
struct regnode *regnode;
device_t base_dev;
struct axp81x_regdef *def;
phandle_t xref;
struct regnode_std_param *param;
};
struct axp81x_softc {
struct resource *res;
uint16_t addr;
void *ih;
device_t gpiodev;
struct mtx mtx;
int busy;
/* Regulators */
struct axp81x_reg_sc **regs;
int nregs;
};
#define AXP_LOCK(sc) mtx_lock(&(sc)->mtx)
#define AXP_UNLOCK(sc) mtx_unlock(&(sc)->mtx)
static int
axp81x_read(device_t dev, uint8_t reg, uint8_t *data, uint8_t size)
{
struct axp81x_softc *sc;
struct iic_msg msg[2];
sc = device_get_softc(dev);
msg[0].slave = sc->addr;
msg[0].flags = IIC_M_WR;
msg[0].len = 1;
msg[0].buf = &reg;
msg[1].slave = sc->addr;
msg[1].flags = IIC_M_RD;
msg[1].len = size;
msg[1].buf = data;
return (iicbus_transfer(dev, msg, 2));
}
static int
axp81x_write(device_t dev, uint8_t reg, uint8_t val)
{
struct axp81x_softc *sc;
struct iic_msg msg[2];
sc = device_get_softc(dev);
msg[0].slave = sc->addr;
msg[0].flags = IIC_M_WR;
msg[0].len = 1;
msg[0].buf = &reg;
msg[1].slave = sc->addr;
msg[1].flags = IIC_M_WR;
msg[1].len = 1;
msg[1].buf = &val;
return (iicbus_transfer(dev, msg, 2));
}
static int
axp81x_regnode_init(struct regnode *regnode)
{
return (0);
}
static int
axp81x_regnode_enable(struct regnode *regnode, bool enable, int *udelay)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
axp81x_read(sc->base_dev, sc->def->enable_reg, &val, 1);
if (enable)
val |= sc->def->enable_mask;
else
val &= ~sc->def->enable_mask;
axp81x_write(sc->base_dev, sc->def->enable_reg, val);
*udelay = 0;
return (0);
}
static void
axp81x_regnode_reg_to_voltage(struct axp81x_reg_sc *sc, uint8_t val, int *uv)
{
if (val < sc->def->voltage_nstep1)
*uv = sc->def->voltage_min + val * sc->def->voltage_step1;
else
*uv = sc->def->voltage_min +
(sc->def->voltage_nstep1 * sc->def->voltage_step1) +
((val - sc->def->voltage_nstep1) * sc->def->voltage_step2);
*uv *= 1000;
}
static int
axp81x_regnode_voltage_to_reg(struct axp81x_reg_sc *sc, int min_uvolt,
int max_uvolt, uint8_t *val)
{
uint8_t nval;
int nstep, uvolt;
nval = 0;
uvolt = sc->def->voltage_min * 1000;
for (nstep = 0; nstep < sc->def->voltage_nstep1 && uvolt < min_uvolt;
nstep++) {
++nval;
uvolt += (sc->def->voltage_step1 * 1000);
}
for (nstep = 0; nstep < sc->def->voltage_nstep2 && uvolt < min_uvolt;
nstep++) {
++nval;
uvolt += (sc->def->voltage_step2 * 1000);
}
if (uvolt > max_uvolt)
return (EINVAL);
*val = nval;
return (0);
}
static int
axp81x_regnode_set_voltage(struct regnode *regnode, int min_uvolt,
int max_uvolt, int *udelay)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
if (!sc->def->voltage_step1 || !sc->def->voltage_step2)
return (ENXIO);
if (axp81x_regnode_voltage_to_reg(sc, min_uvolt, max_uvolt, &val) != 0)
return (ERANGE);
axp81x_write(sc->base_dev, sc->def->voltage_reg, val);
*udelay = 0;
return (0);
}
static int
axp81x_regnode_get_voltage(struct regnode *regnode, int *uvolt)
{
struct axp81x_reg_sc *sc;
uint8_t val;
sc = regnode_get_softc(regnode);
if (!sc->def->voltage_step1 || !sc->def->voltage_step2)
return (ENXIO);
axp81x_read(sc->base_dev, sc->def->voltage_reg, &val, 1);
axp81x_regnode_reg_to_voltage(sc, val & AXP_VOLTCTL_MASK, uvolt);
return (0);
}
static regnode_method_t axp81x_regnode_methods[] = {
/* Regulator interface */
REGNODEMETHOD(regnode_init, axp81x_regnode_init),
REGNODEMETHOD(regnode_enable, axp81x_regnode_enable),
REGNODEMETHOD(regnode_set_voltage, axp81x_regnode_set_voltage),
REGNODEMETHOD(regnode_get_voltage, axp81x_regnode_get_voltage),
REGNODEMETHOD_END
};
DEFINE_CLASS_1(axp81x_regnode, axp81x_regnode_class, axp81x_regnode_methods,
sizeof(struct axp81x_reg_sc), regnode_class);
static void
axp81x_shutdown(void *devp, int howto)
{
device_t dev;
if ((howto & RB_POWEROFF) == 0)
return;
dev = devp;
if (bootverbose)
device_printf(dev, "Shutdown AXP81x\n");
axp81x_write(dev, AXP_POWERBAT, AXP_POWERBAT_SHUTDOWN);
}
static void
axp81x_intr(void *arg)
{
- struct axp81x_softc *sc;
device_t dev;
uint8_t val;
int error;
dev = arg;
- sc = device_get_softc(dev);
error = axp81x_read(dev, AXP_IRQSTAT5, &val, 1);
if (error != 0)
return;
if (val != 0) {
if ((val & AXP_IRQSTAT5_POKSIRQ) != 0) {
if (bootverbose)
device_printf(dev, "Power button pressed\n");
shutdown_nice(RB_POWEROFF);
}
/* Acknowledge */
axp81x_write(dev, AXP_IRQSTAT5, val);
}
}
static device_t
axp81x_gpio_get_bus(device_t dev)
{
struct axp81x_softc *sc;
sc = device_get_softc(dev);
return (sc->gpiodev);
}
static int
axp81x_gpio_pin_max(device_t dev, int *maxpin)
{
*maxpin = nitems(axp81x_pins) - 1;
return (0);
}
static int
axp81x_gpio_pin_getname(device_t dev, uint32_t pin, char *name)
{
if (pin >= nitems(axp81x_pins))
return (EINVAL);
snprintf(name, GPIOMAXNAME, "%s", axp81x_pins[pin].name);
return (0);
}
static int
axp81x_gpio_pin_getcaps(device_t dev, uint32_t pin, uint32_t *caps)
{
if (pin >= nitems(axp81x_pins))
return (EINVAL);
*caps = GPIO_PIN_INPUT | GPIO_PIN_OUTPUT;
return (0);
}
static int
axp81x_gpio_pin_getflags(device_t dev, uint32_t pin, uint32_t *flags)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
if (func == AXP_GPIO_FUNC_INPUT)
*flags = GPIO_PIN_INPUT;
else if (func == AXP_GPIO_FUNC_DRVLO ||
func == AXP_GPIO_FUNC_DRVHI)
*flags = GPIO_PIN_OUTPUT;
else
*flags = 0;
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_setflags(device_t dev, uint32_t pin, uint32_t flags)
{
struct axp81x_softc *sc;
uint8_t data;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
data &= ~AXP_GPIO_FUNC;
if ((flags & (GPIO_PIN_INPUT|GPIO_PIN_OUTPUT)) != 0) {
if ((flags & GPIO_PIN_OUTPUT) == 0)
data |= AXP_GPIO_FUNC_INPUT;
}
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_get(device_t dev, uint32_t pin, unsigned int *val)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
*val = 0;
break;
case AXP_GPIO_FUNC_DRVHI:
*val = 1;
break;
case AXP_GPIO_FUNC_INPUT:
error = axp81x_read(dev, AXP_GPIO_SIGBIT, &data, 1);
if (error == 0)
*val = (data & (1 << pin)) ? 1 : 0;
break;
default:
error = EIO;
break;
}
}
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_set(device_t dev, uint32_t pin, unsigned int val)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
case AXP_GPIO_FUNC_DRVHI:
data &= ~AXP_GPIO_FUNC;
data |= (val << AXP_GPIO_FUNC_SHIFT);
break;
default:
error = EIO;
break;
}
}
if (error == 0)
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_pin_toggle(device_t dev, uint32_t pin)
{
struct axp81x_softc *sc;
uint8_t data, func;
int error;
if (pin >= nitems(axp81x_pins))
return (EINVAL);
sc = device_get_softc(dev);
AXP_LOCK(sc);
error = axp81x_read(dev, axp81x_pins[pin].ctrl_reg, &data, 1);
if (error == 0) {
func = (data & AXP_GPIO_FUNC) >> AXP_GPIO_FUNC_SHIFT;
switch (func) {
case AXP_GPIO_FUNC_DRVLO:
data &= ~AXP_GPIO_FUNC;
data |= (AXP_GPIO_FUNC_DRVHI << AXP_GPIO_FUNC_SHIFT);
break;
case AXP_GPIO_FUNC_DRVHI:
data &= ~AXP_GPIO_FUNC;
data |= (AXP_GPIO_FUNC_DRVLO << AXP_GPIO_FUNC_SHIFT);
break;
default:
error = EIO;
break;
}
}
if (error == 0)
error = axp81x_write(dev, axp81x_pins[pin].ctrl_reg, data);
AXP_UNLOCK(sc);
return (error);
}
static int
axp81x_gpio_map_gpios(device_t bus, phandle_t dev, phandle_t gparent,
int gcells, pcell_t *gpios, uint32_t *pin, uint32_t *flags)
{
if (gpios[0] >= nitems(axp81x_pins))
return (EINVAL);
*pin = gpios[0];
*flags = gpios[1];
return (0);
}
static phandle_t
axp81x_get_node(device_t dev, device_t bus)
{
return (ofw_bus_get_node(dev));
}
static struct axp81x_reg_sc *
axp81x_reg_attach(device_t dev, phandle_t node,
struct axp81x_regdef *def)
{
struct axp81x_reg_sc *reg_sc;
struct regnode_init_def initdef;
struct regnode *regnode;
memset(&initdef, 0, sizeof(initdef));
regulator_parse_ofw_stdparam(dev, node, &initdef);
if (initdef.std_param.min_uvolt == 0)
initdef.std_param.min_uvolt = def->voltage_min * 1000;
if (initdef.std_param.max_uvolt == 0)
initdef.std_param.max_uvolt = def->voltage_max * 1000;
initdef.id = def->id;
initdef.ofw_node = node;
regnode = regnode_create(dev, &axp81x_regnode_class, &initdef);
if (regnode == NULL) {
device_printf(dev, "cannot create regulator\n");
return (NULL);
}
reg_sc = regnode_get_softc(regnode);
reg_sc->regnode = regnode;
reg_sc->base_dev = dev;
reg_sc->def = def;
reg_sc->xref = OF_xref_from_node(node);
reg_sc->param = regnode_get_stdparam(regnode);
regnode_register(regnode);
return (reg_sc);
}
static int
axp81x_regdev_map(device_t dev, phandle_t xref, int ncells, pcell_t *cells,
intptr_t *num)
{
struct axp81x_softc *sc;
int i;
sc = device_get_softc(dev);
for (i = 0; i < sc->nregs; i++) {
if (sc->regs[i] == NULL)
continue;
if (sc->regs[i]->xref == xref) {
*num = sc->regs[i]->def->id;
return (0);
}
}
return (ENXIO);
}
static int
axp81x_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "X-Powers AXP81x Power Management Unit");
return (BUS_PROBE_DEFAULT);
}
static int
axp81x_attach(device_t dev)
{
struct axp81x_softc *sc;
struct axp81x_reg_sc *reg;
uint8_t chip_id;
phandle_t rnode, child;
int error, i;
sc = device_get_softc(dev);
sc->addr = iicbus_get_addr(dev);
mtx_init(&sc->mtx, device_get_nameunit(dev), NULL, MTX_DEF);
error = bus_alloc_resources(dev, axp81x_spec, &sc->res);
if (error != 0) {
device_printf(dev, "cannot allocate resources for device\n");
return (error);
}
if (bootverbose) {
axp81x_read(dev, AXP_ICTYPE, &chip_id, 1);
device_printf(dev, "chip ID 0x%02x\n", chip_id);
}
sc->nregs = nitems(axp81x_regdefs);
sc->regs = malloc(sizeof(struct axp81x_reg_sc *) * sc->nregs,
M_AXP81X_REG, M_WAITOK | M_ZERO);
/* Attach known regulators that exist in the DT */
rnode = ofw_bus_find_child(ofw_bus_get_node(dev), "regulators");
if (rnode > 0) {
for (i = 0; i < sc->nregs; i++) {
child = ofw_bus_find_child(rnode,
axp81x_regdefs[i].name);
if (child == 0)
continue;
reg = axp81x_reg_attach(dev, child, &axp81x_regdefs[i]);
if (reg == NULL) {
device_printf(dev,
"cannot attach regulator %s\n",
axp81x_regdefs[i].name);
return (ENXIO);
}
sc->regs[i] = reg;
}
}
/* Enable IRQ on short power key press */
axp81x_write(dev, AXP_IRQEN1, 0);
axp81x_write(dev, AXP_IRQEN2, 0);
axp81x_write(dev, AXP_IRQEN3, 0);
axp81x_write(dev, AXP_IRQEN4, 0);
axp81x_write(dev, AXP_IRQEN5, AXP_IRQEN5_POKSIRQ);
axp81x_write(dev, AXP_IRQEN6, 0);
/* Install interrupt handler */
error = bus_setup_intr(dev, sc->res, INTR_TYPE_MISC | INTR_MPSAFE,
NULL, axp81x_intr, dev, &sc->ih);
if (error != 0) {
device_printf(dev, "cannot setup interrupt handler\n");
return (error);
}
EVENTHANDLER_REGISTER(shutdown_final, axp81x_shutdown, dev,
SHUTDOWN_PRI_LAST);
sc->gpiodev = gpiobus_attach_bus(dev);
return (0);
}
static device_method_t axp81x_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, axp81x_probe),
DEVMETHOD(device_attach, axp81x_attach),
/* GPIO interface */
DEVMETHOD(gpio_get_bus, axp81x_gpio_get_bus),
DEVMETHOD(gpio_pin_max, axp81x_gpio_pin_max),
DEVMETHOD(gpio_pin_getname, axp81x_gpio_pin_getname),
DEVMETHOD(gpio_pin_getcaps, axp81x_gpio_pin_getcaps),
DEVMETHOD(gpio_pin_getflags, axp81x_gpio_pin_getflags),
DEVMETHOD(gpio_pin_setflags, axp81x_gpio_pin_setflags),
DEVMETHOD(gpio_pin_get, axp81x_gpio_pin_get),
DEVMETHOD(gpio_pin_set, axp81x_gpio_pin_set),
DEVMETHOD(gpio_pin_toggle, axp81x_gpio_pin_toggle),
DEVMETHOD(gpio_map_gpios, axp81x_gpio_map_gpios),
/* Regdev interface */
DEVMETHOD(regdev_map, axp81x_regdev_map),
/* OFW bus interface */
DEVMETHOD(ofw_bus_get_node, axp81x_get_node),
DEVMETHOD_END
};
static driver_t axp81x_driver = {
"axp81x_pmu",
axp81x_methods,
sizeof(struct axp81x_softc),
};
static devclass_t axp81x_devclass;
extern devclass_t ofwgpiobus_devclass, gpioc_devclass;
extern driver_t ofw_gpiobus_driver, gpioc_driver;
EARLY_DRIVER_MODULE(axp81x, iicbus, axp81x_driver, axp81x_devclass, 0, 0,
BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LAST);
EARLY_DRIVER_MODULE(ofw_gpiobus, axp81x_pmu, ofw_gpiobus_driver,
ofwgpiobus_devclass, 0, 0, BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LAST);
DRIVER_MODULE(gpioc, axp81x_pmu, gpioc_driver, gpioc_devclass, 0, 0);
MODULE_VERSION(axp81x, 1);
MODULE_DEPEND(axp81x, iicbus, 1, 1, 1);
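
For context on the regulator definitions above: dcdc2 uses a split-step voltage encoding — a 500 mV base, 70 steps of 10 mV, then 5 steps of 20 mV up to the 1300 mV maximum — which is what axp81x_regnode_reg_to_voltage() decodes. A standalone sketch of that decode follows (illustration only, not driver code):

#include <stdio.h>

/* DCDC2 parameters from axp81x_regdefs[]; values in mV and step counts. */
#define DCDC2_MIN	500
#define DCDC2_STEP1	10
#define DCDC2_NSTEP1	70
#define DCDC2_STEP2	20

/* Decode the low 7 bits of AXP_VOLTCTL_DCDC2 to microvolts. */
static int
dcdc2_reg_to_uvolt(unsigned val)
{
	int mv;

	if (val < DCDC2_NSTEP1)
		mv = DCDC2_MIN + val * DCDC2_STEP1;
	else
		mv = DCDC2_MIN + DCDC2_NSTEP1 * DCDC2_STEP1 +
		    (val - DCDC2_NSTEP1) * DCDC2_STEP2;
	return (mv * 1000);
}

int
main(void)
{
	/* 0 -> 500000 uV, 70 -> 1200000 uV, 75 -> 1300000 uV (maximum). */
	printf("%d %d %d\n", dcdc2_reg_to_uvolt(0), dcdc2_reg_to_uvolt(70),
	    dcdc2_reg_to_uvolt(75));
	return (0);
}
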
Index: head/sys/arm/allwinner/clk/aw_pll.c
===================================================================
--- head/sys/arm/allwinner/clk/aw_pll.c (revision 327172)
+++ head/sys/arm/allwinner/clk/aw_pll.c (revision 327173)
@@ -1,1349 +1,1347 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Allwinner PLL clock
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/ofw_subr.h>
#include <dev/extres/clk/clk.h>
#include <arm/allwinner/aw_machdep.h>
#include "clkdev_if.h"
#define SUN4I_A10_PLL2_1X 0
#define SUN4I_A10_PLL2_2X 1
#define SUN4I_A10_PLL2_4X 2
#define SUN4I_A10_PLL2_8X 3
#define AW_PLL_ENABLE (1 << 31)
#define A10_PLL1_OUT_EXT_DIVP (0x3 << 16)
#define A10_PLL1_OUT_EXT_DIVP_SHIFT 16
#define A10_PLL1_FACTOR_N (0x1f << 8)
#define A10_PLL1_FACTOR_N_SHIFT 8
#define A10_PLL1_FACTOR_K (0x3 << 4)
#define A10_PLL1_FACTOR_K_SHIFT 4
#define A10_PLL1_FACTOR_M (0x3 << 0)
#define A10_PLL1_FACTOR_M_SHIFT 0
#define A10_PLL2_POST_DIV (0xf << 26)
#define A10_PLL2_POST_DIV_SHIFT 26
#define A10_PLL2_FACTOR_N (0x7f << 8)
#define A10_PLL2_FACTOR_N_SHIFT 8
#define A10_PLL2_PRE_DIV (0x1f << 0)
#define A10_PLL2_PRE_DIV_SHIFT 0
#define A10_PLL3_MODE_SEL (0x1 << 15)
#define A10_PLL3_MODE_SEL_FRACT (0 << 15)
#define A10_PLL3_MODE_SEL_INT (1 << 15)
#define A10_PLL3_FUNC_SET (0x1 << 14)
#define A10_PLL3_FUNC_SET_270MHZ (0 << 14)
#define A10_PLL3_FUNC_SET_297MHZ (1 << 14)
#define A10_PLL3_FACTOR_M (0x7f << 0)
#define A10_PLL3_FACTOR_M_SHIFT 0
#define A10_PLL3_REF_FREQ 3000000
#define A10_PLL5_OUT_EXT_DIVP (0x3 << 16)
#define A10_PLL5_OUT_EXT_DIVP_SHIFT 16
#define A10_PLL5_FACTOR_N (0x1f << 8)
#define A10_PLL5_FACTOR_N_SHIFT 8
#define A10_PLL5_FACTOR_K (0x3 << 4)
#define A10_PLL5_FACTOR_K_SHIFT 4
#define A10_PLL5_FACTOR_M1 (0x3 << 2)
#define A10_PLL5_FACTOR_M1_SHIFT 2
#define A10_PLL5_FACTOR_M (0x3 << 0)
#define A10_PLL5_FACTOR_M_SHIFT 0
#define A10_PLL6_BYPASS_EN (1 << 30)
#define A10_PLL6_SATA_CLK_EN (1 << 14)
#define A10_PLL6_FACTOR_N (0x1f << 8)
#define A10_PLL6_FACTOR_N_SHIFT 8
#define A10_PLL6_FACTOR_K (0x3 << 4)
#define A10_PLL6_FACTOR_K_SHIFT 4
#define A10_PLL6_FACTOR_M (0x3 << 0)
#define A10_PLL6_FACTOR_M_SHIFT 0
#define A10_PLL2_POST_DIV (0xf << 26)
#define A13_PLL2_POST_DIV (0xf << 26)
#define A13_PLL2_POST_DIV_SHIFT 26
#define A13_PLL2_FACTOR_N (0x7f << 8)
#define A13_PLL2_FACTOR_N_SHIFT 8
#define A13_PLL2_PRE_DIV (0x1f << 0)
#define A13_PLL2_PRE_DIV_SHIFT 0
#define A23_PLL1_FACTOR_P (0x3 << 16)
#define A23_PLL1_FACTOR_P_SHIFT 16
#define A23_PLL1_FACTOR_N (0x1f << 8)
#define A23_PLL1_FACTOR_N_SHIFT 8
#define A23_PLL1_FACTOR_K (0x3 << 4)
#define A23_PLL1_FACTOR_K_SHIFT 4
#define A23_PLL1_FACTOR_M (0x3 << 0)
#define A23_PLL1_FACTOR_M_SHIFT 0
#define A31_PLL1_LOCK (1 << 28)
#define A31_PLL1_CPU_SIGMA_DELTA_EN (1 << 24)
#define A31_PLL1_FACTOR_N (0x1f << 8)
#define A31_PLL1_FACTOR_N_SHIFT 8
#define A31_PLL1_FACTOR_K (0x3 << 4)
#define A31_PLL1_FACTOR_K_SHIFT 4
#define A31_PLL1_FACTOR_M (0x3 << 0)
#define A31_PLL1_FACTOR_M_SHIFT 0
#define A31_PLL6_LOCK (1 << 28)
#define A31_PLL6_BYPASS_EN (1 << 25)
#define A31_PLL6_CLK_OUT_EN (1 << 24)
#define A31_PLL6_24M_OUT_EN (1 << 18)
#define A31_PLL6_24M_POST_DIV (0x3 << 16)
#define A31_PLL6_24M_POST_DIV_SHIFT 16
#define A31_PLL6_FACTOR_N (0x1f << 8)
#define A31_PLL6_FACTOR_N_SHIFT 8
#define A31_PLL6_FACTOR_K (0x3 << 4)
#define A31_PLL6_FACTOR_K_SHIFT 4
#define A31_PLL6_DEFAULT_N 0x18
#define A31_PLL6_DEFAULT_K 0x1
#define A31_PLL6_TIMEOUT 10
#define A64_PLLHSIC_LOCK (1 << 28)
#define A64_PLLHSIC_FRAC_CLK_OUT (1 << 25)
#define A64_PLLHSIC_PLL_MODE_SEL (1 << 24)
#define A64_PLLHSIC_PLL_SDM_EN (1 << 20)
#define A64_PLLHSIC_FACTOR_N (0x7f << 8)
#define A64_PLLHSIC_FACTOR_N_SHIFT 8
#define A64_PLLHSIC_PRE_DIV_M (0xf << 0)
#define A64_PLLHSIC_PRE_DIV_M_SHIFT 0
#define A80_PLL4_CLK_OUT_EN (1 << 20)
#define A80_PLL4_PLL_DIV2 (1 << 18)
#define A80_PLL4_PLL_DIV1 (1 << 16)
#define A80_PLL4_FACTOR_N (0xff << 8)
#define A80_PLL4_FACTOR_N_SHIFT 8
#define A83T_PLLCPUX_LOCK_TIME (0x7 << 24)
#define A83T_PLLCPUX_LOCK_TIME_SHIFT 24
#define A83T_PLLCPUX_CLOCK_OUTPUT_DIS (1 << 20)
#define A83T_PLLCPUX_OUT_EXT_DIVP (1 << 16)
#define A83T_PLLCPUX_FACTOR_N (0xff << 8)
#define A83T_PLLCPUX_FACTOR_N_SHIFT 8
#define A83T_PLLCPUX_FACTOR_N_MIN 12
#define A83T_PLLCPUX_FACTOR_N_MAX 125
#define A83T_PLLCPUX_POSTDIV_M (0x3 << 0)
#define A83T_PLLCPUX_POSTDIV_M_SHIFT 0
#define H3_PLL2_LOCK (1 << 28)
#define H3_PLL2_SDM_EN (1 << 24)
#define H3_PLL2_POST_DIV (0xf << 16)
#define H3_PLL2_POST_DIV_SHIFT 16
#define H3_PLL2_FACTOR_N (0x7f << 8)
#define H3_PLL2_FACTOR_N_SHIFT 8
#define H3_PLL2_PRE_DIV (0x1f << 0)
#define H3_PLL2_PRE_DIV_SHIFT 0
#define CLKID_A10_PLL5_DDR 0
#define CLKID_A10_PLL5_OTHER 1
#define CLKID_A10_PLL6_SATA 0
#define CLKID_A10_PLL6_OTHER 1
#define CLKID_A10_PLL6 2
#define CLKID_A10_PLL6_DIV_4 3
#define CLKID_A31_PLL6 0
#define CLKID_A31_PLL6_X2 1
struct aw_pll_factor {
unsigned int n;
unsigned int k;
unsigned int m;
unsigned int p;
uint64_t freq;
};
#define PLLFACTOR(_n, _k, _m, _p, _freq) \
{ .n = (_n), .k = (_k), .m = (_m), .p = (_p), .freq = (_freq) }
static struct aw_pll_factor aw_a10_pll1_factors[] = {
PLLFACTOR(6, 0, 0, 0, 144000000),
PLLFACTOR(12, 0, 0, 0, 312000000),
PLLFACTOR(21, 0, 0, 0, 528000000),
PLLFACTOR(29, 0, 0, 0, 720000000),
PLLFACTOR(18, 1, 0, 0, 864000000),
PLLFACTOR(19, 1, 0, 0, 912000000),
PLLFACTOR(20, 1, 0, 0, 960000000),
};
static struct aw_pll_factor aw_a23_pll1_factors[] = {
PLLFACTOR(9, 0, 0, 2, 60000000),
PLLFACTOR(10, 0, 0, 2, 66000000),
PLLFACTOR(11, 0, 0, 2, 72000000),
PLLFACTOR(12, 0, 0, 2, 78000000),
PLLFACTOR(13, 0, 0, 2, 84000000),
PLLFACTOR(14, 0, 0, 2, 90000000),
PLLFACTOR(15, 0, 0, 2, 96000000),
PLLFACTOR(16, 0, 0, 2, 102000000),
PLLFACTOR(17, 0, 0, 2, 108000000),
PLLFACTOR(18, 0, 0, 2, 114000000),
PLLFACTOR(9, 0, 0, 1, 120000000),
PLLFACTOR(10, 0, 0, 1, 132000000),
PLLFACTOR(11, 0, 0, 1, 144000000),
PLLFACTOR(12, 0, 0, 1, 156000000),
PLLFACTOR(13, 0, 0, 1, 168000000),
PLLFACTOR(14, 0, 0, 1, 180000000),
PLLFACTOR(15, 0, 0, 1, 192000000),
PLLFACTOR(16, 0, 0, 1, 204000000),
PLLFACTOR(17, 0, 0, 1, 216000000),
PLLFACTOR(18, 0, 0, 1, 228000000),
PLLFACTOR(9, 0, 0, 0, 240000000),
PLLFACTOR(10, 0, 0, 0, 264000000),
PLLFACTOR(11, 0, 0, 0, 288000000),
PLLFACTOR(12, 0, 0, 0, 312000000),
PLLFACTOR(13, 0, 0, 0, 336000000),
PLLFACTOR(14, 0, 0, 0, 360000000),
PLLFACTOR(15, 0, 0, 0, 384000000),
PLLFACTOR(16, 0, 0, 0, 408000000),
PLLFACTOR(17, 0, 0, 0, 432000000),
PLLFACTOR(18, 0, 0, 0, 456000000),
PLLFACTOR(19, 0, 0, 0, 480000000),
PLLFACTOR(20, 0, 0, 0, 504000000),
PLLFACTOR(21, 0, 0, 0, 528000000),
PLLFACTOR(22, 0, 0, 0, 552000000),
PLLFACTOR(23, 0, 0, 0, 576000000),
PLLFACTOR(24, 0, 0, 0, 600000000),
PLLFACTOR(25, 0, 0, 0, 624000000),
PLLFACTOR(26, 0, 0, 0, 648000000),
PLLFACTOR(27, 0, 0, 0, 672000000),
PLLFACTOR(28, 0, 0, 0, 696000000),
PLLFACTOR(29, 0, 0, 0, 720000000),
PLLFACTOR(15, 1, 0, 0, 768000000),
PLLFACTOR(10, 2, 0, 0, 792000000),
PLLFACTOR(16, 1, 0, 0, 816000000),
PLLFACTOR(17, 1, 0, 0, 864000000),
PLLFACTOR(18, 1, 0, 0, 912000000),
PLLFACTOR(12, 2, 0, 0, 936000000),
PLLFACTOR(19, 1, 0, 0, 960000000),
PLLFACTOR(20, 1, 0, 0, 1008000000),
PLLFACTOR(21, 1, 0, 0, 1056000000),
PLLFACTOR(14, 2, 0, 0, 1080000000),
PLLFACTOR(22, 1, 0, 0, 1104000000),
PLLFACTOR(23, 1, 0, 0, 1152000000),
PLLFACTOR(24, 1, 0, 0, 1200000000),
PLLFACTOR(16, 2, 0, 0, 1224000000),
PLLFACTOR(25, 1, 0, 0, 1248000000),
PLLFACTOR(26, 1, 0, 0, 1296000000),
PLLFACTOR(27, 1, 0, 0, 1344000000),
PLLFACTOR(18, 2, 0, 0, 1368000000),
PLLFACTOR(28, 1, 0, 0, 1392000000),
PLLFACTOR(29, 1, 0, 0, 1440000000),
PLLFACTOR(20, 2, 0, 0, 1512000000),
PLLFACTOR(15, 3, 0, 0, 1536000000),
PLLFACTOR(21, 2, 0, 0, 1584000000),
PLLFACTOR(16, 3, 0, 0, 1632000000),
PLLFACTOR(22, 2, 0, 0, 1656000000),
PLLFACTOR(23, 2, 0, 0, 1728000000),
PLLFACTOR(24, 2, 0, 0, 1800000000),
PLLFACTOR(18, 3, 0, 0, 1824000000),
PLLFACTOR(25, 2, 0, 0, 1872000000),
};
static struct aw_pll_factor aw_h3_pll2_factors[] = {
PLLFACTOR(13, 0, 0, 13, 24576000),
PLLFACTOR(6, 0, 0, 7, 22579200),
};
enum aw_pll_type {
AWPLL_A10_PLL1 = 1,
AWPLL_A10_PLL2,
AWPLL_A10_PLL3,
AWPLL_A10_PLL5,
AWPLL_A10_PLL6,
AWPLL_A13_PLL2,
AWPLL_A23_PLL1,
AWPLL_A31_PLL1,
AWPLL_A31_PLL6,
AWPLL_A64_PLLHSIC,
AWPLL_A80_PLL4,
AWPLL_A83T_PLLCPUX,
AWPLL_H3_PLL1,
AWPLL_H3_PLL2,
};
struct aw_pll_sc {
enum aw_pll_type type;
device_t clkdev;
bus_addr_t reg;
int id;
};
struct aw_pll_funcs {
int (*recalc)(struct aw_pll_sc *, uint64_t *);
int (*set_freq)(struct aw_pll_sc *, uint64_t, uint64_t *, int);
int (*init)(device_t, bus_addr_t, struct clknode_init_def *);
};
#define PLL_READ(sc, val) CLKDEV_READ_4((sc)->clkdev, (sc)->reg, (val))
#define PLL_WRITE(sc, val) CLKDEV_WRITE_4((sc)->clkdev, (sc)->reg, (val))
#define DEVICE_LOCK(sc) CLKDEV_DEVICE_LOCK((sc)->clkdev)
#define DEVICE_UNLOCK(sc) CLKDEV_DEVICE_UNLOCK((sc)->clkdev)
static int
a10_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n;
f = NULL;
for (n = 0; n < nitems(aw_a10_pll1_factors); n++) {
if (aw_a10_pll1_factors[n].freq == *fout) {
f = &aw_a10_pll1_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL1_FACTOR_N|A10_PLL1_FACTOR_K|A10_PLL1_FACTOR_M|
A10_PLL1_OUT_EXT_DIVP);
val |= (f->p << A10_PLL1_OUT_EXT_DIVP_SHIFT);
val |= (f->n << A10_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A10_PLL1_FACTOR_K_SHIFT);
val |= (f->m << A10_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = 1 << ((val & A10_PLL1_OUT_EXT_DIVP) >> A10_PLL1_OUT_EXT_DIVP_SHIFT);
m = ((val & A10_PLL1_FACTOR_M) >> A10_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL1_FACTOR_K) >> A10_PLL1_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL1_FACTOR_N) >> A10_PLL1_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
*freq = (*freq * n * k) / (m * p);
return (0);
}
static int
a10_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, post_div, n, pre_div;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
post_div = (val & A10_PLL2_POST_DIV) >> A10_PLL2_POST_DIV_SHIFT;
if (post_div == 0)
post_div = 1;
n = (val & A10_PLL2_FACTOR_N) >> A10_PLL2_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
pre_div = (val & A10_PLL2_PRE_DIV) >> A10_PLL2_PRE_DIV_SHIFT;
if (pre_div == 0)
pre_div = 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * 2 * n) / pre_div / post_div / 2;
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / pre_div / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / pre_div / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / pre_div;
break;
default:
return (EINVAL);
}
return (0);
}
static int
a10_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, post_div, n, pre_div;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
/*
* Audio Codec needs PLL2-1X to be either 24576000 or 22579200.
*
* PLL2-1X output frequency is (48MHz * n) / pre_div / post_div / 2.
* To get as close as possible to the desired rate, we use a
* pre-divider of 21 and a post-divider of 4. With these values,
* a multiplier of 86 or 79 gets us close to the target rates.
*/
if (*fout != 24576000 && *fout != 22579200)
return (EINVAL);
pre_div = 21;
post_div = 4;
n = (*fout * pre_div * post_div * 2) / (2 * fin);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL2_POST_DIV | A10_PLL2_FACTOR_N | A10_PLL2_PRE_DIV);
val |= (post_div << A10_PLL2_POST_DIV_SHIFT);
val |= (n << A10_PLL2_FACTOR_N_SHIFT);
val |= (pre_div << A10_PLL2_PRE_DIV_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll3_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
if ((val & A10_PLL3_MODE_SEL) == A10_PLL3_MODE_SEL_INT) {
/* In integer mode, output is 3MHz * m */
m = (val & A10_PLL3_FACTOR_M) >> A10_PLL3_FACTOR_M_SHIFT;
*freq = A10_PLL3_REF_FREQ * m;
} else {
/* In fractional mode, output is either 270MHz or 297MHz */
if ((val & A10_PLL3_FUNC_SET) == A10_PLL3_FUNC_SET_270MHZ)
*freq = 270000000;
else
*freq = 297000000;
}
return (0);
}
static int
a10_pll3_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, m, mode, func;
if (*fout == 297000000) {
func = A10_PLL3_FUNC_SET_297MHZ;
mode = A10_PLL3_MODE_SEL_FRACT;
m = 0;
} else if (*fout == 270000000) {
func = A10_PLL3_FUNC_SET_270MHZ;
mode = A10_PLL3_MODE_SEL_FRACT;
m = 0;
} else {
mode = A10_PLL3_MODE_SEL_INT;
func = 0;
m = *fout / A10_PLL3_REF_FREQ;
*fout = m * A10_PLL3_REF_FREQ;
}
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A10_PLL3_MODE_SEL | A10_PLL3_FUNC_SET | A10_PLL3_FACTOR_M);
val |= mode;
val |= func;
val |= (m << A10_PLL3_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a10_pll3_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
/* Allow changing PLL frequency while enabled */
def->flags = CLK_NODE_GLITCH_FREE;
/* Set PLL to 297MHz */
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A10_PLL3_MODE_SEL | A10_PLL3_FUNC_SET | A10_PLL3_FACTOR_M);
val |= A10_PLL3_MODE_SEL_FRACT;
val |= A10_PLL3_FUNC_SET_297MHZ;
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a10_pll5_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = 1 << ((val & A10_PLL5_OUT_EXT_DIVP) >> A10_PLL5_OUT_EXT_DIVP_SHIFT);
m = ((val & A10_PLL5_FACTOR_M) >> A10_PLL5_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL5_FACTOR_K) >> A10_PLL5_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL5_FACTOR_N) >> A10_PLL5_FACTOR_N_SHIFT;
if (n == 0)
return (ENXIO);
switch (sc->id) {
case CLKID_A10_PLL5_DDR:
*freq = (*freq * n * k) / m;
break;
case CLKID_A10_PLL5_OTHER:
*freq = (*freq * n * k) / p;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a10_pll6_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val, m, n, k;
/*
* SATA needs PLL6 to be a 100MHz clock.
*
* The SATA output frequency is (24MHz * n * k) / m / 6.
* To get to 100MHz, k & m must be equal and n must be 25.
*/
m = k = 0;
n = 25;
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A10_PLL6_FACTOR_N | A10_PLL6_FACTOR_K | A10_PLL6_FACTOR_M);
val &= ~A10_PLL6_BYPASS_EN;
val |= A10_PLL6_SATA_CLK_EN;
val |= (n << A10_PLL6_FACTOR_N_SHIFT);
val |= (k << A10_PLL6_FACTOR_K_SHIFT);
val |= (m << A10_PLL6_FACTOR_M_SHIFT);
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a10_pll6_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, k, n;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A10_PLL6_FACTOR_M) >> A10_PLL6_FACTOR_M_SHIFT) + 1;
k = ((val & A10_PLL6_FACTOR_K) >> A10_PLL6_FACTOR_K_SHIFT) + 1;
n = (val & A10_PLL6_FACTOR_N) >> A10_PLL6_FACTOR_N_SHIFT;
if (n == 0)
return (ENXIO);
switch (sc->id) {
case CLKID_A10_PLL6_SATA:
*freq = (*freq * n * k) / m / 6;
break;
case CLKID_A10_PLL6_OTHER:
*freq = (*freq * n * k) / 2;
break;
case CLKID_A10_PLL6:
*freq = (*freq * n * k);
break;
case CLKID_A10_PLL6_DIV_4:
*freq = (*freq * n * k) / 4;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a10_pll6_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
if (sc->id != CLKID_A10_PLL6_SATA)
return (ENXIO);
/* PLL6 SATA output has been set to 100MHz in a10_pll6_init */
if (*fout != 100000000)
return (ERANGE);
return (0);
}
static int
a13_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, post_div, n, pre_div;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
post_div = ((val & A13_PLL2_POST_DIV) >> A13_PLL2_POST_DIV_SHIFT) + 1;
if (post_div == 0)
post_div = 1;
n = (val & A13_PLL2_FACTOR_N) >> A13_PLL2_FACTOR_N_SHIFT;
if (n == 0)
n = 1;
pre_div = ((val & A13_PLL2_PRE_DIV) >> A13_PLL2_PRE_DIV_SHIFT) + 1;
if (pre_div == 0)
pre_div = 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * 2 * n) / pre_div / post_div / 2;
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / pre_div / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / pre_div / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / pre_div;
break;
default:
return (EINVAL);
}
return (0);
}
static int
a13_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val, post_div, n, pre_div;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
/*
* Audio Codec needs PLL2-1X to be either 24576000 or 22579200.
*
* PLL2-1X output frequency is (48MHz * n) / pre_div / post_div / 2.
* To get as close as possible to the desired rate, we use a
* pre-divider of 21 and a post-divider of 4. With these values,
* a multiplier of 86 or 79 gets us close to the target rates.
*/
if (*fout != 24576000 && *fout != 22579200)
return (EINVAL);
pre_div = 21;
post_div = 4;
n = (*fout * pre_div * post_div * 2) / (2 * fin);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A13_PLL2_POST_DIV | A13_PLL2_FACTOR_N | A13_PLL2_PRE_DIV);
val |= ((post_div - 1) << A13_PLL2_POST_DIV_SHIFT);
val |= (n << A13_PLL2_FACTOR_N_SHIFT);
val |= ((pre_div - 1) << A13_PLL2_PRE_DIV_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
h3_pll2_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, p, n, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
p = ((val & H3_PLL2_POST_DIV) >> H3_PLL2_POST_DIV_SHIFT) + 1;
n = ((val & H3_PLL2_FACTOR_N) >> H3_PLL2_FACTOR_N_SHIFT) + 1;
m = ((val & H3_PLL2_PRE_DIV) >> H3_PLL2_PRE_DIV_SHIFT) + 1;
switch (sc->id) {
case SUN4I_A10_PLL2_1X:
*freq = (*freq * n) / (m * p);
break;
case SUN4I_A10_PLL2_2X:
*freq = (*freq * 2 * n) / m / 4;
break;
case SUN4I_A10_PLL2_4X:
*freq = (*freq * 2 * n) / m / 2;
break;
case SUN4I_A10_PLL2_8X:
*freq = (*freq * 2 * n) / m;
break;
default:
return (EINVAL);
}
return (0);
}
static int
h3_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n, error, retry;
if (sc->id != SUN4I_A10_PLL2_1X)
return (ENXIO);
f = NULL;
for (n = 0; n < nitems(aw_h3_pll2_factors); n++) {
if (aw_h3_pll2_factors[n].freq == *fout) {
f = &aw_h3_pll2_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(H3_PLL2_POST_DIV|H3_PLL2_FACTOR_N|H3_PLL2_PRE_DIV);
val |= (f->p << H3_PLL2_POST_DIV_SHIFT);
val |= (f->n << H3_PLL2_FACTOR_N_SHIFT);
val |= (f->m << H3_PLL2_PRE_DIV_SHIFT);
val |= AW_PLL_ENABLE;
PLL_WRITE(sc, val);
/* Wait for lock */
error = 0;
for (retry = 0; retry < 1000; retry++) {
PLL_READ(sc, &val);
if ((val & H3_PLL2_LOCK) != 0)
break;
DELAY(100);
}
if (retry == 0)
error = ETIMEDOUT;
DEVICE_UNLOCK(sc);
return (error);
}
static int
a23_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
uint32_t val;
int n;
f = NULL;
for (n = 0; n < nitems(aw_a23_pll1_factors); n++) {
if (aw_a23_pll1_factors[n].freq == *fout) {
f = &aw_a23_pll1_factors[n];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~(A23_PLL1_FACTOR_N|A23_PLL1_FACTOR_K|A23_PLL1_FACTOR_M|
A23_PLL1_FACTOR_P);
val |= (f->n << A23_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A23_PLL1_FACTOR_K_SHIFT);
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
a23_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A23_PLL1_FACTOR_M) >> A23_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A23_PLL1_FACTOR_K) >> A23_PLL1_FACTOR_K_SHIFT) + 1;
n = ((val & A23_PLL1_FACTOR_N) >> A23_PLL1_FACTOR_N_SHIFT) + 1;
p = ((val & A23_PLL1_FACTOR_P) >> A23_PLL1_FACTOR_P_SHIFT) + 1;
*freq = (*freq * n * k) / (m * p);
return (0);
}
static int
h3_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
struct aw_pll_factor *f;
- uint32_t val, n, k, m, p;
+ uint32_t val, m, p;
int i;
f = NULL;
for (i = 0; i < nitems(aw_a23_pll1_factors); i++) {
if (aw_a23_pll1_factors[i].freq == *fout) {
f = &aw_a23_pll1_factors[i];
break;
}
}
if (f == NULL)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
- n = (val & A23_PLL1_FACTOR_N) >> A23_PLL1_FACTOR_N_SHIFT;
- k = (val & A23_PLL1_FACTOR_K) >> A23_PLL1_FACTOR_K_SHIFT;
m = (val & A23_PLL1_FACTOR_M) >> A23_PLL1_FACTOR_M_SHIFT;
p = (val & A23_PLL1_FACTOR_P) >> A23_PLL1_FACTOR_P_SHIFT;
if (p < f->p) {
val &= ~A23_PLL1_FACTOR_P;
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
if (m < f->m) {
val &= ~A23_PLL1_FACTOR_M;
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
val &= ~(A23_PLL1_FACTOR_N|A23_PLL1_FACTOR_K);
val |= (f->n << A23_PLL1_FACTOR_N_SHIFT);
val |= (f->k << A23_PLL1_FACTOR_K_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
if (m > f->m) {
val &= ~A23_PLL1_FACTOR_M;
val |= (f->m << A23_PLL1_FACTOR_M_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
if (p > f->p) {
val &= ~A23_PLL1_FACTOR_P;
val |= (f->p << A23_PLL1_FACTOR_P_SHIFT);
PLL_WRITE(sc, val);
DELAY(2000);
}
DEVICE_UNLOCK(sc);
return (0);
}
static int
a31_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, m, n, k;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
m = ((val & A31_PLL1_FACTOR_M) >> A31_PLL1_FACTOR_M_SHIFT) + 1;
k = ((val & A31_PLL1_FACTOR_K) >> A31_PLL1_FACTOR_K_SHIFT) + 1;
n = ((val & A31_PLL1_FACTOR_N) >> A31_PLL1_FACTOR_N_SHIFT) + 1;
*freq = (*freq * n * k) / m;
return (0);
}
static int
a31_pll6_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
int retry;
if (def->id != CLKID_A31_PLL6)
return (0);
/*
* The datasheet recommends that PLL6 output should be fixed to
* 600MHz.
*/
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val &= ~(A31_PLL6_FACTOR_N | A31_PLL6_FACTOR_K | A31_PLL6_BYPASS_EN);
val |= (A31_PLL6_DEFAULT_N << A31_PLL6_FACTOR_N_SHIFT);
val |= (A31_PLL6_DEFAULT_K << A31_PLL6_FACTOR_K_SHIFT);
val |= AW_PLL_ENABLE;
CLKDEV_WRITE_4(dev, reg, val);
/* Wait for PLL to become stable */
for (retry = A31_PLL6_TIMEOUT; retry > 0; retry--) {
CLKDEV_READ_4(dev, reg, &val);
if ((val & A31_PLL6_LOCK) == A31_PLL6_LOCK)
break;
DELAY(1);
}
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a31_pll6_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, k, n;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
k = ((val & A10_PLL6_FACTOR_K) >> A10_PLL6_FACTOR_K_SHIFT) + 1;
n = ((val & A10_PLL6_FACTOR_N) >> A10_PLL6_FACTOR_N_SHIFT) + 1;
switch (sc->id) {
case CLKID_A31_PLL6:
*freq = (*freq * n * k) / 2;
break;
case CLKID_A31_PLL6_X2:
*freq = *freq * n * k;
break;
default:
return (ENXIO);
}
return (0);
}
static int
a80_pll4_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, div1, div2;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = (val & A80_PLL4_FACTOR_N) >> A80_PLL4_FACTOR_N_SHIFT;
div1 = (val & A80_PLL4_PLL_DIV1) == 0 ? 1 : 2;
div2 = (val & A80_PLL4_PLL_DIV2) == 0 ? 1 : 2;
*freq = (*freq * n) / div1 / div2;
return (0);
}
static int
a64_pllhsic_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, m;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = ((val & A64_PLLHSIC_FACTOR_N) >> A64_PLLHSIC_FACTOR_N_SHIFT) + 1;
m = ((val & A64_PLLHSIC_PRE_DIV_M) >> A64_PLLHSIC_PRE_DIV_M_SHIFT) + 1;
*freq = (*freq * n) / m;
return (0);
}
static int
a64_pllhsic_init(device_t dev, bus_addr_t reg, struct clknode_init_def *def)
{
uint32_t val;
/*
* PLL_HSIC default is 480MHz, just enable it.
*/
CLKDEV_DEVICE_LOCK(dev);
CLKDEV_READ_4(dev, reg, &val);
val |= AW_PLL_ENABLE;
CLKDEV_WRITE_4(dev, reg, val);
CLKDEV_DEVICE_UNLOCK(dev);
return (0);
}
static int
a83t_pllcpux_recalc(struct aw_pll_sc *sc, uint64_t *freq)
{
uint32_t val, n, p;
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
DEVICE_UNLOCK(sc);
n = (val & A83T_PLLCPUX_FACTOR_N) >> A83T_PLLCPUX_FACTOR_N_SHIFT;
p = (val & A83T_PLLCPUX_OUT_EXT_DIVP) ? 4 : 1;
*freq = (*freq * n) / p;
return (0);
}
static int
a83t_pllcpux_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout,
int flags)
{
uint32_t val;
u_int n;
n = *fout / fin;
if (n < A83T_PLLCPUX_FACTOR_N_MIN || n > A83T_PLLCPUX_FACTOR_N_MAX)
return (EINVAL);
if ((flags & CLK_SET_DRYRUN) != 0)
return (0);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
val &= ~A83T_PLLCPUX_FACTOR_N;
val |= (n << A83T_PLLCPUX_FACTOR_N_SHIFT);
val &= ~A83T_PLLCPUX_CLOCK_OUTPUT_DIS;
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
#define PLL(_type, _recalc, _set_freq, _init) \
[(_type)] = { \
.recalc = (_recalc), \
.set_freq = (_set_freq), \
.init = (_init) \
}
static struct aw_pll_funcs aw_pll_func[] = {
PLL(AWPLL_A10_PLL1, a10_pll1_recalc, a10_pll1_set_freq, NULL),
PLL(AWPLL_A10_PLL2, a10_pll2_recalc, a10_pll2_set_freq, NULL),
PLL(AWPLL_A10_PLL3, a10_pll3_recalc, a10_pll3_set_freq, a10_pll3_init),
PLL(AWPLL_A10_PLL5, a10_pll5_recalc, NULL, NULL),
PLL(AWPLL_A10_PLL6, a10_pll6_recalc, a10_pll6_set_freq, a10_pll6_init),
PLL(AWPLL_A13_PLL2, a13_pll2_recalc, a13_pll2_set_freq, NULL),
PLL(AWPLL_A23_PLL1, a23_pll1_recalc, a23_pll1_set_freq, NULL),
PLL(AWPLL_A31_PLL1, a31_pll1_recalc, NULL, NULL),
PLL(AWPLL_A31_PLL6, a31_pll6_recalc, NULL, a31_pll6_init),
PLL(AWPLL_A80_PLL4, a80_pll4_recalc, NULL, NULL),
PLL(AWPLL_A83T_PLLCPUX, a83t_pllcpux_recalc, a83t_pllcpux_set_freq, NULL),
PLL(AWPLL_A64_PLLHSIC, a64_pllhsic_recalc, NULL, a64_pllhsic_init),
PLL(AWPLL_H3_PLL1, a23_pll1_recalc, h3_pll1_set_freq, NULL),
PLL(AWPLL_H3_PLL2, h3_pll2_recalc, h3_pll2_set_freq, NULL),
};
static struct ofw_compat_data compat_data[] = {
{ "allwinner,sun4i-a10-pll1-clk", AWPLL_A10_PLL1 },
{ "allwinner,sun4i-a10-pll2-clk", AWPLL_A10_PLL2 },
{ "allwinner,sun4i-a10-pll3-clk", AWPLL_A10_PLL3 },
{ "allwinner,sun4i-a10-pll5-clk", AWPLL_A10_PLL5 },
{ "allwinner,sun4i-a10-pll6-clk", AWPLL_A10_PLL6 },
{ "allwinner,sun5i-a13-pll2-clk", AWPLL_A13_PLL2 },
{ "allwinner,sun6i-a31-pll1-clk", AWPLL_A31_PLL1 },
{ "allwinner,sun6i-a31-pll6-clk", AWPLL_A31_PLL6 },
{ "allwinner,sun8i-a23-pll1-clk", AWPLL_A23_PLL1 },
{ "allwinner,sun8i-a83t-pllcpux-clk", AWPLL_A83T_PLLCPUX },
{ "allwinner,sun8i-h3-pll1-clk", AWPLL_H3_PLL1 },
{ "allwinner,sun8i-h3-pll2-clk", AWPLL_H3_PLL2 },
{ "allwinner,sun9i-a80-pll4-clk", AWPLL_A80_PLL4 },
{ "allwinner,sun50i-a64-pllhsic-clk", AWPLL_A64_PLLHSIC },
{ NULL, 0 }
};
static int
aw_pll_init(struct clknode *clk, device_t dev)
{
clknode_init_parent_idx(clk, 0);
return (0);
}
static int
aw_pll_set_gate(struct clknode *clk, bool enable)
{
struct aw_pll_sc *sc;
uint32_t val;
sc = clknode_get_softc(clk);
DEVICE_LOCK(sc);
PLL_READ(sc, &val);
if (enable)
val |= AW_PLL_ENABLE;
else
val &= ~AW_PLL_ENABLE;
PLL_WRITE(sc, val);
DEVICE_UNLOCK(sc);
return (0);
}
static int
aw_pll_recalc(struct clknode *clk, uint64_t *freq)
{
struct aw_pll_sc *sc;
sc = clknode_get_softc(clk);
if (aw_pll_func[sc->type].recalc == NULL)
return (ENXIO);
return (aw_pll_func[sc->type].recalc(sc, freq));
}
static int
aw_pll_set_freq(struct clknode *clk, uint64_t fin, uint64_t *fout,
int flags, int *stop)
{
struct aw_pll_sc *sc;
sc = clknode_get_softc(clk);
*stop = 1;
if (aw_pll_func[sc->type].set_freq == NULL)
return (ENXIO);
return (aw_pll_func[sc->type].set_freq(sc, fin, fout, flags));
}
static clknode_method_t aw_pll_clknode_methods[] = {
/* Device interface */
CLKNODEMETHOD(clknode_init, aw_pll_init),
CLKNODEMETHOD(clknode_set_gate, aw_pll_set_gate),
CLKNODEMETHOD(clknode_recalc_freq, aw_pll_recalc),
CLKNODEMETHOD(clknode_set_freq, aw_pll_set_freq),
CLKNODEMETHOD_END
};
DEFINE_CLASS_1(aw_pll_clknode, aw_pll_clknode_class, aw_pll_clknode_methods,
sizeof(struct aw_pll_sc), clknode_class);
static int
aw_pll_create(device_t dev, bus_addr_t paddr, struct clkdom *clkdom,
const char *pclkname, const char *clkname, int index)
{
enum aw_pll_type type;
struct clknode_init_def clkdef;
struct aw_pll_sc *sc;
struct clknode *clk;
int error;
type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
memset(&clkdef, 0, sizeof(clkdef));
clkdef.id = index;
clkdef.name = clkname;
if (pclkname != NULL) {
clkdef.parent_names = malloc(sizeof(char *), M_OFWPROP,
M_WAITOK);
clkdef.parent_names[0] = pclkname;
clkdef.parent_cnt = 1;
} else
clkdef.parent_cnt = 0;
if (aw_pll_func[type].init != NULL) {
error = aw_pll_func[type].init(device_get_parent(dev),
paddr, &clkdef);
if (error != 0) {
device_printf(dev, "clock %s init failed\n", clkname);
return (error);
}
}
clk = clknode_create(clkdom, &aw_pll_clknode_class, &clkdef);
if (clk == NULL) {
device_printf(dev, "cannot create clock node\n");
return (ENXIO);
}
sc = clknode_get_softc(clk);
sc->clkdev = device_get_parent(dev);
sc->reg = paddr;
sc->type = type;
sc->id = clkdef.id;
clknode_register(clkdom, clk);
OF_prop_free(__DECONST(char *, clkdef.parent_names));
return (0);
}
static int
aw_pll_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "Allwinner PLL Clock");
return (BUS_PROBE_DEFAULT);
}
static int
aw_pll_attach(device_t dev)
{
struct clkdom *clkdom;
const char **names;
int index, nout, error;
clk_t clk_parent;
uint32_t *indices;
bus_addr_t paddr;
bus_size_t psize;
phandle_t node;
node = ofw_bus_get_node(dev);
if (ofw_reg_to_paddr(node, 0, &paddr, &psize, NULL) != 0) {
device_printf(dev, "couldn't parse 'reg' property\n");
return (ENXIO);
}
clkdom = clkdom_create(dev);
nout = clk_parse_ofw_out_names(dev, node, &names, &indices);
if (nout == 0) {
device_printf(dev, "no clock outputs found\n");
error = ENOENT;
goto fail;
}
if (clk_get_by_ofw_index(dev, 0, 0, &clk_parent) != 0)
clk_parent = NULL;
for (index = 0; index < nout; index++) {
error = aw_pll_create(dev, paddr, clkdom,
clk_parent ? clk_get_name(clk_parent) : NULL,
names[index], nout == 1 ? 1 : index);
if (error)
goto fail;
}
if (clkdom_finit(clkdom) != 0) {
device_printf(dev, "cannot finalize clkdom initialization\n");
error = ENXIO;
goto fail;
}
if (bootverbose)
clkdom_dump(clkdom);
return (0);
fail:
return (error);
}
static device_method_t aw_pll_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, aw_pll_probe),
DEVMETHOD(device_attach, aw_pll_attach),
DEVMETHOD_END
};
static driver_t aw_pll_driver = {
"aw_pll",
aw_pll_methods,
0,
};
static devclass_t aw_pll_devclass;
EARLY_DRIVER_MODULE(aw_pll, simplebus, aw_pll_driver,
aw_pll_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
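
For context on the PLL2 code above: the driver fixes pre_div at 21 and post_div at 4 and solves only for the multiplier n, so the audio rates are close approximations rather than exact. A quick standalone check of that arithmetic follows, assuming the usual 24 MHz parent oscillator (the real parent frequency comes from the device tree):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t fin = 24000000;	/* assumed 24 MHz parent clock */
	const uint64_t targets[] = { 24576000, 22579200 };
	const uint32_t pre_div = 21, post_div = 4;
	int i;

	for (i = 0; i < 2; i++) {
		/* n as computed in a10_pll2_set_freq(). */
		uint32_t n = (targets[i] * pre_div * post_div * 2) / (2 * fin);
		/* PLL2-1X output as computed in a10_pll2_recalc(). */
		uint64_t fout = (fin * 2 * n) / pre_div / post_div / 2;

		printf("target %llu Hz: n=%u -> %llu Hz\n",
		    (unsigned long long)targets[i], n,
		    (unsigned long long)fout);
	}
	return (0);
}

With n = 86 the 1X output lands at about 24.571 MHz for the 24.576 MHz target, and n = 79 gives about 22.571 MHz for 22.5792 MHz, matching the "gets us close to the target rates" comment in the driver.
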
Index: head/sys/arm/allwinner/if_awg.c
===================================================================
--- head/sys/arm/allwinner/if_awg.c (revision 327172)
+++ head/sys/arm/allwinner/if_awg.c (revision 327173)
@@ -1,1816 +1,1812 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Allwinner Gigabit Ethernet MAC (EMAC) controller
*/
#include "opt_device_polling.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/kernel.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/module.h>
#include <sys/taskqueue.h>
#include <sys/gpio.h>
#include <net/bpf.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/allwinner/if_awgreg.h>
#include <arm/allwinner/aw_sid.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <dev/extres/clk/clk.h>
#include <dev/extres/hwreset/hwreset.h>
#include <dev/extres/regulator/regulator.h>
#include "miibus_if.h"
#include "gpio_if.h"
#define RD4(sc, reg) bus_read_4((sc)->res[_RES_EMAC], (reg))
#define WR4(sc, reg, val) bus_write_4((sc)->res[_RES_EMAC], (reg), (val))
#define AWG_LOCK(sc) mtx_lock(&(sc)->mtx)
#define AWG_UNLOCK(sc) mtx_unlock(&(sc)->mtx)
#define AWG_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED)
#define AWG_ASSERT_UNLOCKED(sc) mtx_assert(&(sc)->mtx, MA_NOTOWNED)
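/*
* Descriptor ring geometry. The AND-based wrap-around in TX_NEXT,
* TX_SKIP and RX_NEXT below relies on TX_DESC_COUNT and RX_DESC_COUNT
* being powers of two.
*/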
#define DESC_ALIGN 4
#define TX_DESC_COUNT 1024
#define TX_DESC_SIZE (sizeof(struct emac_desc) * TX_DESC_COUNT)
#define RX_DESC_COUNT 256
#define RX_DESC_SIZE (sizeof(struct emac_desc) * RX_DESC_COUNT)
#define DESC_OFF(n) ((n) * sizeof(struct emac_desc))
#define TX_NEXT(n) (((n) + 1) & (TX_DESC_COUNT - 1))
#define TX_SKIP(n, o) (((n) + (o)) & (TX_DESC_COUNT - 1))
#define RX_NEXT(n) (((n) + 1) & (RX_DESC_COUNT - 1))
#define TX_MAX_SEGS 20
#define SOFT_RST_RETRY 1000
#define MII_BUSY_RETRY 1000
#define MDIO_FREQ 2500000
#define BURST_LEN_DEFAULT 8
#define RX_TX_PRI_DEFAULT 0
#define PAUSE_TIME_DEFAULT 0x400
#define TX_INTERVAL_DEFAULT 64
#define RX_BATCH_DEFAULT 64
/* syscon EMAC clock register */
#define EMAC_CLK_EPHY_ADDR (0x1f << 20) /* H3 */
#define EMAC_CLK_EPHY_ADDR_SHIFT 20
#define EMAC_CLK_EPHY_LED_POL (1 << 17) /* H3 */
#define EMAC_CLK_EPHY_SHUTDOWN (1 << 16) /* H3 */
#define EMAC_CLK_EPHY_SELECT (1 << 15) /* H3 */
#define EMAC_CLK_RMII_EN (1 << 13)
#define EMAC_CLK_ETXDC (0x7 << 10)
#define EMAC_CLK_ETXDC_SHIFT 10
#define EMAC_CLK_ERXDC (0x1f << 5)
#define EMAC_CLK_ERXDC_SHIFT 5
#define EMAC_CLK_PIT (0x1 << 2)
#define EMAC_CLK_PIT_MII (0 << 2)
#define EMAC_CLK_PIT_RGMII (1 << 2)
#define EMAC_CLK_SRC (0x3 << 0)
#define EMAC_CLK_SRC_MII (0 << 0)
#define EMAC_CLK_SRC_EXT_RGMII (1 << 0)
#define EMAC_CLK_SRC_RGMII (2 << 0)
/* Burst length of RX and TX DMA transfers */
static int awg_burst_len = BURST_LEN_DEFAULT;
TUNABLE_INT("hw.awg.burst_len", &awg_burst_len);
/* RX / TX DMA priority. If 1, RX DMA has priority over TX DMA. */
static int awg_rx_tx_pri = RX_TX_PRI_DEFAULT;
TUNABLE_INT("hw.awg.rx_tx_pri", &awg_rx_tx_pri);
/* Pause time field in the transmitted control frame */
static int awg_pause_time = PAUSE_TIME_DEFAULT;
TUNABLE_INT("hw.awg.pause_time", &awg_pause_time);
/* Request a TX interrupt every <n> descriptors */
static int awg_tx_interval = TX_INTERVAL_DEFAULT;
TUNABLE_INT("hw.awg.tx_interval", &awg_tx_interval);
/* Maximum number of mbufs to send to if_input */
static int awg_rx_batch = RX_BATCH_DEFAULT;
TUNABLE_INT("hw.awg.rx_batch", &awg_rx_batch);
enum awg_type {
EMAC_A83T = 1,
EMAC_H3,
EMAC_A64,
};
static struct ofw_compat_data compat_data[] = {
{ "allwinner,sun8i-a83t-emac", EMAC_A83T },
{ "allwinner,sun8i-h3-emac", EMAC_H3 },
{ "allwinner,sun50i-a64-emac", EMAC_A64 },
{ NULL, 0 }
};
struct awg_bufmap {
bus_dmamap_t map;
struct mbuf *mbuf;
};
struct awg_txring {
bus_dma_tag_t desc_tag;
bus_dmamap_t desc_map;
struct emac_desc *desc_ring;
bus_addr_t desc_ring_paddr;
bus_dma_tag_t buf_tag;
struct awg_bufmap buf_map[TX_DESC_COUNT];
u_int cur, next, queued;
u_int segs;
};
struct awg_rxring {
bus_dma_tag_t desc_tag;
bus_dmamap_t desc_map;
struct emac_desc *desc_ring;
bus_addr_t desc_ring_paddr;
bus_dma_tag_t buf_tag;
struct awg_bufmap buf_map[RX_DESC_COUNT];
bus_dmamap_t buf_spare_map;
u_int cur;
};
enum {
_RES_EMAC,
_RES_IRQ,
_RES_SYSCON,
_RES_NITEMS
};
struct awg_softc {
struct resource *res[_RES_NITEMS];
struct mtx mtx;
if_t ifp;
device_t dev;
device_t miibus;
struct callout stat_ch;
struct task link_task;
void *ih;
u_int mdc_div_ratio_m;
int link;
int if_flags;
enum awg_type type;
struct awg_txring tx;
struct awg_rxring rx;
};
static struct resource_spec awg_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ SYS_RES_MEMORY, 1, RF_ACTIVE | RF_OPTIONAL },
{ -1, 0 }
};
static void awg_txeof(struct awg_softc *sc);
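/*
* MDIO access: the PHY and register addresses are written to
* EMAC_MII_CMD with MII_BUSY set, then the busy bit is polled until
* the controller clears it. A read that times out returns 0.
*/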
static int
awg_miibus_readreg(device_t dev, int phy, int reg)
{
struct awg_softc *sc;
int retry, val;
sc = device_get_softc(dev);
val = 0;
WR4(sc, EMAC_MII_CMD,
(sc->mdc_div_ratio_m << MDC_DIV_RATIO_M_SHIFT) |
(phy << PHY_ADDR_SHIFT) |
(reg << PHY_REG_ADDR_SHIFT) |
MII_BUSY);
for (retry = MII_BUSY_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_MII_CMD) & MII_BUSY) == 0) {
val = RD4(sc, EMAC_MII_DATA);
break;
}
DELAY(10);
}
if (retry == 0)
device_printf(dev, "phy read timeout, phy=%d reg=%d\n",
phy, reg);
return (val);
}
static int
awg_miibus_writereg(device_t dev, int phy, int reg, int val)
{
struct awg_softc *sc;
int retry;
sc = device_get_softc(dev);
WR4(sc, EMAC_MII_DATA, val);
WR4(sc, EMAC_MII_CMD,
(sc->mdc_div_ratio_m << MDC_DIV_RATIO_M_SHIFT) |
(phy << PHY_ADDR_SHIFT) |
(reg << PHY_REG_ADDR_SHIFT) |
MII_WR | MII_BUSY);
for (retry = MII_BUSY_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_MII_CMD) & MII_BUSY) == 0)
break;
DELAY(10);
}
if (retry == 0)
device_printf(dev, "phy write timeout, phy=%d reg=%d\n",
phy, reg);
return (0);
}
static void
awg_update_link_locked(struct awg_softc *sc)
{
struct mii_data *mii;
uint32_t val;
AWG_ASSERT_LOCKED(sc);
if ((if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING) == 0)
return;
mii = device_get_softc(sc->miibus);
if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) ==
(IFM_ACTIVE | IFM_AVALID)) {
switch (IFM_SUBTYPE(mii->mii_media_active)) {
case IFM_1000_T:
case IFM_1000_SX:
case IFM_100_TX:
case IFM_10_T:
sc->link = 1;
break;
default:
sc->link = 0;
break;
}
} else
sc->link = 0;
if (sc->link == 0)
return;
val = RD4(sc, EMAC_BASIC_CTL_0);
val &= ~(BASIC_CTL_SPEED | BASIC_CTL_DUPLEX);
if (IFM_SUBTYPE(mii->mii_media_active) == IFM_1000_T ||
IFM_SUBTYPE(mii->mii_media_active) == IFM_1000_SX)
val |= BASIC_CTL_SPEED_1000 << BASIC_CTL_SPEED_SHIFT;
else if (IFM_SUBTYPE(mii->mii_media_active) == IFM_100_TX)
val |= BASIC_CTL_SPEED_100 << BASIC_CTL_SPEED_SHIFT;
else
val |= BASIC_CTL_SPEED_10 << BASIC_CTL_SPEED_SHIFT;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0)
val |= BASIC_CTL_DUPLEX;
WR4(sc, EMAC_BASIC_CTL_0, val);
val = RD4(sc, EMAC_RX_CTL_0);
val &= ~RX_FLOW_CTL_EN;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_RXPAUSE) != 0)
val |= RX_FLOW_CTL_EN;
WR4(sc, EMAC_RX_CTL_0, val);
val = RD4(sc, EMAC_TX_FLOW_CTL);
val &= ~(PAUSE_TIME|TX_FLOW_CTL_EN);
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_TXPAUSE) != 0)
val |= TX_FLOW_CTL_EN;
if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0)
val |= awg_pause_time << PAUSE_TIME_SHIFT;
WR4(sc, EMAC_TX_FLOW_CTL, val);
}
static void
awg_link_task(void *arg, int pending)
{
struct awg_softc *sc;
sc = arg;
AWG_LOCK(sc);
awg_update_link_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_miibus_statchg(device_t dev)
{
struct awg_softc *sc;
sc = device_get_softc(dev);
taskqueue_enqueue(taskqueue_swi, &sc->link_task);
}
static void
awg_media_status(if_t ifp, struct ifmediareq *ifmr)
{
struct awg_softc *sc;
struct mii_data *mii;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
AWG_LOCK(sc);
mii_pollstat(mii);
ifmr->ifm_active = mii->mii_media_active;
ifmr->ifm_status = mii->mii_media_status;
AWG_UNLOCK(sc);
}
static int
awg_media_change(if_t ifp)
{
struct awg_softc *sc;
struct mii_data *mii;
int error;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
AWG_LOCK(sc);
error = mii_mediachg(mii);
AWG_UNLOCK(sc);
return (error);
}
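/*
* Map an mbuf chain and build its TX descriptor chain. The owner bit
* (TX_DESC_CTL) of the first descriptor is written last so the DMA
* engine never sees a partially constructed chain.
*/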
static int
awg_encap(struct awg_softc *sc, struct mbuf **mp)
{
bus_dmamap_t map;
bus_dma_segment_t segs[TX_MAX_SEGS];
int error, nsegs, cur, first, last, i;
u_int csum_flags;
uint32_t flags, status;
struct mbuf *m;
cur = first = sc->tx.cur;
map = sc->tx.buf_map[first].map;
m = *mp;
error = bus_dmamap_load_mbuf_sg(sc->tx.buf_tag, map, m, segs,
&nsegs, BUS_DMA_NOWAIT);
if (error == EFBIG) {
m = m_collapse(m, M_NOWAIT, TX_MAX_SEGS);
if (m == NULL) {
device_printf(sc->dev, "awg_encap: m_collapse failed\n");
m_freem(*mp);
*mp = NULL;
return (ENOMEM);
}
*mp = m;
error = bus_dmamap_load_mbuf_sg(sc->tx.buf_tag, map, m,
segs, &nsegs, BUS_DMA_NOWAIT);
if (error != 0) {
m_freem(*mp);
*mp = NULL;
}
}
if (error != 0) {
device_printf(sc->dev, "awg_encap: bus_dmamap_load_mbuf_sg failed\n");
return (error);
}
if (nsegs == 0) {
m_freem(*mp);
*mp = NULL;
return (EIO);
}
if (sc->tx.queued + nsegs > TX_DESC_COUNT) {
bus_dmamap_unload(sc->tx.buf_tag, map);
return (ENOBUFS);
}
bus_dmamap_sync(sc->tx.buf_tag, map, BUS_DMASYNC_PREWRITE);
flags = TX_FIR_DESC;
status = 0;
if ((m->m_pkthdr.csum_flags & CSUM_IP) != 0) {
if ((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_UDP)) != 0)
csum_flags = TX_CHECKSUM_CTL_FULL;
else
csum_flags = TX_CHECKSUM_CTL_IP;
flags |= (csum_flags << TX_CHECKSUM_CTL_SHIFT);
}
for (i = 0; i < nsegs; i++) {
sc->tx.segs++;
if (i == nsegs - 1) {
flags |= TX_LAST_DESC;
/*
* Can only request TX completion
* interrupt on last descriptor.
*/
if (sc->tx.segs >= awg_tx_interval) {
sc->tx.segs = 0;
flags |= TX_INT_CTL;
}
}
sc->tx.desc_ring[cur].addr = htole32((uint32_t)segs[i].ds_addr);
sc->tx.desc_ring[cur].size = htole32(flags | segs[i].ds_len);
sc->tx.desc_ring[cur].status = htole32(status);
flags &= ~TX_FIR_DESC;
/*
* Setting of the valid bit in the first descriptor is
* deferred until the whole chain is fully set up.
*/
status = TX_DESC_CTL;
++sc->tx.queued;
cur = TX_NEXT(cur);
}
sc->tx.cur = cur;
/*
* Store the DMA map and mbuf in the last descriptor's slot; the first
* slot takes over the last slot's unused map so every slot still owns
* exactly one map.
*/
last = TX_SKIP(cur, TX_DESC_COUNT - 1);
sc->tx.buf_map[first].map = sc->tx.buf_map[last].map;
sc->tx.buf_map[last].map = map;
sc->tx.buf_map[last].mbuf = m;
/*
* The whole mbuf chain has been DMA mapped,
* fix the first descriptor.
*/
sc->tx.desc_ring[first].status = htole32(TX_DESC_CTL);
return (0);
}
static void
awg_clean_txbuf(struct awg_softc *sc, int index)
{
struct awg_bufmap *bmap;
--sc->tx.queued;
bmap = &sc->tx.buf_map[index];
if (bmap->mbuf != NULL) {
bus_dmamap_sync(sc->tx.buf_tag, bmap->map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->tx.buf_tag, bmap->map);
m_freem(bmap->mbuf);
bmap->mbuf = NULL;
}
}
static void
awg_setup_rxdesc(struct awg_softc *sc, int index, bus_addr_t paddr)
{
uint32_t status, size;
status = RX_DESC_CTL;
size = MCLBYTES - 1;
sc->rx.desc_ring[index].addr = htole32((uint32_t)paddr);
sc->rx.desc_ring[index].size = htole32(size);
sc->rx.desc_ring[index].status = htole32(status);
}
static void
awg_reuse_rxdesc(struct awg_softc *sc, int index)
{
sc->rx.desc_ring[index].status = htole32(RX_DESC_CTL);
}
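/*
* Attach a fresh mbuf cluster to an RX slot. The replacement buffer is
* loaded into the spare DMA map first so the old mapping is only torn
* down once the new one has succeeded; the two maps are then swapped.
*/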
static int
awg_newbuf_rx(struct awg_softc *sc, int index)
{
struct mbuf *m;
bus_dma_segment_t seg;
bus_dmamap_t map;
int nsegs;
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
return (ENOBUFS);
m->m_pkthdr.len = m->m_len = m->m_ext.ext_size;
m_adj(m, ETHER_ALIGN);
if (bus_dmamap_load_mbuf_sg(sc->rx.buf_tag, sc->rx.buf_spare_map,
m, &seg, &nsegs, BUS_DMA_NOWAIT) != 0) {
m_freem(m);
return (ENOBUFS);
}
if (sc->rx.buf_map[index].mbuf != NULL) {
bus_dmamap_sync(sc->rx.buf_tag, sc->rx.buf_map[index].map,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->rx.buf_tag, sc->rx.buf_map[index].map);
}
map = sc->rx.buf_map[index].map;
sc->rx.buf_map[index].map = sc->rx.buf_spare_map;
sc->rx.buf_spare_map = map;
bus_dmamap_sync(sc->rx.buf_tag, sc->rx.buf_map[index].map,
BUS_DMASYNC_PREREAD);
sc->rx.buf_map[index].mbuf = m;
awg_setup_rxdesc(sc, index, seg.ds_addr);
return (0);
}
static void
awg_start_locked(struct awg_softc *sc)
{
struct mbuf *m;
uint32_t val;
if_t ifp;
int cnt, err;
AWG_ASSERT_LOCKED(sc);
if (!sc->link)
return;
ifp = sc->ifp;
if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING)
return;
for (cnt = 0; ; cnt++) {
m = if_dequeue(ifp);
if (m == NULL)
break;
err = awg_encap(sc, &m);
if (err != 0) {
if (err == ENOBUFS)
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
if (m != NULL)
if_sendq_prepend(ifp, m);
break;
}
if_bpfmtap(ifp, m);
}
if (cnt != 0) {
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
/* Start and run TX DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val | TX_DMA_START);
}
}
static void
awg_start(if_t ifp)
{
struct awg_softc *sc;
sc = if_getsoftc(ifp);
AWG_LOCK(sc);
awg_start_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_tick(void *softc)
{
struct awg_softc *sc;
struct mii_data *mii;
if_t ifp;
int link;
sc = softc;
ifp = sc->ifp;
mii = device_get_softc(sc->miibus);
AWG_ASSERT_LOCKED(sc);
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
return;
link = sc->link;
mii_tick(mii);
if (sc->link && !link)
awg_start_locked(sc);
callout_reset(&sc->stat_ch, hz, awg_tick, sc);
}
/* Bit Reversal - http://aggregate.org/MAGIC/#Bit%20Reversal */
static uint32_t
bitrev32(uint32_t x)
{
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
return (x >> 16) | (x << 16);
}
static void
awg_setup_rxfilter(struct awg_softc *sc)
{
uint32_t val, crc, hashreg, hashbit, hash[2], machi, maclo;
int mc_count, mcnt, i;
uint8_t *eaddr, *mta;
if_t ifp;
AWG_ASSERT_LOCKED(sc);
ifp = sc->ifp;
val = 0;
hash[0] = hash[1] = 0;
mc_count = if_multiaddr_count(ifp, -1);
if (if_getflags(ifp) & IFF_PROMISC)
val |= DIS_ADDR_FILTER;
else if (if_getflags(ifp) & IFF_ALLMULTI) {
val |= RX_ALL_MULTICAST;
hash[0] = hash[1] = ~0;
} else if (mc_count > 0) {
val |= HASH_MULTICAST;
mta = malloc(sizeof(unsigned char) * ETHER_ADDR_LEN * mc_count,
M_DEVBUF, M_NOWAIT);
if (mta == NULL) {
if_printf(ifp,
"failed to allocate temporary multicast list\n");
return;
}
if_multiaddr_array(ifp, mta, &mcnt, mc_count);
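/*
* Map each multicast address to one bit of the 64-bit hash filter:
* the CRC32 of the address is reduced to a 6-bit index whose upper
* bit selects the hash register and whose lower five bits select the
* bit position within it.
*/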
for (i = 0; i < mcnt; i++) {
crc = ether_crc32_le(mta + (i * ETHER_ADDR_LEN),
ETHER_ADDR_LEN) & 0x7f;
crc = bitrev32(~crc) >> 26;
hashreg = (crc >> 5);
hashbit = (crc & 0x1f);
hash[hashreg] |= (1 << hashbit);
}
free(mta, M_DEVBUF);
}
/* Write our unicast address */
eaddr = IF_LLADDR(ifp);
machi = (eaddr[5] << 8) | eaddr[4];
maclo = (eaddr[3] << 24) | (eaddr[2] << 16) | (eaddr[1] << 8) |
(eaddr[0] << 0);
WR4(sc, EMAC_ADDR_HIGH(0), machi);
WR4(sc, EMAC_ADDR_LOW(0), maclo);
/* Multicast hash filters */
WR4(sc, EMAC_RX_HASH_0, hash[1]);
WR4(sc, EMAC_RX_HASH_1, hash[0]);
/* RX frame filter config */
WR4(sc, EMAC_RX_FRM_FLT, val);
}
static void
awg_enable_intr(struct awg_softc *sc)
{
/* Enable interrupts */
WR4(sc, EMAC_INT_EN, RX_INT_EN | TX_INT_EN | TX_BUF_UA_INT_EN);
}
static void
awg_disable_intr(struct awg_softc *sc)
{
/* Disable interrupts */
WR4(sc, EMAC_INT_EN, 0);
}
static void
awg_init_locked(struct awg_softc *sc)
{
struct mii_data *mii;
uint32_t val;
if_t ifp;
mii = device_get_softc(sc->miibus);
ifp = sc->ifp;
AWG_ASSERT_LOCKED(sc);
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
return;
awg_setup_rxfilter(sc);
/* Configure DMA burst length and priorities */
val = awg_burst_len << BASIC_CTL_BURST_LEN_SHIFT;
if (awg_rx_tx_pri)
val |= BASIC_CTL_RX_TX_PRI;
WR4(sc, EMAC_BASIC_CTL_1, val);
/* Enable interrupts */
#ifdef DEVICE_POLLING
if ((if_getcapenable(ifp) & IFCAP_POLLING) == 0)
awg_enable_intr(sc);
else
awg_disable_intr(sc);
#else
awg_enable_intr(sc);
#endif
/* Enable transmit DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val | TX_DMA_EN | TX_MD | TX_NEXT_FRAME);
/* Enable receive DMA */
val = RD4(sc, EMAC_RX_CTL_1);
WR4(sc, EMAC_RX_CTL_1, val | RX_DMA_EN | RX_MD);
/* Enable transmitter */
val = RD4(sc, EMAC_TX_CTL_0);
WR4(sc, EMAC_TX_CTL_0, val | TX_EN);
/* Enable receiver */
val = RD4(sc, EMAC_RX_CTL_0);
WR4(sc, EMAC_RX_CTL_0, val | RX_EN | CHECK_CRC);
if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
mii_mediachg(mii);
callout_reset(&sc->stat_ch, hz, awg_tick, sc);
}
static void
awg_init(void *softc)
{
struct awg_softc *sc;
sc = softc;
AWG_LOCK(sc);
awg_init_locked(sc);
AWG_UNLOCK(sc);
}
static void
awg_stop(struct awg_softc *sc)
{
if_t ifp;
uint32_t val;
int i;
AWG_ASSERT_LOCKED(sc);
ifp = sc->ifp;
callout_stop(&sc->stat_ch);
/* Stop transmit DMA and flush data in the TX FIFO */
val = RD4(sc, EMAC_TX_CTL_1);
val &= ~TX_DMA_EN;
val |= FLUSH_TX_FIFO;
WR4(sc, EMAC_TX_CTL_1, val);
/* Disable transmitter */
val = RD4(sc, EMAC_TX_CTL_0);
WR4(sc, EMAC_TX_CTL_0, val & ~TX_EN);
/* Disable receiver */
val = RD4(sc, EMAC_RX_CTL_0);
WR4(sc, EMAC_RX_CTL_0, val & ~RX_EN);
/* Disable interrupts */
awg_disable_intr(sc);
/* Disable transmit DMA */
val = RD4(sc, EMAC_TX_CTL_1);
WR4(sc, EMAC_TX_CTL_1, val & ~TX_DMA_EN);
/* Disable receive DMA */
val = RD4(sc, EMAC_RX_CTL_1);
WR4(sc, EMAC_RX_CTL_1, val & ~RX_DMA_EN);
sc->link = 0;
/* Finish handling transmitted buffers */
awg_txeof(sc);
/* Release any untransmitted buffers. */
for (i = sc->tx.next; sc->tx.queued > 0; i = TX_NEXT(i)) {
val = le32toh(sc->tx.desc_ring[i].status);
if ((val & TX_DESC_CTL) != 0)
break;
awg_clean_txbuf(sc, i);
}
sc->tx.next = i;
for (; sc->tx.queued > 0; i = TX_NEXT(i)) {
sc->tx.desc_ring[i].status = 0;
awg_clean_txbuf(sc, i);
}
sc->tx.cur = sc->tx.next;
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
/* Setup RX buffers for reuse */
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
for (i = sc->rx.cur; ; i = RX_NEXT(i)) {
val = le32toh(sc->rx.desc_ring[i].status);
if ((val & RX_DESC_CTL) != 0)
break;
awg_reuse_rxdesc(sc, i);
}
sc->rx.cur = i;
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
}
static int
awg_rxintr(struct awg_softc *sc)
{
if_t ifp;
struct mbuf *m, *mh, *mt;
int error, index, len, cnt, npkt;
uint32_t status;
ifp = sc->ifp;
mh = mt = NULL;
cnt = 0;
npkt = 0;
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
for (index = sc->rx.cur; ; index = RX_NEXT(index)) {
status = le32toh(sc->rx.desc_ring[index].status);
if ((status & RX_DESC_CTL) != 0)
break;
len = (status & RX_FRM_LEN) >> RX_FRM_LEN_SHIFT;
if (len == 0) {
if ((status & (RX_NO_ENOUGH_BUF_ERR | RX_OVERFLOW_ERR)) != 0)
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
awg_reuse_rxdesc(sc, index);
continue;
}
m = sc->rx.buf_map[index].mbuf;
error = awg_newbuf_rx(sc, index);
if (error != 0) {
if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
awg_reuse_rxdesc(sc, index);
continue;
}
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.len = len;
m->m_len = len;
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if ((if_getcapenable(ifp) & IFCAP_RXCSUM) != 0 &&
(status & RX_FRM_TYPE) != 0) {
m->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
if ((status & RX_HEADER_ERR) == 0)
m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
if ((status & RX_PAYLOAD_ERR) == 0) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
}
m->m_nextpkt = NULL;
if (mh == NULL)
mh = m;
else
mt->m_nextpkt = m;
mt = m;
++cnt;
++npkt;
if (cnt == awg_rx_batch) {
AWG_UNLOCK(sc);
if_input(ifp, mh);
AWG_LOCK(sc);
mh = mt = NULL;
cnt = 0;
}
}
if (index != sc->rx.cur) {
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
}
if (mh != NULL) {
AWG_UNLOCK(sc);
if_input(ifp, mh);
AWG_LOCK(sc);
}
sc->rx.cur = index;
return (npkt);
}
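/*
* Reclaim completed TX descriptors: walk forward from tx.next until a
* descriptor still owned by the DMA engine (TX_DESC_CTL set) is found,
* unloading and freeing the associated mbufs along the way.
*/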
static void
awg_txeof(struct awg_softc *sc)
{
struct emac_desc *desc;
uint32_t status, size;
if_t ifp;
int i, prog;
AWG_ASSERT_LOCKED(sc);
bus_dmamap_sync(sc->tx.desc_tag, sc->tx.desc_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
ifp = sc->ifp;
prog = 0;
for (i = sc->tx.next; sc->tx.queued > 0; i = TX_NEXT(i)) {
desc = &sc->tx.desc_ring[i];
status = le32toh(desc->status);
if ((status & TX_DESC_CTL) != 0)
break;
size = le32toh(desc->size);
if (size & TX_LAST_DESC) {
if ((status & (TX_HEADER_ERR | TX_PAYLOAD_ERR)) != 0)
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
else
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
}
prog++;
awg_clean_txbuf(sc, i);
}
if (prog > 0) {
sc->tx.next = i;
if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
}
}
static void
awg_intr(void *arg)
{
struct awg_softc *sc;
uint32_t val;
sc = arg;
AWG_LOCK(sc);
val = RD4(sc, EMAC_INT_STA);
WR4(sc, EMAC_INT_STA, val);
if (val & RX_INT)
awg_rxintr(sc);
if (val & TX_INT)
awg_txeof(sc);
if (val & (TX_INT | TX_BUF_UA_INT)) {
if (!if_sendq_empty(sc->ifp))
awg_start_locked(sc);
}
AWG_UNLOCK(sc);
}
#ifdef DEVICE_POLLING
static int
awg_poll(if_t ifp, enum poll_cmd cmd, int count)
{
struct awg_softc *sc;
uint32_t val;
int rx_npkts;
sc = if_getsoftc(ifp);
rx_npkts = 0;
AWG_LOCK(sc);
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
AWG_UNLOCK(sc);
return (0);
}
rx_npkts = awg_rxintr(sc);
awg_txeof(sc);
if (!if_sendq_empty(ifp))
awg_start_locked(sc);
if (cmd == POLL_AND_CHECK_STATUS) {
val = RD4(sc, EMAC_INT_STA);
if (val != 0)
WR4(sc, EMAC_INT_STA, val);
}
AWG_UNLOCK(sc);
return (rx_npkts);
}
#endif
static int
awg_ioctl(if_t ifp, u_long cmd, caddr_t data)
{
struct awg_softc *sc;
struct mii_data *mii;
struct ifreq *ifr;
int flags, mask, error;
sc = if_getsoftc(ifp);
mii = device_get_softc(sc->miibus);
ifr = (struct ifreq *)data;
error = 0;
switch (cmd) {
case SIOCSIFFLAGS:
AWG_LOCK(sc);
if (if_getflags(ifp) & IFF_UP) {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
flags = if_getflags(ifp) ^ sc->if_flags;
if ((flags & (IFF_PROMISC|IFF_ALLMULTI)) != 0)
awg_setup_rxfilter(sc);
} else
awg_init_locked(sc);
} else {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
awg_stop(sc);
}
sc->if_flags = if_getflags(ifp);
AWG_UNLOCK(sc);
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
AWG_LOCK(sc);
awg_setup_rxfilter(sc);
AWG_UNLOCK(sc);
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, cmd);
break;
case SIOCSIFCAP:
mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
#ifdef DEVICE_POLLING
if (mask & IFCAP_POLLING) {
if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0) {
error = ether_poll_register(awg_poll, ifp);
if (error != 0)
break;
AWG_LOCK(sc);
awg_disable_intr(sc);
if_setcapenablebit(ifp, IFCAP_POLLING, 0);
AWG_UNLOCK(sc);
} else {
error = ether_poll_deregister(ifp);
AWG_LOCK(sc);
awg_enable_intr(sc);
if_setcapenablebit(ifp, 0, IFCAP_POLLING);
AWG_UNLOCK(sc);
}
}
#endif
if (mask & IFCAP_VLAN_MTU)
if_togglecapenable(ifp, IFCAP_VLAN_MTU);
if (mask & IFCAP_RXCSUM)
if_togglecapenable(ifp, IFCAP_RXCSUM);
if (mask & IFCAP_TXCSUM)
if_togglecapenable(ifp, IFCAP_TXCSUM);
if ((if_getcapenable(ifp) & IFCAP_TXCSUM) != 0)
if_sethwassistbits(ifp, CSUM_IP | CSUM_UDP | CSUM_TCP, 0);
else
if_sethwassistbits(ifp, 0, CSUM_IP | CSUM_UDP | CSUM_TCP);
break;
default:
error = ether_ioctl(ifp, cmd, data);
break;
}
return (error);
}
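/*
* Configure the PHY interface mode (MII, RMII or RGMII) from the
* "phy-mode" DT property, either through the syscon EMAC clock
* register when one is provided or by reparenting the EMAC TX clock.
*/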
static int
awg_setup_phy(device_t dev)
{
struct awg_softc *sc;
clk_t clk_tx, clk_tx_parent;
const char *tx_parent_name;
char *phy_type;
phandle_t node;
uint32_t reg, tx_delay, rx_delay;
int error;
sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
if (OF_getprop_alloc(node, "phy-mode", 1, (void **)&phy_type) == 0)
return (0);
if (bootverbose)
device_printf(dev, "PHY type: %s, conf mode: %s\n", phy_type,
sc->res[_RES_SYSCON] != NULL ? "reg" : "clk");
if (sc->res[_RES_SYSCON] != NULL) {
reg = bus_read_4(sc->res[_RES_SYSCON], 0);
reg &= ~(EMAC_CLK_PIT | EMAC_CLK_SRC | EMAC_CLK_RMII_EN);
if (strcmp(phy_type, "rgmii") == 0)
reg |= EMAC_CLK_PIT_RGMII | EMAC_CLK_SRC_RGMII;
else if (strcmp(phy_type, "rmii") == 0)
reg |= EMAC_CLK_RMII_EN;
else
reg |= EMAC_CLK_PIT_MII | EMAC_CLK_SRC_MII;
if (OF_getencprop(node, "tx-delay", &tx_delay,
sizeof(tx_delay)) > 0) {
reg &= ~EMAC_CLK_ETXDC;
reg |= (tx_delay << EMAC_CLK_ETXDC_SHIFT);
}
if (OF_getencprop(node, "rx-delay", &rx_delay,
sizeof(rx_delay)) > 0) {
reg &= ~EMAC_CLK_ERXDC;
reg |= (rx_delay << EMAC_CLK_ERXDC_SHIFT);
}
if (sc->type == EMAC_H3) {
if (OF_hasprop(node, "allwinner,use-internal-phy")) {
reg |= EMAC_CLK_EPHY_SELECT;
reg &= ~EMAC_CLK_EPHY_SHUTDOWN;
if (OF_hasprop(node,
"allwinner,leds-active-low"))
reg |= EMAC_CLK_EPHY_LED_POL;
else
reg &= ~EMAC_CLK_EPHY_LED_POL;
/* Set internal PHY addr to 1 */
reg &= ~EMAC_CLK_EPHY_ADDR;
reg |= (1 << EMAC_CLK_EPHY_ADDR_SHIFT);
} else {
reg &= ~EMAC_CLK_EPHY_SELECT;
}
}
if (bootverbose)
device_printf(dev, "EMAC clock: 0x%08x\n", reg);
bus_write_4(sc->res[_RES_SYSCON], 0, reg);
} else {
if (strcmp(phy_type, "rgmii") == 0)
tx_parent_name = "emac_int_tx";
else
tx_parent_name = "mii_phy_tx";
/* Get the TX clock */
error = clk_get_by_ofw_name(dev, 0, "tx", &clk_tx);
if (error != 0) {
device_printf(dev, "cannot get tx clock\n");
goto fail;
}
/* Find the desired parent clock based on phy-mode property */
error = clk_get_by_name(dev, tx_parent_name, &clk_tx_parent);
if (error != 0) {
device_printf(dev, "cannot get clock '%s'\n",
tx_parent_name);
goto fail;
}
/* Set TX clock parent */
error = clk_set_parent_by_clk(clk_tx, clk_tx_parent);
if (error != 0) {
device_printf(dev, "cannot set tx clock parent\n");
goto fail;
}
/* Enable TX clock */
error = clk_enable(clk_tx);
if (error != 0) {
device_printf(dev, "cannot enable tx clock\n");
goto fail;
}
}
error = 0;
fail:
OF_prop_free(phy_type);
return (error);
}
static int
awg_setup_extres(device_t dev)
{
struct awg_softc *sc;
hwreset_t rst_ahb, rst_ephy;
clk_t clk_ahb, clk_ephy;
regulator_t reg;
- phandle_t node;
uint64_t freq;
int error, div;
sc = device_get_softc(dev);
- node = ofw_bus_get_node(dev);
rst_ahb = rst_ephy = NULL;
clk_ahb = clk_ephy = NULL;
reg = NULL;
/* Get AHB clock and reset resources */
error = hwreset_get_by_ofw_name(dev, 0, "ahb", &rst_ahb);
if (error != 0) {
device_printf(dev, "cannot get ahb reset\n");
goto fail;
}
if (hwreset_get_by_ofw_name(dev, 0, "ephy", &rst_ephy) != 0)
rst_ephy = NULL;
error = clk_get_by_ofw_name(dev, 0, "ahb", &clk_ahb);
if (error != 0) {
device_printf(dev, "cannot get ahb clock\n");
goto fail;
}
if (clk_get_by_ofw_name(dev, 0, "ephy", &clk_ephy) != 0)
clk_ephy = NULL;
/* Configure PHY for MII or RGMII mode */
if (awg_setup_phy(dev) != 0)
goto fail;
/* Enable clocks */
error = clk_enable(clk_ahb);
if (error != 0) {
device_printf(dev, "cannot enable ahb clock\n");
goto fail;
}
if (clk_ephy != NULL) {
error = clk_enable(clk_ephy);
if (error != 0) {
device_printf(dev, "cannot enable ephy clock\n");
goto fail;
}
}
/* De-assert reset */
error = hwreset_deassert(rst_ahb);
if (error != 0) {
device_printf(dev, "cannot de-assert ahb reset\n");
goto fail;
}
if (rst_ephy != NULL) {
error = hwreset_deassert(rst_ephy);
if (error != 0) {
device_printf(dev, "cannot de-assert ephy reset\n");
goto fail;
}
}
/* Enable PHY regulator if applicable */
if (regulator_get_by_ofw_property(dev, 0, "phy-supply", &reg) == 0) {
error = regulator_enable(reg);
if (error != 0) {
device_printf(dev, "cannot enable PHY regulator\n");
goto fail;
}
}
/* Determine MDC clock divide ratio based on AHB clock */
error = clk_get_freq(clk_ahb, &freq);
if (error != 0) {
device_printf(dev, "cannot get AHB clock frequency\n");
goto fail;
}
div = freq / MDIO_FREQ;
if (div <= 16)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_16;
else if (div <= 32)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_32;
else if (div <= 64)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_64;
else if (div <= 128)
sc->mdc_div_ratio_m = MDC_DIV_RATIO_M_128;
else {
device_printf(dev, "cannot determine MDC clock divide ratio\n");
error = ENXIO;
goto fail;
}
if (bootverbose)
device_printf(dev, "AHB frequency %ju Hz, MDC div: 0x%x\n",
(uintmax_t)freq, sc->mdc_div_ratio_m);
return (0);
fail:
if (reg != NULL)
regulator_release(reg);
if (clk_ephy != NULL)
clk_release(clk_ephy);
if (clk_ahb != NULL)
clk_release(clk_ahb);
if (rst_ephy != NULL)
hwreset_release(rst_ephy);
if (rst_ahb != NULL)
hwreset_release(rst_ahb);
return (error);
}
static void
awg_get_eaddr(device_t dev, uint8_t *eaddr)
{
struct awg_softc *sc;
uint32_t maclo, machi, rnd;
u_char rootkey[16];
sc = device_get_softc(dev);
machi = RD4(sc, EMAC_ADDR_HIGH(0)) & 0xffff;
maclo = RD4(sc, EMAC_ADDR_LOW(0));
if (maclo == 0xffffffff && machi == 0xffff) {
/* MAC address in hardware is invalid, create one */
if (aw_sid_get_rootkey(rootkey) == 0 &&
(rootkey[3] | rootkey[12] | rootkey[13] | rootkey[14] |
rootkey[15]) != 0) {
/* MAC address is derived from the root key in SID */
maclo = (rootkey[13] << 24) | (rootkey[12] << 16) |
(rootkey[3] << 8) | 0x02;
machi = (rootkey[15] << 8) | rootkey[14];
} else {
/* Create one */
rnd = arc4random();
maclo = 0x00f2 | (rnd & 0xffff0000);
machi = rnd & 0xffff;
}
}
eaddr[0] = maclo & 0xff;
eaddr[1] = (maclo >> 8) & 0xff;
eaddr[2] = (maclo >> 16) & 0xff;
eaddr[3] = (maclo >> 24) & 0xff;
eaddr[4] = machi & 0xff;
eaddr[5] = (machi >> 8) & 0xff;
}
#ifdef AWG_DEBUG
static void
awg_dump_regs(device_t dev)
{
static const struct {
const char *name;
u_int reg;
} regs[] = {
{ "BASIC_CTL_0", EMAC_BASIC_CTL_0 },
{ "BASIC_CTL_1", EMAC_BASIC_CTL_1 },
{ "INT_STA", EMAC_INT_STA },
{ "INT_EN", EMAC_INT_EN },
{ "TX_CTL_0", EMAC_TX_CTL_0 },
{ "TX_CTL_1", EMAC_TX_CTL_1 },
{ "TX_FLOW_CTL", EMAC_TX_FLOW_CTL },
{ "TX_DMA_LIST", EMAC_TX_DMA_LIST },
{ "RX_CTL_0", EMAC_RX_CTL_0 },
{ "RX_CTL_1", EMAC_RX_CTL_1 },
{ "RX_DMA_LIST", EMAC_RX_DMA_LIST },
{ "RX_FRM_FLT", EMAC_RX_FRM_FLT },
{ "RX_HASH_0", EMAC_RX_HASH_0 },
{ "RX_HASH_1", EMAC_RX_HASH_1 },
{ "MII_CMD", EMAC_MII_CMD },
{ "ADDR_HIGH0", EMAC_ADDR_HIGH(0) },
{ "ADDR_LOW0", EMAC_ADDR_LOW(0) },
{ "TX_DMA_STA", EMAC_TX_DMA_STA },
{ "TX_DMA_CUR_DESC", EMAC_TX_DMA_CUR_DESC },
{ "TX_DMA_CUR_BUF", EMAC_TX_DMA_CUR_BUF },
{ "RX_DMA_STA", EMAC_RX_DMA_STA },
{ "RX_DMA_CUR_DESC", EMAC_RX_DMA_CUR_DESC },
{ "RX_DMA_CUR_BUF", EMAC_RX_DMA_CUR_BUF },
{ "RGMII_STA", EMAC_RGMII_STA },
};
struct awg_softc *sc;
unsigned int n;
sc = device_get_softc(dev);
for (n = 0; n < nitems(regs); n++)
device_printf(dev, " %-20s %08x\n", regs[n].name,
RD4(sc, regs[n].reg));
}
#endif
#define GPIO_ACTIVE_LOW 1
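/*
* Pulse the PHY reset GPIO named by "allwinner,reset-gpio", honouring
* the three delays (in microseconds) from "allwinner,reset-delays-us".
*/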
static int
awg_phy_reset(device_t dev)
{
pcell_t gpio_prop[4], delay_prop[3];
phandle_t node, gpio_node;
device_t gpio;
uint32_t pin, flags;
uint32_t pin_value;
node = ofw_bus_get_node(dev);
if (OF_getencprop(node, "allwinner,reset-gpio", gpio_prop,
sizeof(gpio_prop)) <= 0)
return (0);
if (OF_getencprop(node, "allwinner,reset-delays-us", delay_prop,
sizeof(delay_prop)) <= 0)
return (ENXIO);
gpio_node = OF_node_from_xref(gpio_prop[0]);
if ((gpio = OF_device_from_xref(gpio_prop[0])) == NULL)
return (ENXIO);
if (GPIO_MAP_GPIOS(gpio, node, gpio_node, nitems(gpio_prop) - 1,
gpio_prop + 1, &pin, &flags) != 0)
return (ENXIO);
pin_value = GPIO_PIN_LOW;
if (OF_hasprop(node, "allwinner,reset-active-low"))
pin_value = GPIO_PIN_HIGH;
if (flags & GPIO_ACTIVE_LOW)
pin_value = !pin_value;
GPIO_PIN_SETFLAGS(gpio, pin, GPIO_PIN_OUTPUT);
GPIO_PIN_SET(gpio, pin, pin_value);
DELAY(delay_prop[0]);
GPIO_PIN_SET(gpio, pin, !pin_value);
DELAY(delay_prop[1]);
GPIO_PIN_SET(gpio, pin, pin_value);
DELAY(delay_prop[2]);
return (0);
}
static int
awg_reset(device_t dev)
{
struct awg_softc *sc;
int retry;
sc = device_get_softc(dev);
/* Reset PHY if necessary */
if (awg_phy_reset(dev) != 0) {
device_printf(dev, "failed to reset PHY\n");
return (ENXIO);
}
/* Soft reset all registers and logic */
WR4(sc, EMAC_BASIC_CTL_1, BASIC_CTL_SOFT_RST);
/* Wait for soft reset bit to self-clear */
for (retry = SOFT_RST_RETRY; retry > 0; retry--) {
if ((RD4(sc, EMAC_BASIC_CTL_1) & BASIC_CTL_SOFT_RST) == 0)
break;
DELAY(10);
}
if (retry == 0) {
device_printf(dev, "soft reset timed out\n");
#ifdef AWG_DEBUG
awg_dump_regs(dev);
#endif
return (ETIMEDOUT);
}
return (0);
}
static void
awg_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
if (error != 0)
return;
*(bus_addr_t *)arg = segs[0].ds_addr;
}
static int
awg_setup_dma(device_t dev)
{
struct awg_softc *sc;
int error, i;
sc = device_get_softc(dev);
/* Setup TX ring */
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
DESC_ALIGN, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
TX_DESC_SIZE, 1, /* maxsize, nsegs */
TX_DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->tx.desc_tag);
if (error != 0) {
device_printf(dev, "cannot create TX descriptor ring tag\n");
return (error);
}
error = bus_dmamem_alloc(sc->tx.desc_tag, (void **)&sc->tx.desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->tx.desc_map);
if (error != 0) {
device_printf(dev, "cannot allocate TX descriptor ring\n");
return (error);
}
error = bus_dmamap_load(sc->tx.desc_tag, sc->tx.desc_map,
sc->tx.desc_ring, TX_DESC_SIZE, awg_dmamap_cb,
&sc->tx.desc_ring_paddr, 0);
if (error != 0) {
device_printf(dev, "cannot load TX descriptor ring\n");
return (error);
}
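/* Link the TX descriptors into a ring through their next pointers. */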
for (i = 0; i < TX_DESC_COUNT; i++)
sc->tx.desc_ring[i].next =
htole32(sc->tx.desc_ring_paddr + DESC_OFF(TX_NEXT(i)));
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
1, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MCLBYTES, TX_MAX_SEGS, /* maxsize, nsegs */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->tx.buf_tag);
if (error != 0) {
device_printf(dev, "cannot create TX buffer tag\n");
return (error);
}
sc->tx.queued = 0;
for (i = 0; i < TX_DESC_COUNT; i++) {
error = bus_dmamap_create(sc->tx.buf_tag, 0,
&sc->tx.buf_map[i].map);
if (error != 0) {
device_printf(dev, "cannot create TX buffer map\n");
return (error);
}
}
/* Setup RX ring */
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
DESC_ALIGN, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
RX_DESC_SIZE, 1, /* maxsize, nsegs */
RX_DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->rx.desc_tag);
if (error != 0) {
device_printf(dev, "cannot create RX descriptor ring tag\n");
return (error);
}
error = bus_dmamem_alloc(sc->rx.desc_tag, (void **)&sc->rx.desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->rx.desc_map);
if (error != 0) {
device_printf(dev, "cannot allocate RX descriptor ring\n");
return (error);
}
error = bus_dmamap_load(sc->rx.desc_tag, sc->rx.desc_map,
sc->rx.desc_ring, RX_DESC_SIZE, awg_dmamap_cb,
&sc->rx.desc_ring_paddr, 0);
if (error != 0) {
device_printf(dev, "cannot load RX descriptor ring\n");
return (error);
}
error = bus_dma_tag_create(
bus_get_dma_tag(dev), /* Parent tag */
1, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
MCLBYTES, 1, /* maxsize, nsegs */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->rx.buf_tag);
if (error != 0) {
device_printf(dev, "cannot create RX buffer tag\n");
return (error);
}
error = bus_dmamap_create(sc->rx.buf_tag, 0, &sc->rx.buf_spare_map);
if (error != 0) {
device_printf(dev,
"cannot create RX buffer spare map\n");
return (error);
}
for (i = 0; i < RX_DESC_COUNT; i++) {
sc->rx.desc_ring[i].next =
htole32(sc->rx.desc_ring_paddr + DESC_OFF(RX_NEXT(i)));
error = bus_dmamap_create(sc->rx.buf_tag, 0,
&sc->rx.buf_map[i].map);
if (error != 0) {
device_printf(dev, "cannot create RX buffer map\n");
return (error);
}
sc->rx.buf_map[i].mbuf = NULL;
error = awg_newbuf_rx(sc, i);
if (error != 0) {
device_printf(dev, "cannot create RX buffer\n");
return (error);
}
}
bus_dmamap_sync(sc->rx.desc_tag, sc->rx.desc_map,
BUS_DMASYNC_PREWRITE);
/* Write transmit and receive descriptor base address registers */
WR4(sc, EMAC_TX_DMA_LIST, sc->tx.desc_ring_paddr);
WR4(sc, EMAC_RX_DMA_LIST, sc->rx.desc_ring_paddr);
return (0);
}
static int
awg_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "Allwinner Gigabit Ethernet");
return (BUS_PROBE_DEFAULT);
}
static int
awg_attach(device_t dev)
{
uint8_t eaddr[ETHER_ADDR_LEN];
struct awg_softc *sc;
- phandle_t node;
int error;
sc = device_get_softc(dev);
sc->dev = dev;
sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
- node = ofw_bus_get_node(dev);
if (bus_alloc_resources(dev, awg_spec, sc->res) != 0) {
device_printf(dev, "cannot allocate resources for device\n");
return (ENXIO);
}
mtx_init(&sc->mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF);
callout_init_mtx(&sc->stat_ch, &sc->mtx, 0);
TASK_INIT(&sc->link_task, 0, awg_link_task, sc);
/* Setup clocks and regulators */
error = awg_setup_extres(dev);
if (error != 0)
return (error);
/* Read MAC address before resetting the chip */
awg_get_eaddr(dev, eaddr);
/* Soft reset EMAC core */
error = awg_reset(dev);
if (error != 0)
return (error);
/* Setup DMA descriptors */
error = awg_setup_dma(dev);
if (error != 0)
return (error);
/* Install interrupt handler */
error = bus_setup_intr(dev, sc->res[_RES_IRQ],
INTR_TYPE_NET | INTR_MPSAFE, NULL, awg_intr, sc, &sc->ih);
if (error != 0) {
device_printf(dev, "cannot setup interrupt handler\n");
return (error);
}
/* Setup ethernet interface */
sc->ifp = if_alloc(IFT_ETHER);
if_setsoftc(sc->ifp, sc);
if_initname(sc->ifp, device_get_name(dev), device_get_unit(dev));
if_setflags(sc->ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_setstartfn(sc->ifp, awg_start);
if_setioctlfn(sc->ifp, awg_ioctl);
if_setinitfn(sc->ifp, awg_init);
if_setsendqlen(sc->ifp, TX_DESC_COUNT - 1);
if_setsendqready(sc->ifp);
if_sethwassist(sc->ifp, CSUM_IP | CSUM_UDP | CSUM_TCP);
if_setcapabilities(sc->ifp, IFCAP_VLAN_MTU | IFCAP_HWCSUM);
if_setcapenable(sc->ifp, if_getcapabilities(sc->ifp));
#ifdef DEVICE_POLLING
if_setcapabilitiesbit(sc->ifp, IFCAP_POLLING, 0);
#endif
/* Attach MII driver */
error = mii_attach(dev, &sc->miibus, sc->ifp, awg_media_change,
awg_media_status, BMSR_DEFCAPMASK, MII_PHY_ANY, MII_OFFSET_ANY,
MIIF_DOPAUSE);
if (error != 0) {
device_printf(dev, "cannot attach PHY\n");
return (error);
}
/* Attach ethernet interface */
ether_ifattach(sc->ifp, eaddr);
return (0);
}
static device_method_t awg_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, awg_probe),
DEVMETHOD(device_attach, awg_attach),
/* MII interface */
DEVMETHOD(miibus_readreg, awg_miibus_readreg),
DEVMETHOD(miibus_writereg, awg_miibus_writereg),
DEVMETHOD(miibus_statchg, awg_miibus_statchg),
DEVMETHOD_END
};
static driver_t awg_driver = {
"awg",
awg_methods,
sizeof(struct awg_softc),
};
static devclass_t awg_devclass;
DRIVER_MODULE(awg, simplebus, awg_driver, awg_devclass, 0, 0);
DRIVER_MODULE(miibus, awg, miibus_driver, miibus_devclass, 0, 0);
MODULE_DEPEND(awg, ether, 1, 1, 1);
MODULE_DEPEND(awg, miibus, 1, 1, 1);
Index: head/sys/arm/arm/gic.c
===================================================================
--- head/sys/arm/arm/gic.c (revision 327172)
+++ head/sys/arm/arm/gic.c (revision 327173)
@@ -1,1599 +1,1597 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2011 The FreeBSD Foundation
* All rights reserved.
*
* Developed by Damjan Marion <damjan.marion@gmail.com>
*
* Based on OMAP4 GIC code by Ben Gray
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company nor the name of the author may be used to
* endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/rman.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#ifdef INTRNG
#include <sys/sched.h>
#endif
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <machine/smp.h>
#ifdef FDT
#include <dev/fdt/fdt_intr.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include <arm/arm/gic.h>
#include <arm/arm/gic_common.h>
#ifdef INTRNG
#include "pic_if.h"
#include "msi_if.h"
#endif
/* We are using GICv2 register naming */
/* Distributor Registers */
/* CPU Registers */
#define GICC_CTLR 0x0000 /* v1 ICCICR */
#define GICC_PMR 0x0004 /* v1 ICCPMR */
#define GICC_BPR 0x0008 /* v1 ICCBPR */
#define GICC_IAR 0x000C /* v1 ICCIAR */
#define GICC_EOIR 0x0010 /* v1 ICCEOIR */
#define GICC_RPR 0x0014 /* v1 ICCRPR */
#define GICC_HPPIR 0x0018 /* v1 ICCHPIR */
#define GICC_ABPR 0x001C /* v1 ICCABPR */
#define GICC_IIDR 0x00FC /* v1 ICCIIDR*/
/* TYPER Registers */
#define GICD_TYPER_SECURITYEXT 0x400
#define GIC_SUPPORT_SECEXT(_sc) \
((_sc->typer & GICD_TYPER_SECURITYEXT) == GICD_TYPER_SECURITYEXT)
#ifndef GIC_DEFAULT_ICFGR_INIT
#define GIC_DEFAULT_ICFGR_INIT 0x00000000
#endif
#ifdef INTRNG
struct gic_irqsrc {
struct intr_irqsrc gi_isrc;
uint32_t gi_irq;
enum intr_polarity gi_pol;
enum intr_trigger gi_trig;
#define GI_FLAG_EARLY_EOI (1 << 0)
#define GI_FLAG_MSI (1 << 1) /* This interrupt source should only */
/* be used for MSI/MSI-X interrupts */
#define GI_FLAG_MSI_USED (1 << 2) /* This irq is already allocated */
/* for an MSI/MSI-X interrupt */
u_int gi_flags;
};
static u_int gic_irq_cpu;
static int arm_gic_bind_intr(device_t dev, struct intr_irqsrc *isrc);
#ifdef SMP
static u_int sgi_to_ipi[GIC_LAST_SGI - GIC_FIRST_SGI + 1];
static u_int sgi_first_unused = GIC_FIRST_SGI;
#endif
#define GIC_INTR_ISRC(sc, irq) (&sc->gic_irqs[irq].gi_isrc)
#else /* !INTRNG */
static struct ofw_compat_data compat_data[] = {
{"arm,gic", true}, /* Non-standard, used in FreeBSD dts. */
{"arm,gic-400", true},
{"arm,cortex-a15-gic", true},
{"arm,cortex-a9-gic", true},
{"arm,cortex-a7-gic", true},
{"arm,arm11mp-gic", true},
{"brcm,brahma-b15-gic", true},
{"qcom,msm-qgic2", true},
{NULL, false}
};
#endif
static struct resource_spec arm_gic_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Distributor registers */
{ SYS_RES_MEMORY, 1, RF_ACTIVE }, /* CPU Interrupt Intf. registers */
#ifdef INTRNG
{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* Parent interrupt */
#endif
{ -1, 0 }
};
#if defined(__arm__) && defined(INVARIANTS)
static int gic_debug_spurious = 1;
#else
static int gic_debug_spurious = 0;
#endif
TUNABLE_INT("hw.gic.debug_spurious", &gic_debug_spurious);
static u_int arm_gic_map[MAXCPU];
static struct arm_gic_softc *gic_sc = NULL;
#define gic_c_read_4(_sc, _reg) \
bus_space_read_4((_sc)->gic_c_bst, (_sc)->gic_c_bsh, (_reg))
#define gic_c_write_4(_sc, _reg, _val) \
bus_space_write_4((_sc)->gic_c_bst, (_sc)->gic_c_bsh, (_reg), (_val))
#define gic_d_read_4(_sc, _reg) \
bus_space_read_4((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg))
#define gic_d_write_1(_sc, _reg, _val) \
bus_space_write_1((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg), (_val))
#define gic_d_write_4(_sc, _reg, _val) \
bus_space_write_4((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg), (_val))
#ifndef INTRNG
static int gic_config_irq(int irq, enum intr_trigger trig,
enum intr_polarity pol);
static void gic_post_filter(void *);
#endif
#ifdef INTRNG
static inline void
gic_irq_unmask(struct arm_gic_softc *sc, u_int irq)
{
gic_d_write_4(sc, GICD_ISENABLER(irq), GICD_I_MASK(irq));
}
static inline void
gic_irq_mask(struct arm_gic_softc *sc, u_int irq)
{
gic_d_write_4(sc, GICD_ICENABLER(irq), GICD_I_MASK(irq));
}
#endif
static uint8_t
gic_cpu_mask(struct arm_gic_softc *sc)
{
uint32_t mask;
int i;
/* Read the current cpuid mask by reading ITARGETSR{0..7} */
for (i = 0; i < 8; i++) {
mask = gic_d_read_4(sc, GICD_ITARGETSR(4 * i));
if (mask != 0)
break;
}
/* No mask found, assume we are on CPU interface 0 */
if (mask == 0)
return (1);
/* Collect the mask in the lower byte */
mask |= mask >> 16;
mask |= mask >> 8;
return (mask);
}
#ifdef SMP
#ifdef INTRNG
static void
arm_gic_init_secondary(device_t dev)
{
struct arm_gic_softc *sc = device_get_softc(dev);
u_int irq, cpu;
/* Set the mask so we can find this CPU to send it IPIs */
cpu = PCPU_GET(cpuid);
arm_gic_map[cpu] = gic_cpu_mask(sc);
for (irq = 0; irq < sc->nirqs; irq += 4)
gic_d_write_4(sc, GICD_IPRIORITYR(irq), 0);
/* Set all the interrupts to be in Group 0 (secure) */
for (irq = 0; GIC_SUPPORT_SECEXT(sc) && irq < sc->nirqs; irq += 32) {
gic_d_write_4(sc, GICD_IGROUPR(irq), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
/* Unmask attached SGI interrupts. */
for (irq = GIC_FIRST_SGI; irq <= GIC_LAST_SGI; irq++)
if (intr_isrc_init_on_cpu(GIC_INTR_ISRC(sc, irq), cpu))
gic_irq_unmask(sc, irq);
/* Unmask attached PPI interrupts. */
for (irq = GIC_FIRST_PPI; irq <= GIC_LAST_PPI; irq++)
if (intr_isrc_init_on_cpu(GIC_INTR_ISRC(sc, irq), cpu))
gic_irq_unmask(sc, irq);
}
#else
static void
arm_gic_init_secondary(device_t dev)
{
struct arm_gic_softc *sc = device_get_softc(dev);
int i;
/* Set the mask so we can find this CPU to send it IPIs */
arm_gic_map[PCPU_GET(cpuid)] = gic_cpu_mask(sc);
for (i = 0; i < sc->nirqs; i += 4)
gic_d_write_4(sc, GICD_IPRIORITYR(i), 0);
/* Set all the interrupts to be in Group 0 (secure) */
for (i = 0; GIC_SUPPORT_SECEXT(sc) && i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_IGROUPR(i), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
/*
* Activate the timer interrupts: virtual, secure, and non-secure.
*/
gic_d_write_4(sc, GICD_ISENABLER(27), GICD_I_MASK(27));
gic_d_write_4(sc, GICD_ISENABLER(29), GICD_I_MASK(29));
gic_d_write_4(sc, GICD_ISENABLER(30), GICD_I_MASK(30));
}
#endif /* INTRNG */
#endif /* SMP */
#ifndef INTRNG
int
gic_decode_fdt(phandle_t iparent, pcell_t *intr, int *interrupt,
int *trig, int *pol)
{
static u_int num_intr_cells;
static phandle_t self;
struct ofw_compat_data *ocd;
if (self == 0) {
for (ocd = compat_data; ocd->ocd_str != NULL; ocd++) {
if (ofw_bus_node_is_compatible(iparent, ocd->ocd_str)) {
self = iparent;
break;
}
}
}
if (self != iparent)
return (ENXIO);
if (num_intr_cells == 0) {
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &num_intr_cells,
sizeof(num_intr_cells)) == -1) {
num_intr_cells = 1;
}
}
if (num_intr_cells == 1) {
*interrupt = fdt32_to_cpu(intr[0]);
*trig = INTR_TRIGGER_CONFORM;
*pol = INTR_POLARITY_CONFORM;
} else {
if (fdt32_to_cpu(intr[0]) == 0)
*interrupt = fdt32_to_cpu(intr[1]) + GIC_FIRST_SPI;
else
*interrupt = fdt32_to_cpu(intr[1]) + GIC_FIRST_PPI;
/*
* In intr[2], bits[3:0] are trigger type and level flags.
* 1 = low-to-high edge triggered
* 2 = high-to-low edge triggered
* 4 = active high level-sensitive
* 8 = active low level-sensitive
* The hardware only supports active-high level-sensitive or
* rising-edge triggered configurations for SPIs.
*/
if (*interrupt >= GIC_FIRST_SPI &&
fdt32_to_cpu(intr[2]) & 0x0a) {
printf("unsupported trigger/polarity configuration "
"0x%02x\n", fdt32_to_cpu(intr[2]) & 0x0f);
}
*pol = INTR_POLARITY_CONFORM;
if (fdt32_to_cpu(intr[2]) & 0x03)
*trig = INTR_TRIGGER_EDGE;
else
*trig = INTR_TRIGGER_LEVEL;
}
return (0);
}
#endif
#ifdef INTRNG
static int
arm_gic_register_isrcs(struct arm_gic_softc *sc, uint32_t num)
{
int error;
uint32_t irq;
struct gic_irqsrc *irqs;
struct intr_irqsrc *isrc;
const char *name;
irqs = malloc(num * sizeof(struct gic_irqsrc), M_DEVBUF,
M_WAITOK | M_ZERO);
name = device_get_nameunit(sc->gic_dev);
for (irq = 0; irq < num; irq++) {
irqs[irq].gi_irq = irq;
irqs[irq].gi_pol = INTR_POLARITY_CONFORM;
irqs[irq].gi_trig = INTR_TRIGGER_CONFORM;
isrc = &irqs[irq].gi_isrc;
if (irq <= GIC_LAST_SGI) {
error = intr_isrc_register(isrc, sc->gic_dev,
INTR_ISRCF_IPI, "%s,i%u", name, irq - GIC_FIRST_SGI);
} else if (irq <= GIC_LAST_PPI) {
error = intr_isrc_register(isrc, sc->gic_dev,
INTR_ISRCF_PPI, "%s,p%u", name, irq - GIC_FIRST_PPI);
} else {
error = intr_isrc_register(isrc, sc->gic_dev, 0,
"%s,s%u", name, irq - GIC_FIRST_SPI);
}
if (error != 0) {
/* XXX call intr_isrc_deregister() */
free(irqs, M_DEVBUF);
return (error);
}
}
sc->gic_irqs = irqs;
sc->nirqs = num;
return (0);
}
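/*
* Mark a contiguous block of SPIs as reserved for MSI/MSI-X use; they
* are configured edge-triggered, active-high and flagged GI_FLAG_MSI
* so they are only handed out for MSI/MSI-X interrupts.
*/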
static void
arm_gic_reserve_msi_range(device_t dev, u_int start, u_int count)
{
struct arm_gic_softc *sc;
int i;
sc = device_get_softc(dev);
KASSERT((start + count) < sc->nirqs,
("%s: Trying to allocate too many MSI IRQs: %d + %d > %d", __func__,
start, count, sc->nirqs));
for (i = 0; i < count; i++) {
KASSERT(sc->gic_irqs[start + i].gi_isrc.isrc_handlers == 0,
("%s: MSI interrupt %d already has a handler", __func__,
start + i));
KASSERT(sc->gic_irqs[start + i].gi_pol == INTR_POLARITY_CONFORM,
("%s: MSI interrupt %d already has a polarity", __func__,
start + i));
KASSERT(sc->gic_irqs[start + i].gi_trig == INTR_TRIGGER_CONFORM,
("%s: MSI interrupt %d already has a trigger", __func__,
start + i));
sc->gic_irqs[start + i].gi_pol = INTR_POLARITY_HIGH;
sc->gic_irqs[start + i].gi_trig = INTR_TRIGGER_EDGE;
sc->gic_irqs[start + i].gi_flags |= GI_FLAG_MSI;
}
}
#endif
int
arm_gic_attach(device_t dev)
{
struct arm_gic_softc *sc;
int i;
uint32_t icciidr, mask, nirqs;
if (gic_sc)
return (ENXIO);
sc = device_get_softc(dev);
if (bus_alloc_resources(dev, arm_gic_spec, sc->gic_res)) {
device_printf(dev, "could not allocate resources\n");
return (ENXIO);
}
sc->gic_dev = dev;
gic_sc = sc;
/* Initialize mutex */
mtx_init(&sc->mutex, "GIC lock", "", MTX_SPIN);
/* Distributor Interface */
sc->gic_d_bst = rman_get_bustag(sc->gic_res[0]);
sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[0]);
/* CPU Interface */
sc->gic_c_bst = rman_get_bustag(sc->gic_res[1]);
sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[1]);
/* Disable interrupt forwarding to the CPU interface */
gic_d_write_4(sc, GICD_CTLR, 0x00);
/* Get the number of interrupts */
sc->typer = gic_d_read_4(sc, GICD_TYPER);
nirqs = GICD_TYPER_I_NUM(sc->typer);
#ifdef INTRNG
if (arm_gic_register_isrcs(sc, nirqs)) {
device_printf(dev, "could not register irqs\n");
goto cleanup;
}
#else
sc->nirqs = nirqs;
/* Set up function pointers */
arm_post_filter = gic_post_filter;
arm_config_irq = gic_config_irq;
#endif
icciidr = gic_c_read_4(sc, GICC_IIDR);
device_printf(dev,
"pn 0x%x, arch 0x%x, rev 0x%x, implementer 0x%x irqs %u\n",
GICD_IIDR_PROD(icciidr), GICD_IIDR_VAR(icciidr),
GICD_IIDR_REV(icciidr), GICD_IIDR_IMPL(icciidr), sc->nirqs);
#ifdef INTRNG
sc->gic_iidr = icciidr;
#endif
/* Set all global interrupts to be level triggered, active low. */
for (i = 32; i < sc->nirqs; i += 16) {
gic_d_write_4(sc, GICD_ICFGR(i), GIC_DEFAULT_ICFGR_INIT);
}
/* Disable all interrupts. */
for (i = 32; i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_ICENABLER(i), 0xFFFFFFFF);
}
/* Find the current cpu mask */
mask = gic_cpu_mask(sc);
/* Set the mask so we can find this CPU to send it IPIs */
arm_gic_map[PCPU_GET(cpuid)] = mask;
/* Set all four targets to this cpu */
mask |= mask << 8;
mask |= mask << 16;
for (i = 0; i < sc->nirqs; i += 4) {
gic_d_write_4(sc, GICD_IPRIORITYR(i), 0);
if (i > 32) {
gic_d_write_4(sc, GICD_ITARGETSR(i), mask);
}
}
/* Set all the interrupts to be in Group 0 (secure) */
for (i = 0; GIC_SUPPORT_SECEXT(sc) && i < sc->nirqs; i += 32) {
gic_d_write_4(sc, GICD_IGROUPR(i), 0);
}
/* Enable CPU interface */
gic_c_write_4(sc, GICC_CTLR, 1);
/* Set priority mask register. */
gic_c_write_4(sc, GICC_PMR, 0xff);
/* Enable interrupt distribution */
gic_d_write_4(sc, GICD_CTLR, 0x01);
return (0);
#ifdef INTRNG
cleanup:
arm_gic_detach(dev);
return (ENXIO);
#endif
}
int
arm_gic_detach(device_t dev)
{
#ifdef INTRNG
struct arm_gic_softc *sc;
sc = device_get_softc(dev);
if (sc->gic_irqs != NULL)
free(sc->gic_irqs, M_DEVBUF);
bus_release_resources(dev, arm_gic_spec, sc->gic_res);
#endif
return (0);
}
#ifdef INTRNG
static int
arm_gic_print_child(device_t bus, device_t child)
{
struct resource_list *rl;
int rv;
rv = bus_print_child_header(bus, child);
rl = BUS_GET_RESOURCE_LIST(bus, child);
if (rl != NULL) {
rv += resource_list_print_type(rl, "mem", SYS_RES_MEMORY,
"%#jx");
rv += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
}
rv += bus_print_child_footer(bus, child);
return (rv);
}
static struct resource *
arm_gic_alloc_resource(device_t bus, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct arm_gic_softc *sc;
struct resource_list_entry *rle;
struct resource_list *rl;
int j;
KASSERT(type == SYS_RES_MEMORY, ("Invalid resource type %x", type));
sc = device_get_softc(bus);
/*
* Request for the default allocation with a given rid: use resource
* list stored in the local device info.
*/
if (RMAN_IS_DEFAULT_RANGE(start, end)) {
rl = BUS_GET_RESOURCE_LIST(bus, child);
if (type == SYS_RES_IOPORT)
type = SYS_RES_MEMORY;
rle = resource_list_find(rl, type, *rid);
if (rle == NULL) {
if (bootverbose)
device_printf(bus, "no default resources for "
"rid = %d, type = %d\n", *rid, type);
return (NULL);
}
start = rle->start;
end = rle->end;
count = rle->count;
}
/* Remap through ranges property */
for (j = 0; j < sc->nranges; j++) {
if (start >= sc->ranges[j].bus && end <
sc->ranges[j].bus + sc->ranges[j].size) {
start -= sc->ranges[j].bus;
start += sc->ranges[j].host;
end -= sc->ranges[j].bus;
end += sc->ranges[j].host;
break;
}
}
if (j == sc->nranges && sc->nranges != 0) {
if (bootverbose)
device_printf(bus, "Could not map resource "
"%#jx-%#jx\n", (uintmax_t)start, (uintmax_t)end);
return (NULL);
}
return (bus_generic_alloc_resource(bus, child, type, rid, start, end,
count, flags));
}
static int
arm_gic_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct arm_gic_softc *sc;
sc = device_get_softc(dev);
switch(which) {
case GIC_IVAR_HW_REV:
KASSERT(GICD_IIDR_VAR(sc->gic_iidr) < 3 &&
GICD_IIDR_VAR(sc->gic_iidr) != 0,
("arm_gic_read_ivar: Unknown IIDR revision %u (%.08x)",
GICD_IIDR_VAR(sc->gic_iidr), sc->gic_iidr));
*result = GICD_IIDR_VAR(sc->gic_iidr);
return (0);
case GIC_IVAR_BUS:
KASSERT(sc->gic_bus != GIC_BUS_UNKNOWN,
("arm_gic_read_ivar: Unknown bus type"));
KASSERT(sc->gic_bus <= GIC_BUS_MAX,
("arm_gic_read_ivar: Invalid bus type %u", sc->gic_bus));
*result = sc->gic_bus;
return (0);
}
return (ENOENT);
}
int
arm_gic_intr(void *arg)
{
struct arm_gic_softc *sc = arg;
struct gic_irqsrc *gi;
uint32_t irq_active_reg, irq;
struct trapframe *tf;
irq_active_reg = gic_c_read_4(sc, GICC_IAR);
irq = irq_active_reg & 0x3FF;
/*
* 1. We do the EOI here because the most recently read value from the
* active interrupt register must be used for it. Another approach is
* to save this value in the associated interrupt source.
* 2. EOI must be done on same CPU where interrupt has fired. Thus
* we must ensure that interrupted thread does not migrate to
* another CPU.
* 3. EOI cannot be delayed by any preemption which could happen on
* critical_exit() used in MI intr code, when interrupt thread is
* scheduled. See next point.
* 4. IPI_RENDEZVOUS assumes that no preemption is permitted during
* an action and any use of critical_exit() could break this
* assumption. See comments within smp_rendezvous_action().
* 5. We always return FILTER_HANDLED as this is an interrupt
* controller dispatch function. Otherwise, in cascaded interrupt
* case, the whole interrupt subtree would be masked.
*/
if (irq >= sc->nirqs) {
if (gic_debug_spurious)
device_printf(sc->gic_dev,
"Spurious interrupt detected: last irq: %d on CPU%d\n",
sc->last_irq[PCPU_GET(cpuid)], PCPU_GET(cpuid));
return (FILTER_HANDLED);
}
tf = curthread->td_intr_frame;
dispatch_irq:
gi = sc->gic_irqs + irq;
/*
* Note that GIC_FIRST_SGI is zero and is not used in the 'if' statement
* because the compiler complains that comparing u_int >= 0 is always true.
*/
if (irq <= GIC_LAST_SGI) {
#ifdef SMP
/* Call EOI for all IPI before dispatch. */
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
intr_ipi_dispatch(sgi_to_ipi[gi->gi_irq], tf);
goto next_irq;
#else
device_printf(sc->gic_dev, "SGI %u on UP system detected\n",
irq - GIC_FIRST_SGI);
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
goto next_irq;
#endif
}
if (gic_debug_spurious)
sc->last_irq[PCPU_GET(cpuid)] = irq;
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) == GI_FLAG_EARLY_EOI)
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
if (intr_isrc_dispatch(&gi->gi_isrc, tf) != 0) {
gic_irq_mask(sc, irq);
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) != GI_FLAG_EARLY_EOI)
gic_c_write_4(sc, GICC_EOIR, irq_active_reg);
device_printf(sc->gic_dev, "Stray irq %u disabled\n", irq);
}
next_irq:
arm_irq_memory_barrier(irq);
irq_active_reg = gic_c_read_4(sc, GICC_IAR);
irq = irq_active_reg & 0x3FF;
if (irq < sc->nirqs)
goto dispatch_irq;
return (FILTER_HANDLED);
}
static void
gic_config(struct arm_gic_softc *sc, u_int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
uint32_t reg;
uint32_t mask;
if (irq < GIC_FIRST_SPI)
return;
mtx_lock_spin(&sc->mutex);
reg = gic_d_read_4(sc, GICD_ICFGR(irq));
mask = (reg >> 2*(irq % 16)) & 0x3;
if (pol == INTR_POLARITY_LOW) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_LOW;
} else if (pol == INTR_POLARITY_HIGH) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_HIGH;
}
if (trig == INTR_TRIGGER_LEVEL) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_LVL;
} else if (trig == INTR_TRIGGER_EDGE) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_EDGE;
}
/* Set mask */
reg = reg & ~(0x3 << 2*(irq % 16));
reg = reg | (mask << 2*(irq % 16));
gic_d_write_4(sc, GICD_ICFGR(irq), reg);
mtx_unlock_spin(&sc->mutex);
}
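/*
* A worked example of the ICFGR layout assumed by gic_config() above: each
* interrupt has a 2-bit configuration field and sixteen interrupts share one
* 32-bit register, hence the shift by 2*(irq % 16). For a hypothetical
* irq 37, the field occupies bits [11:10] of the word selected by
* GICD_ICFGR(37).
*/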
static int
gic_bind(struct arm_gic_softc *sc, u_int irq, cpuset_t *cpus)
{
uint32_t cpu, end, mask;
end = min(mp_ncpus, 8);
for (cpu = end; cpu < MAXCPU; cpu++)
if (CPU_ISSET(cpu, cpus))
return (EINVAL);
for (mask = 0, cpu = 0; cpu < end; cpu++)
if (CPU_ISSET(cpu, cpus))
mask |= arm_gic_map[cpu];
gic_d_write_1(sc, GICD_ITARGETSR(0) + irq, mask);
return (0);
}
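/*
* A sketch of the target mask built by gic_bind(): arm_gic_map[] holds the
* per-CPU GIC target bit recorded during init, so binding to the cpuset
* {0, 2} with, hypothetically, arm_gic_map[0] == 0x01 and
* arm_gic_map[2] == 0x04 yields mask 0x05, written as the single per-SPI
* byte of ITARGETSR.
*/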
#ifdef FDT
static int
gic_map_fdt(device_t dev, u_int ncells, pcell_t *cells, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
if (ncells == 1) {
*irqp = cells[0];
*polp = INTR_POLARITY_CONFORM;
*trigp = INTR_TRIGGER_CONFORM;
return (0);
}
if (ncells == 3) {
u_int irq, tripol;
/*
* The 1st cell is the interrupt type:
* 0 = SPI
* 1 = PPI
* The 2nd cell contains the interrupt number:
* [0 - 987] for SPI
* [0 - 15] for PPI
* The 3rd cell is the flags, encoded as follows:
* bits[3:0] trigger type and level flags
* 1 = low-to-high edge triggered
* 2 = high-to-low edge triggered
* 4 = active high level-sensitive
* 8 = active low level-sensitive
* bits[15:8] PPI interrupt cpu mask
* Each bit corresponds to each of the 8 possible cpus
* attached to the GIC. A bit set to '1' indicated
* the interrupt is wired to that CPU.
*/
switch (cells[0]) {
case 0:
irq = GIC_FIRST_SPI + cells[1];
/* SPI irq is checked later. */
break;
case 1:
irq = GIC_FIRST_PPI + cells[1];
if (irq > GIC_LAST_PPI) {
device_printf(dev, "unsupported PPI interrupt "
"number %u\n", cells[1]);
return (EINVAL);
}
break;
default:
device_printf(dev, "unsupported interrupt type "
"configuration %u\n", cells[0]);
return (EINVAL);
}
tripol = cells[2] & 0xff;
if (tripol & 0xf0 || (tripol & FDT_INTR_LOW_MASK &&
cells[0] == 0))
device_printf(dev, "unsupported trigger/polarity "
"configuration 0x%02x\n", tripol);
*irqp = irq;
*polp = INTR_POLARITY_CONFORM;
*trigp = tripol & FDT_INTR_EDGE_MASK ?
INTR_TRIGGER_EDGE : INTR_TRIGGER_LEVEL;
return (0);
}
return (EINVAL);
}
#endif
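/*
* An example of the three-cell encoding handled by gic_map_fdt() above,
* assuming a typical device node:
*
* interrupts = <0 61 4>;
*
* i.e. an SPI, interrupt number 61, active-high level-sensitive, which maps
* to GIC irq GIC_FIRST_SPI + 61 with INTR_TRIGGER_LEVEL.
*/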
static int
gic_map_msi(device_t dev, struct intr_map_data_msi *msi_data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_irqsrc *gi;
/* Map a non-GICv2m MSI */
gi = (struct gic_irqsrc *)msi_data->isrc;
if (gi == NULL)
return (ENXIO);
*irqp = gi->gi_irq;
/* MSI/MSI-X interrupts are always edge triggered with high polarity */
*polp = INTR_POLARITY_HIGH;
*trigp = INTR_TRIGGER_EDGE;
return (0);
}
static int
gic_map_intr(device_t dev, struct intr_map_data *data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
u_int irq;
enum intr_polarity pol;
enum intr_trigger trig;
struct arm_gic_softc *sc;
struct intr_map_data_msi *dam;
#ifdef FDT
struct intr_map_data_fdt *daf;
#endif
sc = device_get_softc(dev);
switch (data->type) {
#ifdef FDT
case INTR_MAP_DATA_FDT:
daf = (struct intr_map_data_fdt *)data;
if (gic_map_fdt(dev, daf->ncells, daf->cells, &irq, &pol,
&trig) != 0)
return (EINVAL);
KASSERT(irq >= sc->nirqs ||
(sc->gic_irqs[irq].gi_flags & GI_FLAG_MSI) == 0,
("%s: Attempting to map a MSI interrupt from FDT",
__func__));
break;
#endif
case INTR_MAP_DATA_MSI:
/* Non-GICv2m MSI */
dam = (struct intr_map_data_msi *)data;
if (gic_map_msi(dev, dam, &irq, &pol, &trig) != 0)
return (EINVAL);
break;
default:
return (ENOTSUP);
}
if (irq >= sc->nirqs)
return (EINVAL);
if (pol != INTR_POLARITY_CONFORM && pol != INTR_POLARITY_LOW &&
pol != INTR_POLARITY_HIGH)
return (EINVAL);
if (trig != INTR_TRIGGER_CONFORM && trig != INTR_TRIGGER_EDGE &&
trig != INTR_TRIGGER_LEVEL)
return (EINVAL);
*irqp = irq;
if (polp != NULL)
*polp = pol;
if (trigp != NULL)
*trigp = trig;
return (0);
}
static int
arm_gic_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
int error;
u_int irq;
struct arm_gic_softc *sc;
error = gic_map_intr(dev, data, &irq, NULL, NULL);
if (error == 0) {
sc = device_get_softc(dev);
*isrcp = GIC_INTR_ISRC(sc, irq);
}
return (error);
}
static int
arm_gic_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
enum intr_trigger trig;
enum intr_polarity pol;
if ((gi->gi_flags & GI_FLAG_MSI) == GI_FLAG_MSI) {
/* GICv2m MSI */
pol = gi->gi_pol;
trig = gi->gi_trig;
KASSERT(pol == INTR_POLARITY_HIGH,
("%s: MSI interrupts must be active-high", __func__));
KASSERT(trig == INTR_TRIGGER_EDGE,
("%s: MSI interrupts must be edge triggered", __func__));
} else if (data != NULL) {
u_int irq;
/* Get config for resource. */
if (gic_map_intr(dev, data, &irq, &pol, &trig) ||
gi->gi_irq != irq)
return (EINVAL);
} else {
pol = INTR_POLARITY_CONFORM;
trig = INTR_TRIGGER_CONFORM;
}
/* Compare config if this is not first setup. */
if (isrc->isrc_handlers != 0) {
if ((pol != INTR_POLARITY_CONFORM && pol != gi->gi_pol) ||
(trig != INTR_TRIGGER_CONFORM && trig != gi->gi_trig))
return (EINVAL);
else
return (0);
}
/* For MSI/MSI-X we should have already configured these */
if ((gi->gi_flags & GI_FLAG_MSI) == 0) {
if (pol == INTR_POLARITY_CONFORM)
pol = INTR_POLARITY_LOW; /* just pick some */
if (trig == INTR_TRIGGER_CONFORM)
trig = INTR_TRIGGER_EDGE; /* just pick some */
gi->gi_pol = pol;
gi->gi_trig = trig;
/* Edge triggered interrupts need an early EOI sent */
if (gi->gi_trig == INTR_TRIGGER_EDGE)
gi->gi_flags |= GI_FLAG_EARLY_EOI;
}
/*
* XXX - If a per-CPU interrupt is enabled at a time when SMP has
* already been started, we need some IPI call which enables it
* on the other CPUs. Further, it's more complicated as
* pic_enable_source() and pic_disable_source() should act on a
* per-CPU basis only. Thus, it should be solved here somehow.
*/
if (isrc->isrc_flags & INTR_ISRCF_PPI)
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
gic_config(sc, gi->gi_irq, gi->gi_trig, gi->gi_pol);
arm_gic_bind_intr(dev, isrc);
return (0);
}
static int
arm_gic_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
if (isrc->isrc_handlers == 0 && (gi->gi_flags & GI_FLAG_MSI) == 0) {
gi->gi_pol = INTR_POLARITY_CONFORM;
gi->gi_trig = INTR_TRIGGER_CONFORM;
}
return (0);
}
static void
arm_gic_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
arm_irq_memory_barrier(gi->gi_irq);
gic_irq_unmask(sc, gi->gi_irq);
}
static void
arm_gic_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
gic_irq_mask(sc, gi->gi_irq);
}
static void
arm_gic_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
arm_gic_disable_intr(dev, isrc);
gic_c_write_4(sc, GICC_EOIR, gi->gi_irq);
}
static void
arm_gic_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
arm_irq_memory_barrier(0);
arm_gic_enable_intr(dev, isrc);
}
static void
arm_gic_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
/* EOI for edge-triggered done earlier. */
if ((gi->gi_flags & GI_FLAG_EARLY_EOI) == GI_FLAG_EARLY_EOI)
return;
arm_irq_memory_barrier(0);
gic_c_write_4(sc, GICC_EOIR, gi->gi_irq);
}
static int
arm_gic_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
if (gi->gi_irq < GIC_FIRST_SPI)
return (EINVAL);
if (CPU_EMPTY(&isrc->isrc_cpu)) {
gic_irq_cpu = intr_irq_next_cpu(gic_irq_cpu, &all_cpus);
CPU_SETOF(gic_irq_cpu, &isrc->isrc_cpu);
}
return (gic_bind(sc, gi->gi_irq, &isrc->isrc_cpu));
}
#ifdef SMP
static void
arm_gic_ipi_send(device_t dev, struct intr_irqsrc *isrc, cpuset_t cpus,
u_int ipi)
{
struct arm_gic_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
uint32_t val = 0, i;
for (i = 0; i < MAXCPU; i++)
if (CPU_ISSET(i, &cpus))
val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT;
gic_d_write_4(sc, GICD_SGIR, val | gi->gi_irq);
}
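/*
* A sketch of the SGIR value written above: the low bits carry the SGI
* number (gi_irq) and the per-CPU target bits from arm_gic_map[] are shifted
* into the target-list field by GICD_SGI_TARGET_SHIFT, so a single write
* raises the IPI on every CPU present in 'cpus'.
*/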
static int
arm_gic_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc **isrcp)
{
struct intr_irqsrc *isrc;
struct arm_gic_softc *sc = device_get_softc(dev);
if (sgi_first_unused > GIC_LAST_SGI)
return (ENOSPC);
isrc = GIC_INTR_ISRC(sc, sgi_first_unused);
sgi_to_ipi[sgi_first_unused++] = ipi;
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
*isrcp = isrc;
return (0);
}
#endif
#else
static int
arm_gic_next_irq(struct arm_gic_softc *sc, int last_irq)
{
uint32_t active_irq;
active_irq = gic_c_read_4(sc, GICC_IAR);
/*
* Immediately EOIR the SGIs, because doing so requires the other
* bits (ie CPU number), not just the IRQ number, and we do not
* have this information later.
*/
if ((active_irq & 0x3ff) <= GIC_LAST_SGI)
gic_c_write_4(sc, GICC_EOIR, active_irq);
active_irq &= 0x3FF;
if (active_irq == 0x3FF) {
if (last_irq == -1)
device_printf(sc->gic_dev,
"Spurious interrupt detected\n");
return -1;
}
return active_irq;
}
static int
arm_gic_config(device_t dev, int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
struct arm_gic_softc *sc = device_get_softc(dev);
uint32_t reg;
uint32_t mask;
/* Function is public-accessible, so validate input arguments */
if ((irq < 0) || (irq >= sc->nirqs))
goto invalid_args;
if ((trig != INTR_TRIGGER_EDGE) && (trig != INTR_TRIGGER_LEVEL) &&
(trig != INTR_TRIGGER_CONFORM))
goto invalid_args;
if ((pol != INTR_POLARITY_HIGH) && (pol != INTR_POLARITY_LOW) &&
(pol != INTR_POLARITY_CONFORM))
goto invalid_args;
mtx_lock_spin(&sc->mutex);
reg = gic_d_read_4(sc, GICD_ICFGR(irq));
mask = (reg >> 2*(irq % 16)) & 0x3;
if (pol == INTR_POLARITY_LOW) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_LOW;
} else if (pol == INTR_POLARITY_HIGH) {
mask &= ~GICD_ICFGR_POL_MASK;
mask |= GICD_ICFGR_POL_HIGH;
}
if (trig == INTR_TRIGGER_LEVEL) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_LVL;
} else if (trig == INTR_TRIGGER_EDGE) {
mask &= ~GICD_ICFGR_TRIG_MASK;
mask |= GICD_ICFGR_TRIG_EDGE;
}
/* Set mask */
reg = reg & ~(0x3 << 2*(irq % 16));
reg = reg | (mask << 2*(irq % 16));
gic_d_write_4(sc, GICD_ICFGR(irq), reg);
mtx_unlock_spin(&sc->mutex);
return (0);
invalid_args:
device_printf(dev, "gic_config_irq: invalid parameters\n");
return (EINVAL);
}
static void
arm_gic_mask(device_t dev, int irq)
{
struct arm_gic_softc *sc = device_get_softc(dev);
gic_d_write_4(sc, GICD_ICENABLER(irq), (1UL << (irq & 0x1F)));
gic_c_write_4(sc, GICC_EOIR, irq); /* XXX - not allowed */
}
static void
arm_gic_unmask(device_t dev, int irq)
{
struct arm_gic_softc *sc = device_get_softc(dev);
if (irq > GIC_LAST_SGI)
arm_irq_memory_barrier(irq);
gic_d_write_4(sc, GICD_ISENABLER(irq), (1UL << (irq & 0x1F)));
}
#ifdef SMP
static void
arm_gic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi)
{
struct arm_gic_softc *sc = device_get_softc(dev);
uint32_t val = 0, i;
for (i = 0; i < MAXCPU; i++)
if (CPU_ISSET(i, &cpus))
val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT;
gic_d_write_4(sc, GICD_SGIR, val | ipi);
}
static int
arm_gic_ipi_read(device_t dev, int i)
{
if (i != -1) {
/*
* The intr code will automagically give the frame pointer
* if the interrupt argument is 0.
*/
if ((unsigned int)i > 16)
return (0);
return (i);
}
return (0x3ff);
}
static void
arm_gic_ipi_clear(device_t dev, int ipi)
{
/* no-op */
}
#endif
static void
gic_post_filter(void *arg)
{
struct arm_gic_softc *sc = gic_sc;
uintptr_t irq = (uintptr_t) arg;
if (irq > GIC_LAST_SGI)
arm_irq_memory_barrier(irq);
gic_c_write_4(sc, GICC_EOIR, irq);
}
static int
gic_config_irq(int irq, enum intr_trigger trig, enum intr_polarity pol)
{
return (arm_gic_config(gic_sc->gic_dev, irq, trig, pol));
}
void
arm_mask_irq(uintptr_t nb)
{
arm_gic_mask(gic_sc->gic_dev, nb);
}
void
arm_unmask_irq(uintptr_t nb)
{
arm_gic_unmask(gic_sc->gic_dev, nb);
}
int
arm_get_next_irq(int last_irq)
{
return (arm_gic_next_irq(gic_sc, last_irq));
}
#ifdef SMP
void
intr_pic_init_secondary(void)
{
arm_gic_init_secondary(gic_sc->gic_dev);
}
void
pic_ipi_send(cpuset_t cpus, u_int ipi)
{
arm_gic_ipi_send(gic_sc->gic_dev, cpus, ipi);
}
int
pic_ipi_read(int i)
{
return (arm_gic_ipi_read(gic_sc->gic_dev, i));
}
void
pic_ipi_clear(int ipi)
{
arm_gic_ipi_clear(gic_sc->gic_dev, ipi);
}
#endif
#endif /* INTRNG */
static device_method_t arm_gic_methods[] = {
#ifdef INTRNG
/* Bus interface */
DEVMETHOD(bus_print_child, arm_gic_print_child),
DEVMETHOD(bus_add_child, bus_generic_add_child),
DEVMETHOD(bus_alloc_resource, arm_gic_alloc_resource),
DEVMETHOD(bus_release_resource, bus_generic_release_resource),
DEVMETHOD(bus_activate_resource,bus_generic_activate_resource),
DEVMETHOD(bus_read_ivar, arm_gic_read_ivar),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, arm_gic_disable_intr),
DEVMETHOD(pic_enable_intr, arm_gic_enable_intr),
DEVMETHOD(pic_map_intr, arm_gic_map_intr),
DEVMETHOD(pic_setup_intr, arm_gic_setup_intr),
DEVMETHOD(pic_teardown_intr, arm_gic_teardown_intr),
DEVMETHOD(pic_post_filter, arm_gic_post_filter),
DEVMETHOD(pic_post_ithread, arm_gic_post_ithread),
DEVMETHOD(pic_pre_ithread, arm_gic_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, arm_gic_bind_intr),
DEVMETHOD(pic_init_secondary, arm_gic_init_secondary),
DEVMETHOD(pic_ipi_send, arm_gic_ipi_send),
DEVMETHOD(pic_ipi_setup, arm_gic_ipi_setup),
#endif
#endif
{ 0, 0 }
};
DEFINE_CLASS_0(gic, arm_gic_driver, arm_gic_methods,
sizeof(struct arm_gic_softc));
#ifdef INTRNG
/*
* GICv2m support -- the GICv2 MSI/MSI-X controller.
*/
#define GICV2M_MSI_TYPER 0x008
#define MSI_TYPER_SPI_BASE(x) (((x) >> 16) & 0x3ff)
#define MSI_TYPER_SPI_COUNT(x) (((x) >> 0) & 0x3ff)
#define GICv2M_MSI_SETSPI_NS 0x040
#define GICV2M_MSI_IIDR 0xFCC
int
arm_gicv2m_attach(device_t dev)
{
struct arm_gicv2m_softc *sc;
- struct arm_gic_softc *psc;
uint32_t typer;
int rid;
- psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
rid = 0;
sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (sc->sc_mem == NULL) {
device_printf(dev, "Unable to allocate resources\n");
return (ENXIO);
}
typer = bus_read_4(sc->sc_mem, GICV2M_MSI_TYPER);
sc->sc_spi_start = MSI_TYPER_SPI_BASE(typer);
sc->sc_spi_count = MSI_TYPER_SPI_COUNT(typer);
sc->sc_spi_end = sc->sc_spi_start + sc->sc_spi_count;
/* Reserve these interrupts for MSI/MSI-X use */
arm_gic_reserve_msi_range(device_get_parent(dev), sc->sc_spi_start,
sc->sc_spi_count);
mtx_init(&sc->sc_mutex, "GICv2m lock", "", MTX_DEF);
intr_msi_register(dev, sc->sc_xref);
if (bootverbose)
device_printf(dev, "using spi %u to %u\n", sc->sc_spi_start,
sc->sc_spi_start + sc->sc_spi_count - 1);
return (0);
}
static int
arm_gicv2m_alloc_msi(device_t dev, device_t child, int count, int maxcount,
device_t *pic, struct intr_irqsrc **srcs)
{
struct arm_gic_softc *psc;
struct arm_gicv2m_softc *sc;
int i, irq, end_irq;
bool found;
KASSERT(powerof2(count), ("%s: bad count", __func__));
KASSERT(powerof2(maxcount), ("%s: bad maxcount", __func__));
psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
found = false;
for (irq = sc->sc_spi_start; irq < sc->sc_spi_end; irq++) {
/* Start on an aligned interrupt */
if ((irq & (maxcount - 1)) != 0)
continue;
/* Assume we found a valid range until shown otherwise */
found = true;
/* Check this range is valid */
for (end_irq = irq; end_irq != irq + count; end_irq++) {
/* No free interrupts */
if (end_irq == sc->sc_spi_end) {
found = false;
break;
}
KASSERT((psc->gic_irqs[end_irq].gi_flags & GI_FLAG_MSI) != 0,
("%s: Non-MSI interrupt found", __func__));
/* This is already used */
if ((psc->gic_irqs[end_irq].gi_flags & GI_FLAG_MSI_USED) ==
GI_FLAG_MSI_USED) {
found = false;
break;
}
}
if (found)
break;
}
/* Not enough interrupts were found */
if (!found || irq == sc->sc_spi_end) {
mtx_unlock(&sc->sc_mutex);
return (ENXIO);
}
for (i = 0; i < count; i++) {
/* Mark the interrupt as used */
psc->gic_irqs[irq + i].gi_flags |= GI_FLAG_MSI_USED;
}
mtx_unlock(&sc->sc_mutex);
for (i = 0; i < count; i++)
srcs[i] = (struct intr_irqsrc *)&psc->gic_irqs[irq + i];
*pic = device_get_parent(dev);
return (0);
}
static int
arm_gicv2m_release_msi(device_t dev, device_t child, int count,
struct intr_irqsrc **isrc)
{
struct arm_gicv2m_softc *sc;
struct gic_irqsrc *gi;
int i;
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
for (i = 0; i < count; i++) {
gi = (struct gic_irqsrc *)isrc[i];
KASSERT((gi->gi_flags & GI_FLAG_MSI_USED) == GI_FLAG_MSI_USED,
("%s: Trying to release an unused MSI-X interrupt",
__func__));
gi->gi_flags &= ~GI_FLAG_MSI_USED;
}
mtx_unlock(&sc->sc_mutex);
return (0);
}
static int
arm_gicv2m_alloc_msix(device_t dev, device_t child, device_t *pic,
struct intr_irqsrc **isrcp)
{
struct arm_gicv2m_softc *sc;
struct arm_gic_softc *psc;
int irq;
psc = device_get_softc(device_get_parent(dev));
sc = device_get_softc(dev);
mtx_lock(&sc->sc_mutex);
/* Find an unused interrupt */
for (irq = sc->sc_spi_start; irq < sc->sc_spi_end; irq++) {
KASSERT((psc->gic_irqs[irq].gi_flags & GI_FLAG_MSI) != 0,
("%s: Non-MSI interrupt found", __func__));
if ((psc->gic_irqs[irq].gi_flags & GI_FLAG_MSI_USED) == 0)
break;
}
/* No free interrupt was found */
if (irq == sc->sc_spi_end) {
mtx_unlock(&sc->sc_mutex);
return (ENXIO);
}
/* Mark the interrupt as used */
psc->gic_irqs[irq].gi_flags |= GI_FLAG_MSI_USED;
mtx_unlock(&sc->sc_mutex);
*isrcp = (struct intr_irqsrc *)&psc->gic_irqs[irq];
*pic = device_get_parent(dev);
return (0);
}
static int
arm_gicv2m_release_msix(device_t dev, device_t child, struct intr_irqsrc *isrc)
{
struct arm_gicv2m_softc *sc;
struct gic_irqsrc *gi;
sc = device_get_softc(dev);
gi = (struct gic_irqsrc *)isrc;
KASSERT((gi->gi_flags & GI_FLAG_MSI_USED) == GI_FLAG_MSI_USED,
("%s: Trying to release an unused MSI-X interrupt", __func__));
mtx_lock(&sc->sc_mutex);
gi->gi_flags &= ~GI_FLAG_MSI_USED;
mtx_unlock(&sc->sc_mutex);
return (0);
}
static int
arm_gicv2m_map_msi(device_t dev, device_t child, struct intr_irqsrc *isrc,
uint64_t *addr, uint32_t *data)
{
struct arm_gicv2m_softc *sc = device_get_softc(dev);
struct gic_irqsrc *gi = (struct gic_irqsrc *)isrc;
*addr = vtophys(rman_get_virtual(sc->sc_mem)) + GICv2M_MSI_SETSPI_NS;
*data = gi->gi_irq;
return (0);
}
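/*
* Note on the address/data pair returned above: a device programmed with
* this MSI address writes the data value (the SPI number) into the frame's
* MSI_SETSPI_NS register, which the GICv2m hardware turns into that SPI
* being asserted on the parent GIC.
*/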
static device_method_t arm_gicv2m_methods[] = {
/* Device interface */
DEVMETHOD(device_attach, arm_gicv2m_attach),
/* MSI/MSI-X */
DEVMETHOD(msi_alloc_msi, arm_gicv2m_alloc_msi),
DEVMETHOD(msi_release_msi, arm_gicv2m_release_msi),
DEVMETHOD(msi_alloc_msix, arm_gicv2m_alloc_msix),
DEVMETHOD(msi_release_msix, arm_gicv2m_release_msix),
DEVMETHOD(msi_map_msi, arm_gicv2m_map_msi),
/* End */
DEVMETHOD_END
};
DEFINE_CLASS_0(gicv2m, arm_gicv2m_driver, arm_gicv2m_methods,
sizeof(struct arm_gicv2m_softc));
#endif
Index: head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_cpufreq.c (revision 327173)
@@ -1,1640 +1,1642 @@
/*-
* Copyright (C) 2013-2015 Daisuke Aoyama <aoyama@peach.ne.jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sema.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox_prop.h>
#include <arm/broadcom/bcm2835/bcm2835_vcbus.h>
#include "cpufreq_if.h"
#include "mbox_if.h"
#ifdef DEBUG
#define DPRINTF(fmt, ...) do { \
printf("%s:%u: ", __func__, __LINE__); \
printf(fmt, ##__VA_ARGS__); \
} while (0)
#else
#define DPRINTF(fmt, ...)
#endif
#define HZ2MHZ(freq) ((freq) / (1000 * 1000))
#define MHZ2HZ(freq) ((freq) * (1000 * 1000))
#ifdef SOC_BCM2835
#define OFFSET2MVOLT(val) (1200 + ((val) * 25))
#define MVOLT2OFFSET(val) (((val) - 1200) / 25)
#define DEFAULT_ARM_FREQUENCY 700
#define DEFAULT_LOWEST_FREQ 300
#else
#define OFFSET2MVOLT(val) (((val) / 1000))
#define MVOLT2OFFSET(val) (((val) * 1000))
#define DEFAULT_ARM_FREQUENCY 600
#define DEFAULT_LOWEST_FREQ 600
#endif
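/*
* A worked example of the BCM2835 conversion above: the firmware reports
* voltages as an offset from 1.2V in 0.025V steps, so an offset of 2 gives
* OFFSET2MVOLT(2) == 1250 (mV) and an offset of -4 gives 1100 (mV);
* MVOLT2OFFSET() is the inverse.
*/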
#define DEFAULT_CORE_FREQUENCY 250
#define DEFAULT_SDRAM_FREQUENCY 400
#define TRANSITION_LATENCY 1000
#define MIN_OVER_VOLTAGE -16
#define MAX_OVER_VOLTAGE 6
#define MSG_ERROR -999999999
#define MHZSTEP 100
#define HZSTEP (MHZ2HZ(MHZSTEP))
#define TZ_ZEROC 2731
#define VC_LOCK(sc) do { \
sema_wait(&vc_sema); \
} while (0)
#define VC_UNLOCK(sc) do { \
sema_post(&vc_sema); \
} while (0)
/* ARM->VC mailbox property semaphore */
static struct sema vc_sema;
static struct sysctl_ctx_list bcm2835_sysctl_ctx;
struct bcm2835_cpufreq_softc {
device_t dev;
int arm_max_freq;
int arm_min_freq;
int core_max_freq;
int core_min_freq;
int sdram_max_freq;
int sdram_min_freq;
int max_voltage_core;
int min_voltage_core;
/* the values written in mbox */
int voltage_core;
int voltage_sdram;
int voltage_sdram_c;
int voltage_sdram_i;
int voltage_sdram_p;
int turbo_mode;
/* initial hook for waiting mbox intr */
struct intr_config_hook init_hook;
};
static struct ofw_compat_data compat_data[] = {
{ "broadcom,bcm2835-vc", 1 },
{ "broadcom,bcm2708-vc", 1 },
{ "brcm,bcm2709", 1 },
{ "brcm,bcm2836", 1 },
{ NULL, 0 }
};
static int cpufreq_verbose = 0;
TUNABLE_INT("hw.bcm2835.cpufreq.verbose", &cpufreq_verbose);
static int cpufreq_lowest_freq = DEFAULT_LOWEST_FREQ;
TUNABLE_INT("hw.bcm2835.cpufreq.lowest_freq", &cpufreq_lowest_freq);
#ifdef PROP_DEBUG
static void
bcm2835_dump(const void *data, int len)
{
const uint8_t *p = (const uint8_t*)data;
int i;
printf("dump @ %p:\n", data);
for (i = 0; i < len; i++) {
printf("%2.2x ", p[i]);
if ((i % 4) == 3)
printf(" ");
if ((i % 16) == 15)
printf("\n");
}
printf("\n");
}
#endif
static int
bcm2835_cpufreq_get_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_clock_rate msg;
int rate;
int err;
/*
* Get clock rate
* Tag: 0x00030002
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_max_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_max_clock_rate msg;
int rate;
int err;
/*
* Get max clock rate
* Tag: 0x00030004
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MAX_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get max clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_min_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id)
{
struct msg_get_min_clock_rate msg;
int rate;
int err;
/*
* Get min clock rate
* Tag: 0x00030007
* Request:
* Length: 4
* Value:
* u32: clock id
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MIN_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get min clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_set_clock_rate(struct bcm2835_cpufreq_softc *sc,
uint32_t clock_id, uint32_t rate_hz)
{
struct msg_set_clock_rate msg;
int rate;
int err;
/*
* Set clock rate
* Tag: 0x00038002
* Request:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
* Response:
* Length: 8
* Value:
* u32: clock id
* u32: rate (in Hz)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.body.req.rate_hz = rate_hz;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set clock rate (id=%u)\n",
clock_id);
return (MSG_ERROR);
}
/* workaround for core clock */
if (clock_id == BCM2835_MBOX_CLOCK_ID_CORE) {
/* for safety (may change voltage without changing clock) */
DELAY(TRANSITION_LATENCY);
/*
* XXX: the core clock does not always change on the first request,
* so write it a second time to make sure it takes effect.
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.body.req.rate_hz = rate_hz;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev,
"can't set clock rate (id=%u)\n", clock_id);
return (MSG_ERROR);
}
}
/* result (Hz) */
rate = (int)msg.body.resp.rate_hz;
DPRINTF("clock = %d(Hz)\n", rate);
return (rate);
}
static int
bcm2835_cpufreq_get_turbo(struct bcm2835_cpufreq_softc *sc)
{
struct msg_get_turbo msg;
int level;
int err;
/*
* Get turbo
* Tag: 0x00030009
* Request:
* Length: 4
* Value:
* u32: id
* Response:
* Length: 8
* Value:
* u32: id
* u32: level
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_TURBO;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.id = 0;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get turbo\n");
return (MSG_ERROR);
}
/* result 0=non-turbo, 1=turbo */
level = (int)msg.body.resp.level;
DPRINTF("level = %d\n", level);
return (level);
}
static int
bcm2835_cpufreq_set_turbo(struct bcm2835_cpufreq_softc *sc, uint32_t level)
{
struct msg_set_turbo msg;
int value;
int err;
/*
* Set turbo
* Tag: 0x00038009
* Request:
* Length: 8
* Value:
* u32: id
* u32: level
* Response:
* Length: 8
* Value:
* u32: id
* u32: level
*/
/* replace unknown values with OFF */
if (level != BCM2835_MBOX_TURBO_ON && level != BCM2835_MBOX_TURBO_OFF)
level = BCM2835_MBOX_TURBO_OFF;
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_TURBO;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.id = 0;
msg.body.req.level = level;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set turbo\n");
return (MSG_ERROR);
}
/* result 0=non-turbo, 1=turbo */
value = (int)msg.body.resp.level;
DPRINTF("level = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030003
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_max_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_max_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030005
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MAX_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get max voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_min_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id)
{
struct msg_get_min_voltage msg;
int value;
int err;
/*
* Get voltage
* Tag: 0x00030008
* Request:
* Length: 4
* Value:
* u32: voltage id
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_MIN_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get min voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_set_voltage(struct bcm2835_cpufreq_softc *sc,
uint32_t voltage_id, int32_t value)
{
struct msg_set_voltage msg;
int err;
/*
* Set voltage
* Tag: 0x00038003
* Request:
* Length: 4
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
* Response:
* Length: 8
* Value:
* u32: voltage id
* u32: value (offset from 1.2V in units of 0.025V)
*/
/*
* over_voltage:
* 0 (1.2 V). Values above 6 are only allowed when force_turbo or
* current_limit_override are specified (which set the warranty bit).
*/
if (value > MAX_OVER_VOLTAGE || value < MIN_OVER_VOLTAGE) {
/* currently not supported */
device_printf(sc->dev, "unsupported voltage: %d\n", value);
return (MSG_ERROR);
}
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_VOLTAGE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.voltage_id = voltage_id;
msg.body.req.value = (uint32_t)value;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't set voltage\n");
return (MSG_ERROR);
}
/* result (offset from 1.2V) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
bcm2835_cpufreq_get_temperature(struct bcm2835_cpufreq_softc *sc)
{
struct msg_get_temperature msg;
int value;
int err;
/*
* Get temperature
* Tag: 0x00030006
* Request:
* Length: 4
* Value:
* u32: temperature id
* Response:
* Length: 8
* Value:
* u32: temperature id
* u32: value
*/
/* setup single tag buffer */
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_TEMPERATURE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.temperature_id = 0;
msg.end_tag = 0;
/* call mailbox property */
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err) {
device_printf(sc->dev, "can't get temperature\n");
return (MSG_ERROR);
}
/* result (temperature of degree C) */
value = (int)msg.body.resp.value;
DPRINTF("value = %d\n", value);
return (value);
}
static int
sysctl_bcm2835_cpufreq_arm_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
val);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set clock arm_freq error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_core_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set clock core_freq error\n");
return (EIO);
}
VC_UNLOCK(sc);
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_sdram_freq(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_SDRAM);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
VC_LOCK(sc);
err = bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_SDRAM,
val);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set clock sdram_freq error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_turbo(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_turbo(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > 0)
sc->turbo_mode = BCM2835_MBOX_TURBO_ON;
else
sc->turbo_mode = BCM2835_MBOX_TURBO_OFF;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_turbo(sc, sc->turbo_mode);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set turbo error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_core(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_CORE);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_core = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_CORE,
sc->voltage_core);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage core error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_c(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_c = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C,
sc->voltage_sdram_c);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_c error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_i(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_i = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I,
sc->voltage_sdram_i);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_i error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram_p(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram_p = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P,
sc->voltage_sdram_p);
VC_UNLOCK(sc);
if (err == MSG_ERROR) {
device_printf(sc->dev, "set voltage sdram_p error\n");
return (EIO);
}
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_voltage_sdram(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* write only; sets all three SDRAM voltages at once */
if (!req->newptr)
return (EINVAL);
val = 0;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err)
return (err);
/* write request */
if (val > MAX_OVER_VOLTAGE || val < MIN_OVER_VOLTAGE)
return (EINVAL);
sc->voltage_sdram = val;
VC_LOCK(sc);
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_C,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_c error\n");
return (EIO);
}
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_I,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_i error\n");
return (EIO);
}
err = bcm2835_cpufreq_set_voltage(sc, BCM2835_MBOX_VOLTAGE_ID_SDRAM_P,
val);
if (err == MSG_ERROR) {
VC_UNLOCK(sc);
device_printf(sc->dev, "set voltage sdram_p error\n");
return (EIO);
}
VC_UNLOCK(sc);
DELAY(TRANSITION_LATENCY);
return (0);
}
static int
sysctl_bcm2835_cpufreq_temperature(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_temperature(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
return (EINVAL);
}
static int
sysctl_bcm2835_devcpu_temperature(SYSCTL_HANDLER_ARGS)
{
struct bcm2835_cpufreq_softc *sc = arg1;
int val;
int err;
/* get realtime value */
VC_LOCK(sc);
val = bcm2835_cpufreq_get_temperature(sc);
VC_UNLOCK(sc);
if (val == MSG_ERROR)
return (EIO);
/* 1/1000 celsius (raw) to 1/10 kelvin */
val = val / 100 + TZ_ZEROC;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err || !req->newptr) /* error || read request */
return (err);
/* write request */
return (EINVAL);
}
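/*
* A worked example of the conversion above: the firmware reports
* milli-degrees Celsius, so a reading of 45000 becomes
* 45000 / 100 + TZ_ZEROC = 450 + 2731 = 3181 tenths of a kelvin (318.1K),
* which the "IK" sysctl format prints as 45.0C.
*/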
static void
bcm2835_cpufreq_init(void *arg)
{
struct bcm2835_cpufreq_softc *sc = arg;
struct sysctl_ctx_list *ctx;
device_t cpu;
int arm_freq, core_freq, sdram_freq;
int arm_max_freq, arm_min_freq, core_max_freq, core_min_freq;
int sdram_max_freq, sdram_min_freq;
int voltage_core, voltage_sdram_c, voltage_sdram_i, voltage_sdram_p;
int max_voltage_core, min_voltage_core;
int max_voltage_sdram_c, min_voltage_sdram_c;
int max_voltage_sdram_i, min_voltage_sdram_i;
int max_voltage_sdram_p, min_voltage_sdram_p;
int turbo, temperature;
VC_LOCK(sc);
/* current clock */
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
core_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
sdram_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
/* max/min clock */
arm_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
arm_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
core_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
core_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
sdram_max_freq = bcm2835_cpufreq_get_max_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
sdram_min_freq = bcm2835_cpufreq_get_min_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM);
/* turbo mode */
turbo = bcm2835_cpufreq_get_turbo(sc);
if (turbo > 0)
sc->turbo_mode = BCM2835_MBOX_TURBO_ON;
else
sc->turbo_mode = BCM2835_MBOX_TURBO_OFF;
/* voltage */
voltage_core = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
voltage_sdram_c = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
voltage_sdram_i = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
voltage_sdram_p = bcm2835_cpufreq_get_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
/* current values (offset from 1.2V) */
sc->voltage_core = voltage_core;
sc->voltage_sdram = voltage_sdram_c;
sc->voltage_sdram_c = voltage_sdram_c;
sc->voltage_sdram_i = voltage_sdram_i;
sc->voltage_sdram_p = voltage_sdram_p;
/* max/min voltage */
max_voltage_core = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
min_voltage_core = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE);
max_voltage_sdram_c = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
max_voltage_sdram_i = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
max_voltage_sdram_p = bcm2835_cpufreq_get_max_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
min_voltage_sdram_c = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_C);
min_voltage_sdram_i = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_I);
min_voltage_sdram_p = bcm2835_cpufreq_get_min_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_SDRAM_P);
/* temperature */
temperature = bcm2835_cpufreq_get_temperature(sc);
/* show result */
if (cpufreq_verbose || bootverbose) {
device_printf(sc->dev, "Boot settings:\n");
device_printf(sc->dev,
"current ARM %dMHz, Core %dMHz, SDRAM %dMHz, Turbo %s\n",
HZ2MHZ(arm_freq), HZ2MHZ(core_freq), HZ2MHZ(sdram_freq),
(sc->turbo_mode == BCM2835_MBOX_TURBO_ON) ? "ON" : "OFF");
device_printf(sc->dev,
"max/min ARM %d/%dMHz, Core %d/%dMHz, SDRAM %d/%dMHz\n",
HZ2MHZ(arm_max_freq), HZ2MHZ(arm_min_freq),
HZ2MHZ(core_max_freq), HZ2MHZ(core_min_freq),
HZ2MHZ(sdram_max_freq), HZ2MHZ(sdram_min_freq));
device_printf(sc->dev,
"current Core %dmV, SDRAM_C %dmV, SDRAM_I %dmV, "
"SDRAM_P %dmV\n",
OFFSET2MVOLT(voltage_core), OFFSET2MVOLT(voltage_sdram_c),
OFFSET2MVOLT(voltage_sdram_i),
OFFSET2MVOLT(voltage_sdram_p));
device_printf(sc->dev,
"max/min Core %d/%dmV, SDRAM_C %d/%dmV, SDRAM_I %d/%dmV, "
"SDRAM_P %d/%dmV\n",
OFFSET2MVOLT(max_voltage_core),
OFFSET2MVOLT(min_voltage_core),
OFFSET2MVOLT(max_voltage_sdram_c),
OFFSET2MVOLT(min_voltage_sdram_c),
OFFSET2MVOLT(max_voltage_sdram_i),
OFFSET2MVOLT(min_voltage_sdram_i),
OFFSET2MVOLT(max_voltage_sdram_p),
OFFSET2MVOLT(min_voltage_sdram_p));
device_printf(sc->dev,
"Temperature %d.%dC\n", (temperature / 1000),
(temperature % 1000) / 100);
} else { /* !cpufreq_verbose && !bootverbose */
device_printf(sc->dev,
"ARM %dMHz, Core %dMHz, SDRAM %dMHz, Turbo %s\n",
HZ2MHZ(arm_freq), HZ2MHZ(core_freq), HZ2MHZ(sdram_freq),
(sc->turbo_mode == BCM2835_MBOX_TURBO_ON) ? "ON" : "OFF");
}
/* keep in softc (MHz/mV) */
sc->arm_max_freq = HZ2MHZ(arm_max_freq);
sc->arm_min_freq = HZ2MHZ(arm_min_freq);
sc->core_max_freq = HZ2MHZ(core_max_freq);
sc->core_min_freq = HZ2MHZ(core_min_freq);
sc->sdram_max_freq = HZ2MHZ(sdram_max_freq);
sc->sdram_min_freq = HZ2MHZ(sdram_min_freq);
sc->max_voltage_core = OFFSET2MVOLT(max_voltage_core);
sc->min_voltage_core = OFFSET2MVOLT(min_voltage_core);
/* if turbo is on, set to max values */
if (sc->turbo_mode == BCM2835_MBOX_TURBO_ON) {
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
arm_max_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
core_max_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM, sdram_max_freq);
DELAY(TRANSITION_LATENCY);
} else {
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_ARM,
arm_min_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc, BCM2835_MBOX_CLOCK_ID_CORE,
core_min_freq);
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM, sdram_min_freq);
DELAY(TRANSITION_LATENCY);
}
VC_UNLOCK(sc);
/* add human readable temperature to dev.cpu node */
cpu = device_get_parent(sc->dev);
if (cpu != NULL) {
ctx = device_get_sysctl_ctx(cpu);
SYSCTL_ADD_PROC(ctx,
SYSCTL_CHILDREN(device_get_sysctl_tree(cpu)), OID_AUTO,
"temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0,
sysctl_bcm2835_devcpu_temperature, "IK",
"Current SoC temperature");
}
/* release this hook (continue boot) */
config_intrhook_disestablish(&sc->init_hook);
}
static void
bcm2835_cpufreq_identify(driver_t *driver, device_t parent)
{
const struct ofw_compat_data *compat;
phandle_t root;
root = OF_finddevice("/");
for (compat = compat_data; compat->ocd_str != NULL; compat++)
if (ofw_bus_node_is_compatible(root, compat->ocd_str))
break;
if (compat->ocd_data == 0)
return;
DPRINTF("driver=%p, parent=%p\n", driver, parent);
if (device_find_child(parent, "bcm2835_cpufreq", -1) != NULL)
return;
if (BUS_ADD_CHILD(parent, 0, "bcm2835_cpufreq", -1) == NULL)
device_printf(parent, "add child failed\n");
}
static int
bcm2835_cpufreq_probe(device_t dev)
{
if (device_get_unit(dev) != 0)
return (ENXIO);
device_set_desc(dev, "CPU Frequency Control");
return (0);
}
static int
bcm2835_cpufreq_attach(device_t dev)
{
struct bcm2835_cpufreq_softc *sc;
struct sysctl_oid *oid;
/* set self dev */
sc = device_get_softc(dev);
sc->dev = dev;
/* initial values */
sc->arm_max_freq = -1;
sc->arm_min_freq = -1;
sc->core_max_freq = -1;
sc->core_min_freq = -1;
sc->sdram_max_freq = -1;
sc->sdram_min_freq = -1;
sc->max_voltage_core = 0;
sc->min_voltage_core = 0;
/* setup sysctl at first device */
if (device_get_unit(dev) == 0) {
sysctl_ctx_init(&bcm2835_sysctl_ctx);
/* create node for hw.cpufreq */
oid = SYSCTL_ADD_NODE(&bcm2835_sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, "cpufreq",
CTLFLAG_RD, NULL, "");
/* Frequency (Hz) */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "arm_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_arm_freq, "IU",
"ARM frequency (Hz)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "core_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_core_freq, "IU",
"Core frequency (Hz)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "sdram_freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_sdram_freq, "IU",
"SDRAM frequency (Hz)");
/* Turbo state */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "turbo", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_turbo, "IU",
"Disables dynamic clocking");
/* Voltage (offset from 1.2V in units of 0.025V) */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_core", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
sysctl_bcm2835_cpufreq_voltage_core, "I",
"ARM/GPU core voltage "
"(offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram", CTLTYPE_INT | CTLFLAG_WR, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram, "I",
"SDRAM voltage (offset from 1.2V in units of 0.025V)");
/* Voltage individual SDRAM */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_c", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_c, "I",
"SDRAM controller voltage "
"(offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_i", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_i, "I",
"SDRAM I/O voltage (offset from 1.2V in units of 0.025V)");
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "voltage_sdram_p", CTLTYPE_INT | CTLFLAG_RW, sc,
0, sysctl_bcm2835_cpufreq_voltage_sdram_p, "I",
"SDRAM phy voltage (offset from 1.2V in units of 0.025V)");
/* Temperature */
SYSCTL_ADD_PROC(&bcm2835_sysctl_ctx, SYSCTL_CHILDREN(oid),
OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0,
sysctl_bcm2835_cpufreq_temperature, "I",
"SoC temperature (thousandths of a degree C)");
}
/* ARM->VC lock */
sema_init(&vc_sema, 1, "vcsema");
/* register callback for using mbox when interrupts are enabled */
sc->init_hook.ich_func = bcm2835_cpufreq_init;
sc->init_hook.ich_arg = sc;
if (config_intrhook_establish(&sc->init_hook) != 0) {
device_printf(dev, "config_intrhook_establish failed\n");
return (ENOMEM);
}
/* this device is controlled by cpufreq(4) */
cpufreq_register(dev);
return (0);
}
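/*
* A usage sketch for the knobs created above (the values shown are
* hypothetical):
*
* sysctl hw.cpufreq.arm_freq # read the ARM clock in Hz
* sysctl hw.cpufreq.arm_freq=900000000 # request 900 MHz
* sysctl hw.cpufreq.turbo=1 # enable turbo (disable dynamic clocking)
* sysctl dev.cpu.0.temperature # temperature node added at init time
*
* All handlers serialize their mailbox traffic with the vc_sema semaphore
* via VC_LOCK()/VC_UNLOCK().
*/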
static int
bcm2835_cpufreq_detach(device_t dev)
{
- struct bcm2835_cpufreq_softc *sc;
- sc = device_get_softc(dev);
-
sema_destroy(&vc_sema);
return (cpufreq_unregister(dev));
}
static int
bcm2835_cpufreq_set(device_t dev, const struct cf_setting *cf)
{
struct bcm2835_cpufreq_softc *sc;
uint32_t rate_hz, rem;
- int cur_freq, resp_freq, arm_freq, min_freq, core_freq;
+ int resp_freq, arm_freq, min_freq, core_freq;
+#ifdef DEBUG
+ int cur_freq;
+#endif
if (cf == NULL || cf->freq < 0)
return (EINVAL);
sc = device_get_softc(dev);
/* setting clock (Hz) */
rate_hz = (uint32_t)MHZ2HZ(cf->freq);
rem = rate_hz % HZSTEP;
rate_hz -= rem;
if (rate_hz == 0)
return (EINVAL);
/* adjust min freq */
min_freq = sc->arm_min_freq;
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON)
if (min_freq > cpufreq_lowest_freq)
min_freq = cpufreq_lowest_freq;
if (rate_hz < MHZ2HZ(min_freq) || rate_hz > MHZ2HZ(sc->arm_max_freq))
return (EINVAL);
/* set new value and verify it */
VC_LOCK(sc);
+#ifdef DEBUG
cur_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
+#endif
resp_freq = bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM, rate_hz);
DELAY(TRANSITION_LATENCY);
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
/*
* If non-turbo and the requested rate is lower than or equal to
* min_freq, clock the core and SDRAM down to their defaults first.
*/
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON) {
core_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE);
if (rate_hz > MHZ2HZ(sc->arm_min_freq)) {
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(sc->core_max_freq));
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM,
MHZ2HZ(sc->sdram_max_freq));
DELAY(TRANSITION_LATENCY);
} else {
if (sc->core_min_freq < DEFAULT_CORE_FREQUENCY &&
core_freq > DEFAULT_CORE_FREQUENCY) {
/* first, down to 250, then down to min */
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(DEFAULT_CORE_FREQUENCY));
DELAY(TRANSITION_LATENCY);
/* reset core voltage */
bcm2835_cpufreq_set_voltage(sc,
BCM2835_MBOX_VOLTAGE_ID_CORE, 0);
DELAY(TRANSITION_LATENCY);
}
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_CORE,
MHZ2HZ(sc->core_min_freq));
DELAY(TRANSITION_LATENCY);
bcm2835_cpufreq_set_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_SDRAM,
MHZ2HZ(sc->sdram_min_freq));
DELAY(TRANSITION_LATENCY);
}
}
VC_UNLOCK(sc);
if (resp_freq < 0 || arm_freq < 0 || resp_freq != arm_freq) {
device_printf(dev, "wrong freq\n");
return (EIO);
}
DPRINTF("cpufreq: %d -> %d\n", cur_freq, arm_freq);
return (0);
}
static int
bcm2835_cpufreq_get(device_t dev, struct cf_setting *cf)
{
struct bcm2835_cpufreq_softc *sc;
int arm_freq;
if (cf == NULL)
return (EINVAL);
sc = device_get_softc(dev);
memset(cf, CPUFREQ_VAL_UNKNOWN, sizeof(*cf));
cf->dev = NULL;
/* get current value */
VC_LOCK(sc);
arm_freq = bcm2835_cpufreq_get_clock_rate(sc,
BCM2835_MBOX_CLOCK_ID_ARM);
VC_UNLOCK(sc);
if (arm_freq < 0) {
device_printf(dev, "can't get clock\n");
return (EINVAL);
}
/* CPU clock in MHz or 100ths of a percent. */
cf->freq = HZ2MHZ(arm_freq);
/* Voltage in mV. */
cf->volts = CPUFREQ_VAL_UNKNOWN;
/* Power consumed in mW. */
cf->power = CPUFREQ_VAL_UNKNOWN;
/* Transition latency in us. */
cf->lat = TRANSITION_LATENCY;
/* Driver providing this setting. */
cf->dev = dev;
return (0);
}
static int
bcm2835_cpufreq_make_freq_list(device_t dev, struct cf_setting *sets,
int *count)
{
struct bcm2835_cpufreq_softc *sc;
int freq, min_freq, volts, rem;
int idx;
sc = device_get_softc(dev);
freq = sc->arm_max_freq;
min_freq = sc->arm_min_freq;
/* adjust head freq to STEP */
rem = freq % MHZSTEP;
freq -= rem;
if (freq < min_freq)
freq = min_freq;
/* if non-turbo, add extra low freq */
if (sc->turbo_mode != BCM2835_MBOX_TURBO_ON)
if (min_freq > cpufreq_lowest_freq)
min_freq = cpufreq_lowest_freq;
#ifdef SOC_BCM2835
/* from freq to min_freq */
for (idx = 0; idx < *count && freq >= min_freq; idx++) {
if (freq > sc->arm_min_freq)
volts = sc->max_voltage_core;
else
volts = sc->min_voltage_core;
sets[idx].freq = freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
freq -= MHZSTEP;
}
#else
/* XXX RPi2 has only 900/600MHz */
idx = 0;
volts = sc->min_voltage_core;
sets[idx].freq = freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
idx++;
if (freq != min_freq) {
sets[idx].freq = min_freq;
sets[idx].volts = volts;
sets[idx].lat = TRANSITION_LATENCY;
sets[idx].dev = dev;
idx++;
}
#endif
*count = idx;
return (0);
}
static int
bcm2835_cpufreq_settings(device_t dev, struct cf_setting *sets, int *count)
{
struct bcm2835_cpufreq_softc *sc;
if (sets == NULL || count == NULL)
return (EINVAL);
sc = device_get_softc(dev);
if (sc->arm_min_freq < 0 || sc->arm_max_freq < 0) {
printf("device is not configured\n");
return (EINVAL);
}
/* fill data with unknown value */
memset(sets, CPUFREQ_VAL_UNKNOWN, sizeof(*sets) * (*count));
/* create new array up to count */
bcm2835_cpufreq_make_freq_list(dev, sets, count);
return (0);
}
static int
bcm2835_cpufreq_type(device_t dev, int *type)
{
if (type == NULL)
return (EINVAL);
*type = CPUFREQ_TYPE_ABSOLUTE;
return (0);
}
static device_method_t bcm2835_cpufreq_methods[] = {
/* Device interface */
DEVMETHOD(device_identify, bcm2835_cpufreq_identify),
DEVMETHOD(device_probe, bcm2835_cpufreq_probe),
DEVMETHOD(device_attach, bcm2835_cpufreq_attach),
DEVMETHOD(device_detach, bcm2835_cpufreq_detach),
/* cpufreq interface */
DEVMETHOD(cpufreq_drv_set, bcm2835_cpufreq_set),
DEVMETHOD(cpufreq_drv_get, bcm2835_cpufreq_get),
DEVMETHOD(cpufreq_drv_settings, bcm2835_cpufreq_settings),
DEVMETHOD(cpufreq_drv_type, bcm2835_cpufreq_type),
DEVMETHOD_END
};
static devclass_t bcm2835_cpufreq_devclass;
static driver_t bcm2835_cpufreq_driver = {
"bcm2835_cpufreq",
bcm2835_cpufreq_methods,
sizeof(struct bcm2835_cpufreq_softc),
};
DRIVER_MODULE(bcm2835_cpufreq, cpu, bcm2835_cpufreq_driver,
bcm2835_cpufreq_devclass, 0, 0);
Index: head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_gpio.c (revision 327173)
@@ -1,1232 +1,1231 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Oleksandr Tymoshenko <gonzo@FreeBSD.org>
* Copyright (c) 2012-2015 Luiz Otavio O Souza <loos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/gpio.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <dev/gpio/gpiobusvar.h>
#include <dev/ofw/ofw_bus.h>
#include <arm/broadcom/bcm2835/bcm2835_gpio.h>
#include "gpio_if.h"
#include "pic_if.h"
#ifdef DEBUG
#define dprintf(fmt, args...) do { printf("%s(): ", __func__); \
printf(fmt,##args); } while (0)
#else
#define dprintf(fmt, args...)
#endif
#define BCM_GPIO_IRQS 4
#define BCM_GPIO_PINS 54
#define BCM_GPIO_PINS_PER_BANK 32
#define BCM_GPIO_DEFAULT_CAPS (GPIO_PIN_INPUT | GPIO_PIN_OUTPUT | \
GPIO_PIN_PULLUP | GPIO_PIN_PULLDOWN | GPIO_INTR_LEVEL_LOW | \
GPIO_INTR_LEVEL_HIGH | GPIO_INTR_EDGE_RISING | \
GPIO_INTR_EDGE_FALLING | GPIO_INTR_EDGE_BOTH)
static struct resource_spec bcm_gpio_res_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE }, /* bank 0 interrupt */
{ SYS_RES_IRQ, 1, RF_ACTIVE }, /* bank 1 interrupt */
{ -1, 0, 0 }
};
struct bcm_gpio_sysctl {
struct bcm_gpio_softc *sc;
uint32_t pin;
};
struct bcm_gpio_irqsrc {
struct intr_irqsrc bgi_isrc;
uint32_t bgi_irq;
uint32_t bgi_mode;
uint32_t bgi_mask;
};
struct bcm_gpio_softc {
device_t sc_dev;
device_t sc_busdev;
struct mtx sc_mtx;
struct resource * sc_res[BCM_GPIO_IRQS + 1];
bus_space_tag_t sc_bst;
bus_space_handle_t sc_bsh;
void * sc_intrhand[BCM_GPIO_IRQS];
int sc_gpio_npins;
int sc_ro_npins;
int sc_ro_pins[BCM_GPIO_PINS];
struct gpio_pin sc_gpio_pins[BCM_GPIO_PINS];
struct bcm_gpio_sysctl sc_sysctl[BCM_GPIO_PINS];
struct bcm_gpio_irqsrc sc_isrcs[BCM_GPIO_PINS];
};
enum bcm_gpio_pud {
BCM_GPIO_NONE,
BCM_GPIO_PULLDOWN,
BCM_GPIO_PULLUP,
};
#define BCM_GPIO_LOCK(_sc) mtx_lock_spin(&(_sc)->sc_mtx)
#define BCM_GPIO_UNLOCK(_sc) mtx_unlock_spin(&(_sc)->sc_mtx)
#define BCM_GPIO_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_mtx, MA_OWNED)
#define BCM_GPIO_WRITE(_sc, _off, _val) \
bus_space_write_4((_sc)->sc_bst, (_sc)->sc_bsh, _off, _val)
#define BCM_GPIO_READ(_sc, _off) \
bus_space_read_4((_sc)->sc_bst, (_sc)->sc_bsh, _off)
#define BCM_GPIO_CLEAR_BITS(_sc, _off, _bits) \
BCM_GPIO_WRITE(_sc, _off, BCM_GPIO_READ(_sc, _off) & ~(_bits))
#define BCM_GPIO_SET_BITS(_sc, _off, _bits) \
BCM_GPIO_WRITE(_sc, _off, BCM_GPIO_READ(_sc, _off) | _bits)
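/*
 * The set/clear/level/event registers are banked: one 32-bit word per
 * bank of 32 pins, one bit per pin.
 */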
#define BCM_GPIO_BANK(a) (a / BCM_GPIO_PINS_PER_BANK)
#define BCM_GPIO_MASK(a) (1U << (a % BCM_GPIO_PINS_PER_BANK))
#define BCM_GPIO_GPFSEL(_bank) (0x00 + _bank * 4) /* Function Select */
#define BCM_GPIO_GPSET(_bank) (0x1c + _bank * 4) /* Pin Out Set */
#define BCM_GPIO_GPCLR(_bank) (0x28 + _bank * 4) /* Pin Out Clear */
#define BCM_GPIO_GPLEV(_bank) (0x34 + _bank * 4) /* Pin Level */
#define BCM_GPIO_GPEDS(_bank) (0x40 + _bank * 4) /* Event Status */
#define BCM_GPIO_GPREN(_bank) (0x4c + _bank * 4) /* Rising Edge irq */
#define BCM_GPIO_GPFEN(_bank) (0x58 + _bank * 4) /* Falling Edge irq */
#define BCM_GPIO_GPHEN(_bank) (0x64 + _bank * 4) /* High Level irq */
#define BCM_GPIO_GPLEN(_bank) (0x70 + _bank * 4) /* Low Level irq */
#define BCM_GPIO_GPAREN(_bank) (0x7c + _bank * 4) /* Async Rising Edge */
#define BCM_GPIO_GPAFEN(_bank) (0x88 + _bank * 4) /* Async Falling Edge */
#define BCM_GPIO_GPPUD(_bank) (0x94) /* Pin Pull up/down */
#define BCM_GPIO_GPPUDCLK(_bank) (0x98 + _bank * 4) /* Pin Pull up clock */
static struct ofw_compat_data compat_data[] = {
{"broadcom,bcm2835-gpio", 1},
{"brcm,bcm2835-gpio", 1},
{NULL, 0}
};
static struct bcm_gpio_softc *bcm_gpio_sc = NULL;
static int bcm_gpio_intr_bank0(void *arg);
static int bcm_gpio_intr_bank1(void *arg);
static int bcm_gpio_pic_attach(struct bcm_gpio_softc *sc);
static int bcm_gpio_pic_detach(struct bcm_gpio_softc *sc);
static int
bcm_gpio_pin_is_ro(struct bcm_gpio_softc *sc, int pin)
{
int i;
for (i = 0; i < sc->sc_ro_npins; i++)
if (pin == sc->sc_ro_pins[i])
return (1);
return (0);
}
static uint32_t
bcm_gpio_get_function(struct bcm_gpio_softc *sc, uint32_t pin)
{
uint32_t bank, func, offset;
/* Five banks, 10 pins per bank, 3 bits per pin. */
bank = pin / 10;
offset = (pin - bank * 10) * 3;
BCM_GPIO_LOCK(sc);
func = (BCM_GPIO_READ(sc, BCM_GPIO_GPFSEL(bank)) >> offset) & 7;
BCM_GPIO_UNLOCK(sc);
return (func);
}
static void
bcm_gpio_func_str(uint32_t nfunc, char *buf, int bufsize)
{
switch (nfunc) {
case BCM_GPIO_INPUT:
strncpy(buf, "input", bufsize);
break;
case BCM_GPIO_OUTPUT:
strncpy(buf, "output", bufsize);
break;
case BCM_GPIO_ALT0:
strncpy(buf, "alt0", bufsize);
break;
case BCM_GPIO_ALT1:
strncpy(buf, "alt1", bufsize);
break;
case BCM_GPIO_ALT2:
strncpy(buf, "alt2", bufsize);
break;
case BCM_GPIO_ALT3:
strncpy(buf, "alt3", bufsize);
break;
case BCM_GPIO_ALT4:
strncpy(buf, "alt4", bufsize);
break;
case BCM_GPIO_ALT5:
strncpy(buf, "alt5", bufsize);
break;
default:
strncpy(buf, "invalid", bufsize);
}
}
static int
bcm_gpio_str_func(char *func, uint32_t *nfunc)
{
if (strcasecmp(func, "input") == 0)
*nfunc = BCM_GPIO_INPUT;
else if (strcasecmp(func, "output") == 0)
*nfunc = BCM_GPIO_OUTPUT;
else if (strcasecmp(func, "alt0") == 0)
*nfunc = BCM_GPIO_ALT0;
else if (strcasecmp(func, "alt1") == 0)
*nfunc = BCM_GPIO_ALT1;
else if (strcasecmp(func, "alt2") == 0)
*nfunc = BCM_GPIO_ALT2;
else if (strcasecmp(func, "alt3") == 0)
*nfunc = BCM_GPIO_ALT3;
else if (strcasecmp(func, "alt4") == 0)
*nfunc = BCM_GPIO_ALT4;
else if (strcasecmp(func, "alt5") == 0)
*nfunc = BCM_GPIO_ALT5;
else
return (-1);
return (0);
}
static uint32_t
bcm_gpio_func_flag(uint32_t nfunc)
{
switch (nfunc) {
case BCM_GPIO_INPUT:
return (GPIO_PIN_INPUT);
case BCM_GPIO_OUTPUT:
return (GPIO_PIN_OUTPUT);
}
return (0);
}
static void
bcm_gpio_set_function(struct bcm_gpio_softc *sc, uint32_t pin, uint32_t f)
{
uint32_t bank, data, offset;
/* Must be called with lock held. */
BCM_GPIO_LOCK_ASSERT(sc);
/* Five banks, 10 pins per bank, 3 bits per pin. */
bank = pin / 10;
offset = (pin - bank * 10) * 3;
data = BCM_GPIO_READ(sc, BCM_GPIO_GPFSEL(bank));
data &= ~(7 << offset);
data |= (f << offset);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPFSEL(bank), data);
}
static void
bcm_gpio_set_pud(struct bcm_gpio_softc *sc, uint32_t pin, uint32_t state)
{
uint32_t bank;
/* Must be called with lock held. */
BCM_GPIO_LOCK_ASSERT(sc);
bank = BCM_GPIO_BANK(pin);
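/*
 * Pull-up/down sequence: latch the desired state in GPPUD, clock it
 * into the selected pin via GPPUDCLK, then clear both registers.
 */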
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUD(0), state);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUDCLK(bank), BCM_GPIO_MASK(pin));
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUD(0), 0);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPPUDCLK(bank), 0);
}
void
bcm_gpio_set_alternate(device_t dev, uint32_t pin, uint32_t nfunc)
{
struct bcm_gpio_softc *sc;
int i;
sc = device_get_softc(dev);
BCM_GPIO_LOCK(sc);
/* Disable pull-up or pull-down on pin. */
bcm_gpio_set_pud(sc, pin, BCM_GPIO_NONE);
/* And now set the pin function. */
bcm_gpio_set_function(sc, pin, nfunc);
/* Update the pin flags. */
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i < sc->sc_gpio_npins)
sc->sc_gpio_pins[i].gp_flags = bcm_gpio_func_flag(nfunc);
BCM_GPIO_UNLOCK(sc);
}
static void
bcm_gpio_pin_configure(struct bcm_gpio_softc *sc, struct gpio_pin *pin,
unsigned int flags)
{
BCM_GPIO_LOCK(sc);
/*
* Manage input/output.
*/
if (flags & (GPIO_PIN_INPUT|GPIO_PIN_OUTPUT)) {
pin->gp_flags &= ~(GPIO_PIN_INPUT|GPIO_PIN_OUTPUT);
if (flags & GPIO_PIN_OUTPUT) {
pin->gp_flags |= GPIO_PIN_OUTPUT;
bcm_gpio_set_function(sc, pin->gp_pin,
BCM_GPIO_OUTPUT);
} else {
pin->gp_flags |= GPIO_PIN_INPUT;
bcm_gpio_set_function(sc, pin->gp_pin,
BCM_GPIO_INPUT);
}
}
/* Manage Pull-up/pull-down. */
pin->gp_flags &= ~(GPIO_PIN_PULLUP|GPIO_PIN_PULLDOWN);
if (flags & (GPIO_PIN_PULLUP|GPIO_PIN_PULLDOWN)) {
if (flags & GPIO_PIN_PULLUP) {
pin->gp_flags |= GPIO_PIN_PULLUP;
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_PULLUP);
} else {
pin->gp_flags |= GPIO_PIN_PULLDOWN;
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_PULLDOWN);
}
} else
bcm_gpio_set_pud(sc, pin->gp_pin, BCM_GPIO_NONE);
BCM_GPIO_UNLOCK(sc);
}
static device_t
bcm_gpio_get_bus(device_t dev)
{
struct bcm_gpio_softc *sc;
sc = device_get_softc(dev);
return (sc->sc_busdev);
}
static int
bcm_gpio_pin_max(device_t dev, int *maxpin)
{
*maxpin = BCM_GPIO_PINS - 1;
return (0);
}
static int
bcm_gpio_pin_getcaps(device_t dev, uint32_t pin, uint32_t *caps)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
*caps = sc->sc_gpio_pins[i].gp_caps;
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_getflags(device_t dev, uint32_t pin, uint32_t *flags)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
*flags = sc->sc_gpio_pins[i].gp_flags;
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_getname(device_t dev, uint32_t pin, char *name)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
BCM_GPIO_LOCK(sc);
memcpy(name, sc->sc_gpio_pins[i].gp_name, GPIOMAXNAME);
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_setflags(device_t dev, uint32_t pin, uint32_t flags)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never touch on read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
bcm_gpio_pin_configure(sc, &sc->sc_gpio_pins[i], flags);
return (0);
}
static int
bcm_gpio_pin_set(device_t dev, uint32_t pin, unsigned int value)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, reg;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never write to read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
BCM_GPIO_LOCK(sc);
bank = BCM_GPIO_BANK(pin);
if (value)
reg = BCM_GPIO_GPSET(bank);
else
reg = BCM_GPIO_GPCLR(bank);
BCM_GPIO_WRITE(sc, reg, BCM_GPIO_MASK(pin));
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_pin_get(device_t dev, uint32_t pin, unsigned int *val)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, reg_data;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
bank = BCM_GPIO_BANK(pin);
BCM_GPIO_LOCK(sc);
reg_data = BCM_GPIO_READ(sc, BCM_GPIO_GPLEV(bank));
BCM_GPIO_UNLOCK(sc);
*val = (reg_data & BCM_GPIO_MASK(pin)) ? 1 : 0;
return (0);
}
static int
bcm_gpio_pin_toggle(device_t dev, uint32_t pin)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
uint32_t bank, data, reg;
int i;
for (i = 0; i < sc->sc_gpio_npins; i++) {
if (sc->sc_gpio_pins[i].gp_pin == pin)
break;
}
if (i >= sc->sc_gpio_npins)
return (EINVAL);
/* We never write to read-only/reserved pins. */
if (bcm_gpio_pin_is_ro(sc, pin))
return (EINVAL);
BCM_GPIO_LOCK(sc);
bank = BCM_GPIO_BANK(pin);
data = BCM_GPIO_READ(sc, BCM_GPIO_GPLEV(bank));
if (data & BCM_GPIO_MASK(pin))
reg = BCM_GPIO_GPCLR(bank);
else
reg = BCM_GPIO_GPSET(bank);
BCM_GPIO_WRITE(sc, reg, BCM_GPIO_MASK(pin));
BCM_GPIO_UNLOCK(sc);
return (0);
}
static int
bcm_gpio_func_proc(SYSCTL_HANDLER_ARGS)
{
char buf[16];
struct bcm_gpio_softc *sc;
struct bcm_gpio_sysctl *sc_sysctl;
uint32_t nfunc;
int error;
sc_sysctl = arg1;
sc = sc_sysctl->sc;
/* Get the current pin function. */
nfunc = bcm_gpio_get_function(sc, sc_sysctl->pin);
bcm_gpio_func_str(nfunc, buf, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
/* Ignore changes on read-only pins. */
if (bcm_gpio_pin_is_ro(sc, sc_sysctl->pin))
return (0);
/* Parse the user supplied string and check for a valid pin function. */
if (bcm_gpio_str_func(buf, &nfunc) != 0)
return (EINVAL);
/* Update the pin alternate function. */
bcm_gpio_set_alternate(sc->sc_dev, sc_sysctl->pin, nfunc);
return (0);
}
static void
bcm_gpio_sysctl_init(struct bcm_gpio_softc *sc)
{
char pinbuf[3];
struct bcm_gpio_sysctl *sc_sysctl;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree_node, *pin_node, *pinN_node;
struct sysctl_oid_list *tree, *pin_tree, *pinN_tree;
int i;
/*
* Add per-pin sysctl tree/handlers.
*/
ctx = device_get_sysctl_ctx(sc->sc_dev);
tree_node = device_get_sysctl_tree(sc->sc_dev);
tree = SYSCTL_CHILDREN(tree_node);
pin_node = SYSCTL_ADD_NODE(ctx, tree, OID_AUTO, "pin",
CTLFLAG_RD, NULL, "GPIO Pins");
pin_tree = SYSCTL_CHILDREN(pin_node);
for (i = 0; i < sc->sc_gpio_npins; i++) {
snprintf(pinbuf, sizeof(pinbuf), "%d", i);
pinN_node = SYSCTL_ADD_NODE(ctx, pin_tree, OID_AUTO, pinbuf,
CTLFLAG_RD, NULL, "GPIO Pin");
pinN_tree = SYSCTL_CHILDREN(pinN_node);
sc->sc_sysctl[i].sc = sc;
sc_sysctl = &sc->sc_sysctl[i];
sc_sysctl->sc = sc;
sc_sysctl->pin = sc->sc_gpio_pins[i].gp_pin;
SYSCTL_ADD_PROC(ctx, pinN_tree, OID_AUTO, "function",
CTLFLAG_RW | CTLTYPE_STRING, sc_sysctl,
sizeof(struct bcm_gpio_sysctl), bcm_gpio_func_proc,
"A", "Pin Function");
}
}
static int
bcm_gpio_get_ro_pins(struct bcm_gpio_softc *sc, phandle_t node,
const char *propname, const char *label)
{
int i, need_comma, npins, range_start, range_stop;
pcell_t *pins;
/* Get the property data. */
npins = OF_getencprop_alloc(node, propname, sizeof(*pins),
(void **)&pins);
if (npins < 0)
return (-1);
if (npins == 0) {
OF_prop_free(pins);
return (0);
}
for (i = 0; i < npins; i++)
sc->sc_ro_pins[i + sc->sc_ro_npins] = pins[i];
sc->sc_ro_npins += npins;
need_comma = 0;
device_printf(sc->sc_dev, "%s pins: ", label);
range_start = range_stop = pins[0];
for (i = 1; i < npins; i++) {
if (pins[i] != range_stop + 1) {
if (need_comma)
printf(",");
if (range_start != range_stop)
printf("%d-%d", range_start, range_stop);
else
printf("%d", range_start);
range_start = range_stop = pins[i];
need_comma = 1;
} else
range_stop++;
}
if (need_comma)
printf(",");
if (range_start != range_stop)
printf("%d-%d.\n", range_start, range_stop);
else
printf("%d.\n", range_start);
OF_prop_free(pins);
return (0);
}
static int
bcm_gpio_get_reserved_pins(struct bcm_gpio_softc *sc)
{
char *name;
phandle_t gpio, node, reserved;
ssize_t len;
/* Get read-only pins if they're provided */
gpio = ofw_bus_get_node(sc->sc_dev);
if (bcm_gpio_get_ro_pins(sc, gpio, "broadcom,read-only",
"read-only") != 0)
return (0);
/* Traverse the GPIO subnodes to find the reserved pins node. */
reserved = 0;
node = OF_child(gpio);
while ((node != 0) && (reserved == 0)) {
len = OF_getprop_alloc(node, "name", 1, (void **)&name);
if (len == -1)
return (-1);
if (strcmp(name, "reserved") == 0)
reserved = node;
OF_prop_free(name);
node = OF_peer(node);
}
if (reserved == 0)
return (-1);
/* Get the reserved pins. */
if (bcm_gpio_get_ro_pins(sc, reserved, "broadcom,pins",
"reserved") != 0)
return (-1);
return (0);
}
static int
bcm_gpio_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "BCM2708/2835 GPIO controller");
return (BUS_PROBE_DEFAULT);
}
static int
bcm_gpio_intr_attach(device_t dev)
{
struct bcm_gpio_softc *sc;
/*
* Only the first two interrupt lines are used. The third line mirrors
* the second and the fourth is common to all banks.
*/
sc = device_get_softc(dev);
if (sc->sc_res[1] == NULL || sc->sc_res[2] == NULL)
return (-1);
if (bcm_gpio_pic_attach(sc) != 0) {
device_printf(dev, "unable to attach PIC\n");
return (-1);
}
if (bus_setup_intr(dev, sc->sc_res[1], INTR_TYPE_MISC | INTR_MPSAFE,
bcm_gpio_intr_bank0, NULL, sc, &sc->sc_intrhand[0]) != 0)
return (-1);
if (bus_setup_intr(dev, sc->sc_res[2], INTR_TYPE_MISC | INTR_MPSAFE,
bcm_gpio_intr_bank1, NULL, sc, &sc->sc_intrhand[1]) != 0)
return (-1);
return (0);
}
static void
bcm_gpio_intr_detach(device_t dev)
{
struct bcm_gpio_softc *sc;
sc = device_get_softc(dev);
if (sc->sc_intrhand[0] != NULL)
bus_teardown_intr(dev, sc->sc_res[1], sc->sc_intrhand[0]);
if (sc->sc_intrhand[1] != NULL)
bus_teardown_intr(dev, sc->sc_res[2], sc->sc_intrhand[1]);
bcm_gpio_pic_detach(sc);
}
static int
bcm_gpio_attach(device_t dev)
{
int i, j;
phandle_t gpio;
struct bcm_gpio_softc *sc;
uint32_t func;
if (bcm_gpio_sc != NULL)
return (ENXIO);
bcm_gpio_sc = sc = device_get_softc(dev);
sc->sc_dev = dev;
mtx_init(&sc->sc_mtx, "bcm gpio", "gpio", MTX_SPIN);
if (bus_alloc_resources(dev, bcm_gpio_res_spec, sc->sc_res) != 0) {
device_printf(dev, "cannot allocate resources\n");
goto fail;
}
sc->sc_bst = rman_get_bustag(sc->sc_res[0]);
sc->sc_bsh = rman_get_bushandle(sc->sc_res[0]);
/* Setup the GPIO interrupt handler. */
if (bcm_gpio_intr_attach(dev)) {
device_printf(dev, "unable to setup the gpio irq handler\n");
goto fail;
}
/* Find our node. */
gpio = ofw_bus_get_node(sc->sc_dev);
if (!OF_hasprop(gpio, "gpio-controller"))
/* Node is not a GPIO controller. */
goto fail;
/*
* Find the read-only pins. We must never touch these pins, or bad
* things could happen.
*/
if (bcm_gpio_get_reserved_pins(sc) == -1)
goto fail;
/* Initialize the software controlled pins. */
for (i = 0, j = 0; j < BCM_GPIO_PINS; j++) {
snprintf(sc->sc_gpio_pins[i].gp_name, GPIOMAXNAME,
"pin %d", j);
func = bcm_gpio_get_function(sc, j);
sc->sc_gpio_pins[i].gp_pin = j;
sc->sc_gpio_pins[i].gp_caps = BCM_GPIO_DEFAULT_CAPS;
sc->sc_gpio_pins[i].gp_flags = bcm_gpio_func_flag(func);
i++;
}
sc->sc_gpio_npins = i;
bcm_gpio_sysctl_init(sc);
sc->sc_busdev = gpiobus_attach_bus(dev);
if (sc->sc_busdev == NULL)
goto fail;
return (0);
fail:
bcm_gpio_intr_detach(dev);
bus_release_resources(dev, bcm_gpio_res_spec, sc->sc_res);
mtx_destroy(&sc->sc_mtx);
return (ENXIO);
}
static int
bcm_gpio_detach(device_t dev)
{
return (EBUSY);
}
static inline void
bcm_gpio_modify(struct bcm_gpio_softc *sc, uint32_t reg, uint32_t mask,
bool set_bits)
{
if (set_bits)
BCM_GPIO_SET_BITS(sc, reg, mask);
else
BCM_GPIO_CLEAR_BITS(sc, reg, mask);
}
static inline void
bcm_gpio_isrc_eoi(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
/* Write 1 to clear. */
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_WRITE(sc, BCM_GPIO_GPEDS(bank), bgi->bgi_mask);
}
static inline bool
bcm_gpio_isrc_is_level(struct bcm_gpio_irqsrc *bgi)
{
return (bgi->bgi_mode == GPIO_INTR_LEVEL_LOW ||
bgi->bgi_mode == GPIO_INTR_LEVEL_HIGH);
}
static inline void
bcm_gpio_isrc_mask(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_LOCK(sc);
switch (bgi->bgi_mode) {
case GPIO_INTR_LEVEL_LOW:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_LEVEL_HIGH:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_RISING:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_FALLING:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_BOTH:
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
BCM_GPIO_CLEAR_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
}
BCM_GPIO_UNLOCK(sc);
}
static inline void
bcm_gpio_isrc_unmask(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
BCM_GPIO_LOCK(sc);
switch (bgi->bgi_mode) {
case GPIO_INTR_LEVEL_LOW:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_LEVEL_HIGH:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_RISING:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_FALLING:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
case GPIO_INTR_EDGE_BOTH:
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask);
BCM_GPIO_SET_BITS(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask);
break;
}
BCM_GPIO_UNLOCK(sc);
}
static int
bcm_gpio_intr_internal(struct bcm_gpio_softc *sc, uint32_t bank)
{
u_int irq;
struct bcm_gpio_irqsrc *bgi;
uint32_t reg;
/* Ignore spurious interrupts; only pins with pending events are handled. */
reg = BCM_GPIO_READ(sc, BCM_GPIO_GPEDS(bank));
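/*
 * GPEDS has one event-status bit per pin in this bank; dispatch each
 * pending pin in turn, clearing its bit from the local copy as we go.
 */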
while (reg != 0) {
irq = BCM_GPIO_PINS_PER_BANK * bank + ffs(reg) - 1;
bgi = sc->sc_isrcs + irq;
if (!bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
if (intr_isrc_dispatch(&bgi->bgi_isrc,
curthread->td_intr_frame) != 0) {
bcm_gpio_isrc_mask(sc, bgi);
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
device_printf(sc->sc_dev, "Stray irq %u disabled\n",
irq);
}
reg &= ~bgi->bgi_mask;
}
return (FILTER_HANDLED);
}
static int
bcm_gpio_intr_bank0(void *arg)
{
return (bcm_gpio_intr_internal(arg, 0));
}
static int
bcm_gpio_intr_bank1(void *arg)
{
return (bcm_gpio_intr_internal(arg, 1));
}
static int
bcm_gpio_pic_attach(struct bcm_gpio_softc *sc)
{
int error;
uint32_t irq;
const char *name;
name = device_get_nameunit(sc->sc_dev);
for (irq = 0; irq < BCM_GPIO_PINS; irq++) {
sc->sc_isrcs[irq].bgi_irq = irq;
sc->sc_isrcs[irq].bgi_mask = BCM_GPIO_MASK(irq);
sc->sc_isrcs[irq].bgi_mode = GPIO_INTR_CONFORM;
error = intr_isrc_register(&sc->sc_isrcs[irq].bgi_isrc,
sc->sc_dev, 0, "%s,%u", name, irq);
if (error != 0)
return (error); /* XXX deregister ISRCs */
}
if (intr_pic_register(sc->sc_dev,
OF_xref_from_node(ofw_bus_get_node(sc->sc_dev))) == NULL)
return (ENXIO);
return (0);
}
static int
bcm_gpio_pic_detach(struct bcm_gpio_softc *sc)
{
/*
* No procedure has been established yet for correctly detaching
* a PIC from a running system.
*/
device_printf(sc->sc_dev, "%s: not implemented yet\n", __func__);
return (EBUSY);
}
static void
bcm_gpio_pic_config_intr(struct bcm_gpio_softc *sc, struct bcm_gpio_irqsrc *bgi,
uint32_t mode)
{
uint32_t bank;
bank = BCM_GPIO_BANK(bgi->bgi_irq);
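/*
 * Each mode maps onto the per-bank rising/falling-edge and high/low-level
 * enable registers; EDGE_BOTH enables both edge registers.
 */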
BCM_GPIO_LOCK(sc);
bcm_gpio_modify(sc, BCM_GPIO_GPREN(bank), bgi->bgi_mask,
mode == GPIO_INTR_EDGE_RISING || mode == GPIO_INTR_EDGE_BOTH);
bcm_gpio_modify(sc, BCM_GPIO_GPFEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_EDGE_FALLING || mode == GPIO_INTR_EDGE_BOTH);
bcm_gpio_modify(sc, BCM_GPIO_GPHEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_LEVEL_HIGH);
bcm_gpio_modify(sc, BCM_GPIO_GPLEN(bank), bgi->bgi_mask,
mode == GPIO_INTR_LEVEL_LOW);
bgi->bgi_mode = mode;
BCM_GPIO_UNLOCK(sc);
}
static void
bcm_gpio_pic_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
bcm_gpio_isrc_mask(sc, bgi);
}
static void
bcm_gpio_pic_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
arm_irq_memory_barrier(bgi->bgi_irq);
bcm_gpio_isrc_unmask(sc, bgi);
}
static int
bcm_gpio_pic_map_fdt(struct bcm_gpio_softc *sc, struct intr_map_data_fdt *daf,
u_int *irqp, uint32_t *modep)
{
u_int irq;
- uint32_t mode, bank;
+ uint32_t mode;
/*
* The first cell is the interrupt number.
* The second cell is used to specify flags:
* bits[3:0] trigger type and level flags:
* 1 = low-to-high edge triggered.
* 2 = high-to-low edge triggered.
* 3 = both edges (1 | 2).
* 4 = active high level-sensitive.
* 8 = active low level-sensitive.
*/
if (daf->ncells != 2)
return (EINVAL);
irq = daf->cells[0];
if (irq >= BCM_GPIO_PINS || bcm_gpio_pin_is_ro(sc, irq))
return (EINVAL);
/* Only reasonable modes are supported. */
- bank = BCM_GPIO_BANK(irq);
if (daf->cells[1] == 1)
mode = GPIO_INTR_EDGE_RISING;
else if (daf->cells[1] == 2)
mode = GPIO_INTR_EDGE_FALLING;
else if (daf->cells[1] == 3)
mode = GPIO_INTR_EDGE_BOTH;
else if (daf->cells[1] == 4)
mode = GPIO_INTR_LEVEL_HIGH;
else if (daf->cells[1] == 8)
mode = GPIO_INTR_LEVEL_LOW;
else
return (EINVAL);
*irqp = irq;
if (modep != NULL)
*modep = mode;
return (0);
}
static int
bcm_gpio_pic_map_gpio(struct bcm_gpio_softc *sc, struct intr_map_data_gpio *dag,
u_int *irqp, uint32_t *modep)
{
u_int irq;
uint32_t mode;
irq = dag->gpio_pin_num;
if (irq >= BCM_GPIO_PINS || bcm_gpio_pin_is_ro(sc, irq))
return (EINVAL);
mode = dag->gpio_intr_mode;
if (mode != GPIO_INTR_LEVEL_LOW && mode != GPIO_INTR_LEVEL_HIGH &&
mode != GPIO_INTR_EDGE_RISING && mode != GPIO_INTR_EDGE_FALLING &&
mode != GPIO_INTR_EDGE_BOTH)
return (EINVAL);
*irqp = irq;
if (modep != NULL)
*modep = mode;
return (0);
}
static int
bcm_gpio_pic_map(struct bcm_gpio_softc *sc, struct intr_map_data *data,
u_int *irqp, uint32_t *modep)
{
switch (data->type) {
case INTR_MAP_DATA_FDT:
return (bcm_gpio_pic_map_fdt(sc,
(struct intr_map_data_fdt *)data, irqp, modep));
case INTR_MAP_DATA_GPIO:
return (bcm_gpio_pic_map_gpio(sc,
(struct intr_map_data_gpio *)data, irqp, modep));
default:
return (ENOTSUP);
}
}
static int
bcm_gpio_pic_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
int error;
u_int irq;
struct bcm_gpio_softc *sc = device_get_softc(dev);
error = bcm_gpio_pic_map(sc, data, &irq, NULL);
if (error == 0)
*isrcp = &sc->sc_isrcs[irq].bgi_isrc;
return (error);
}
static void
bcm_gpio_pic_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
}
static void
bcm_gpio_pic_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
bcm_gpio_pic_enable_intr(dev, isrc);
}
static void
bcm_gpio_pic_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
bcm_gpio_isrc_mask(sc, bgi);
if (bcm_gpio_isrc_is_level(bgi))
bcm_gpio_isrc_eoi(sc, bgi);
}
static int
bcm_gpio_pic_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
u_int irq;
uint32_t mode;
struct bcm_gpio_softc *sc;
struct bcm_gpio_irqsrc *bgi;
if (data == NULL)
return (ENOTSUP);
sc = device_get_softc(dev);
bgi = (struct bcm_gpio_irqsrc *)isrc;
/* Get and check config for an interrupt. */
if (bcm_gpio_pic_map(sc, data, &irq, &mode) != 0 || bgi->bgi_irq != irq)
return (EINVAL);
/*
* If this is a setup for another handler,
* only check that its configuration matches.
*/
if (isrc->isrc_handlers != 0)
return (bgi->bgi_mode == mode ? 0 : EINVAL);
bcm_gpio_pic_config_intr(sc, bgi, mode);
return (0);
}
static int
bcm_gpio_pic_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct bcm_gpio_softc *sc = device_get_softc(dev);
struct bcm_gpio_irqsrc *bgi = (struct bcm_gpio_irqsrc *)isrc;
if (isrc->isrc_handlers == 0)
bcm_gpio_pic_config_intr(sc, bgi, GPIO_INTR_CONFORM);
return (0);
}
static phandle_t
bcm_gpio_get_node(device_t bus, device_t dev)
{
/* We only have one child, the GPIO bus, which uses our own node. */
return (ofw_bus_get_node(bus));
}
static device_method_t bcm_gpio_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, bcm_gpio_probe),
DEVMETHOD(device_attach, bcm_gpio_attach),
DEVMETHOD(device_detach, bcm_gpio_detach),
/* GPIO protocol */
DEVMETHOD(gpio_get_bus, bcm_gpio_get_bus),
DEVMETHOD(gpio_pin_max, bcm_gpio_pin_max),
DEVMETHOD(gpio_pin_getname, bcm_gpio_pin_getname),
DEVMETHOD(gpio_pin_getflags, bcm_gpio_pin_getflags),
DEVMETHOD(gpio_pin_getcaps, bcm_gpio_pin_getcaps),
DEVMETHOD(gpio_pin_setflags, bcm_gpio_pin_setflags),
DEVMETHOD(gpio_pin_get, bcm_gpio_pin_get),
DEVMETHOD(gpio_pin_set, bcm_gpio_pin_set),
DEVMETHOD(gpio_pin_toggle, bcm_gpio_pin_toggle),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, bcm_gpio_pic_disable_intr),
DEVMETHOD(pic_enable_intr, bcm_gpio_pic_enable_intr),
DEVMETHOD(pic_map_intr, bcm_gpio_pic_map_intr),
DEVMETHOD(pic_post_filter, bcm_gpio_pic_post_filter),
DEVMETHOD(pic_post_ithread, bcm_gpio_pic_post_ithread),
DEVMETHOD(pic_pre_ithread, bcm_gpio_pic_pre_ithread),
DEVMETHOD(pic_setup_intr, bcm_gpio_pic_setup_intr),
DEVMETHOD(pic_teardown_intr, bcm_gpio_pic_teardown_intr),
/* ofw_bus interface */
DEVMETHOD(ofw_bus_get_node, bcm_gpio_get_node),
DEVMETHOD_END
};
static devclass_t bcm_gpio_devclass;
static driver_t bcm_gpio_driver = {
"gpio",
bcm_gpio_methods,
sizeof(struct bcm_gpio_softc),
};
DRIVER_MODULE(bcm_gpio, simplebus, bcm_gpio_driver, bcm_gpio_devclass, 0, 0);
Index: head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c
===================================================================
--- head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c (revision 327172)
+++ head/sys/arm/broadcom/bcm2835/bcm2835_mbox.c (revision 327173)
@@ -1,538 +1,542 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Oleksandr Tymoshenko <gonzo@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/rman.h>
#include <machine/bus.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox.h>
#include <arm/broadcom/bcm2835/bcm2835_mbox_prop.h>
#include <arm/broadcom/bcm2835/bcm2835_vcbus.h>
#include "mbox_if.h"
#define REG_READ 0x00
#define REG_POL 0x10
#define REG_SENDER 0x14
#define REG_STATUS 0x18
#define STATUS_FULL 0x80000000
#define STATUS_EMPTY 0x40000000
#define REG_CONFIG 0x1C
#define CONFIG_DATA_IRQ 0x00000001
#define REG_WRITE 0x20 /* This is Mailbox 1 address */
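/*
 * A mailbox word packs the channel number into the low 4 bits and the
 * 16-byte-aligned data into the upper 28 bits.
 */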
#define MBOX_MSG(chan, data) (((data) & ~0xf) | ((chan) & 0xf))
#define MBOX_CHAN(msg) ((msg) & 0xf)
#define MBOX_DATA(msg) ((msg) & ~0xf)
#define MBOX_LOCK(sc) do { \
mtx_lock(&(sc)->lock); \
} while(0)
#define MBOX_UNLOCK(sc) do { \
mtx_unlock(&(sc)->lock); \
} while(0)
#ifdef DEBUG
#define dprintf(fmt, args...) printf(fmt, ##args)
#else
#define dprintf(fmt, args...)
#endif
struct bcm_mbox_softc {
struct mtx lock;
struct resource * mem_res;
struct resource * irq_res;
void* intr_hl;
bus_space_tag_t bst;
bus_space_handle_t bsh;
int msg[BCM2835_MBOX_CHANS];
int have_message[BCM2835_MBOX_CHANS];
struct sx property_chan_lock;
};
#define mbox_read_4(sc, reg) \
bus_space_read_4((sc)->bst, (sc)->bsh, reg)
#define mbox_write_4(sc, reg, val) \
bus_space_write_4((sc)->bst, (sc)->bsh, reg, val)
static struct ofw_compat_data compat_data[] = {
{"broadcom,bcm2835-mbox", 1},
{"brcm,bcm2835-mbox", 1},
{NULL, 0}
};
static int
bcm_mbox_read_msg(struct bcm_mbox_softc *sc, int *ochan)
{
+#ifdef DEBUG
uint32_t data;
+#endif
uint32_t msg;
int chan;
msg = mbox_read_4(sc, REG_READ);
dprintf("bcm_mbox_intr: raw data %08x\n", msg);
chan = MBOX_CHAN(msg);
+#ifdef DEBUG
data = MBOX_DATA(msg);
+#endif
if (sc->msg[chan]) {
printf("bcm_mbox_intr: channel %d oveflow\n", chan);
return (1);
}
dprintf("bcm_mbox_intr: chan %d, data %08x\n", chan, data);
sc->msg[chan] = msg;
if (ochan != NULL)
*ochan = chan;
return (0);
}
static void
bcm_mbox_intr(void *arg)
{
struct bcm_mbox_softc *sc = arg;
int chan;
MBOX_LOCK(sc);
while (!(mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY))
if (bcm_mbox_read_msg(sc, &chan) == 0) {
sc->have_message[chan] = 1;
wakeup(&sc->have_message[chan]);
}
MBOX_UNLOCK(sc);
}
static int
bcm_mbox_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
return (ENXIO);
device_set_desc(dev, "BCM2835 VideoCore Mailbox");
return (BUS_PROBE_DEFAULT);
}
static int
bcm_mbox_attach(device_t dev)
{
struct bcm_mbox_softc *sc = device_get_softc(dev);
int i;
int rid = 0;
sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (sc->mem_res == NULL) {
device_printf(dev, "could not allocate memory resource\n");
return (ENXIO);
}
sc->bst = rman_get_bustag(sc->mem_res);
sc->bsh = rman_get_bushandle(sc->mem_res);
rid = 0;
sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE);
if (sc->irq_res == NULL) {
device_printf(dev, "could not allocate interrupt resource\n");
return (ENXIO);
}
/* Set up and enable the mailbox interrupt */
if (bus_setup_intr(dev, sc->irq_res, INTR_MPSAFE | INTR_TYPE_MISC,
NULL, bcm_mbox_intr, sc, &sc->intr_hl) != 0) {
bus_release_resource(dev, SYS_RES_IRQ, rid, sc->irq_res);
device_printf(dev, "Unable to setup the clock irq handler.\n");
return (ENXIO);
}
mtx_init(&sc->lock, "vcio mbox", NULL, MTX_DEF);
for (i = 0; i < BCM2835_MBOX_CHANS; i++) {
sc->msg[i] = 0;
sc->have_message[i] = 0;
}
sx_init(&sc->property_chan_lock, "mboxprop");
/* Read all pending messages */
while ((mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY) == 0)
(void)mbox_read_4(sc, REG_READ);
mbox_write_4(sc, REG_CONFIG, CONFIG_DATA_IRQ);
return (0);
}
/*
* Mailbox API
*/
static int
bcm_mbox_write(device_t dev, int chan, uint32_t data)
{
int limit = 1000;
struct bcm_mbox_softc *sc = device_get_softc(dev);
dprintf("bcm_mbox_write: chan %d, data %08x\n", chan, data);
MBOX_LOCK(sc);
sc->have_message[chan] = 0;
while ((mbox_read_4(sc, REG_STATUS) & STATUS_FULL) && --limit)
DELAY(5);
if (limit == 0) {
printf("bcm_mbox_write: STATUS_FULL stuck");
MBOX_UNLOCK(sc);
return (EAGAIN);
}
mbox_write_4(sc, REG_WRITE, MBOX_MSG(chan, data));
MBOX_UNLOCK(sc);
return (0);
}
static int
bcm_mbox_read(device_t dev, int chan, uint32_t *data)
{
struct bcm_mbox_softc *sc = device_get_softc(dev);
int err, read_chan;
dprintf("bcm_mbox_read: chan %d\n", chan);
err = 0;
MBOX_LOCK(sc);
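/*
 * Once interrupts are running, sleep until the interrupt handler stores
 * our message; during early boot (cold), poll the FIFO directly.
 */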
if (!cold) {
if (sc->have_message[chan] == 0) {
if (mtx_sleep(&sc->have_message[chan], &sc->lock, 0,
"mbox", 10*hz) != 0) {
device_printf(dev, "timeout waiting for message on chan %d\n", chan);
err = ETIMEDOUT;
}
}
} else {
do {
/* Wait for a message */
while ((mbox_read_4(sc, REG_STATUS) & STATUS_EMPTY))
;
/* Read the message */
if (bcm_mbox_read_msg(sc, &read_chan) != 0) {
err = EINVAL;
goto out;
}
} while (read_chan != chan);
}
/*
* Get the data stored by the interrupt handler; no new message on this
* channel can arrive while we hold the softc lock.
*/
*data = MBOX_DATA(sc->msg[chan]);
sc->msg[chan] = 0;
sc->have_message[chan] = 0;
out:
MBOX_UNLOCK(sc);
dprintf("bcm_mbox_read: chan %d, data %08x\n", chan, *data);
return (err);
}
static device_method_t bcm_mbox_methods[] = {
DEVMETHOD(device_probe, bcm_mbox_probe),
DEVMETHOD(device_attach, bcm_mbox_attach),
DEVMETHOD(mbox_read, bcm_mbox_read),
DEVMETHOD(mbox_write, bcm_mbox_write),
DEVMETHOD_END
};
static driver_t bcm_mbox_driver = {
"mbox",
bcm_mbox_methods,
sizeof(struct bcm_mbox_softc),
};
static devclass_t bcm_mbox_devclass;
DRIVER_MODULE(mbox, simplebus, bcm_mbox_driver, bcm_mbox_devclass, 0, 0);
static void
bcm2835_mbox_dma_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
{
bus_addr_t *addr;
if (err)
return;
addr = (bus_addr_t *)arg;
*addr = PHYS_TO_VCBUS(segs[0].ds_addr);
}
static void *
bcm2835_mbox_init_dma(device_t dev, size_t len, bus_dma_tag_t *tag,
bus_dmamap_t *map, bus_addr_t *phys)
{
void *buf;
int err;
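/*
 * The buffer must be 16-byte aligned: the low 4 bits of the address
 * passed to the mailbox carry the channel number.
 */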
err = bus_dma_tag_create(bus_get_dma_tag(dev), 16, 0,
BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
len, 1, len, 0, NULL, NULL, tag);
if (err != 0) {
device_printf(dev, "can't create DMA tag\n");
return (NULL);
}
err = bus_dmamem_alloc(*tag, &buf, 0, map);
if (err != 0) {
bus_dma_tag_destroy(*tag);
device_printf(dev, "can't allocate dmamem\n");
return (NULL);
}
err = bus_dmamap_load(*tag, *map, buf, len, bcm2835_mbox_dma_cb,
phys, 0);
if (err != 0) {
bus_dmamem_free(*tag, buf, *map);
bus_dma_tag_destroy(*tag);
device_printf(dev, "can't load DMA map\n");
return (NULL);
}
return (buf);
}
static int
bcm2835_mbox_err(device_t dev, bus_addr_t msg_phys, uint32_t resp_phys,
struct bcm2835_mbox_hdr *msg, size_t len)
{
int idx;
struct bcm2835_mbox_tag_hdr *tag;
uint8_t *last;
if ((uint32_t)msg_phys != resp_phys) {
device_printf(dev, "response channel mismatch\n");
return (EIO);
}
if (msg->code != BCM2835_MBOX_CODE_RESP_SUCCESS) {
device_printf(dev, "mbox response error\n");
return (EIO);
}
/* Loop until the end tag. */
tag = (struct bcm2835_mbox_tag_hdr *)(msg + 1);
last = (uint8_t *)msg + len;
for (idx = 0; tag->tag != 0; idx++) {
if ((tag->val_len & BCM2835_MBOX_TAG_VAL_LEN_RESPONSE) == 0) {
device_printf(dev, "tag %d response error\n", idx);
return (EIO);
}
/* Clear the response bit. */
tag->val_len &= ~BCM2835_MBOX_TAG_VAL_LEN_RESPONSE;
/* Next tag. */
tag = (struct bcm2835_mbox_tag_hdr *)((uint8_t *)tag +
sizeof(*tag) + tag->val_buf_size);
if ((uint8_t *)tag > last) {
device_printf(dev, "mbox buffer size error\n");
return (EIO);
}
}
return (0);
}
int
bcm2835_mbox_property(void *msg, size_t msg_size)
{
struct bcm_mbox_softc *sc;
struct msg_set_power_state *buf;
bus_dma_tag_t msg_tag;
bus_dmamap_t msg_map;
bus_addr_t msg_phys;
uint32_t reg;
device_t mbox;
int err;
/* get mbox device */
mbox = devclass_get_device(devclass_find("mbox"), 0);
if (mbox == NULL)
return (ENXIO);
sc = device_get_softc(mbox);
sx_xlock(&sc->property_chan_lock);
/* Allocate memory for the message */
buf = bcm2835_mbox_init_dma(mbox, msg_size, &msg_tag, &msg_map,
&msg_phys);
if (buf == NULL) {
err = ENOMEM;
goto out;
}
memcpy(buf, msg, msg_size);
bus_dmamap_sync(msg_tag, msg_map,
BUS_DMASYNC_PREWRITE);
MBOX_WRITE(mbox, BCM2835_MBOX_CHAN_PROP, (uint32_t)msg_phys);
MBOX_READ(mbox, BCM2835_MBOX_CHAN_PROP, &reg);
bus_dmamap_sync(msg_tag, msg_map,
BUS_DMASYNC_PREREAD);
memcpy(msg, buf, msg_size);
err = bcm2835_mbox_err(mbox, msg_phys, reg,
(struct bcm2835_mbox_hdr *)msg, msg_size);
bus_dmamap_unload(msg_tag, msg_map);
bus_dmamem_free(msg_tag, buf, msg_map);
bus_dma_tag_destroy(msg_tag);
out:
sx_xunlock(&sc->property_chan_lock);
return (err);
}
int
bcm2835_mbox_set_power_state(uint32_t device_id, boolean_t on)
{
struct msg_set_power_state msg;
int err;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_SET_POWER_STATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.device_id = device_id;
msg.body.req.state = (on ? BCM2835_MBOX_POWER_ON : 0) |
BCM2835_MBOX_POWER_WAIT;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
return (err);
}
int
bcm2835_mbox_get_clock_rate(uint32_t clock_id, uint32_t *hz)
{
struct msg_get_clock_rate msg;
int err;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
msg.tag_hdr.tag = BCM2835_MBOX_TAG_GET_CLOCK_RATE;
msg.tag_hdr.val_buf_size = sizeof(msg.body);
msg.tag_hdr.val_len = sizeof(msg.body.req);
msg.body.req.clock_id = clock_id;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
*hz = msg.body.resp.rate_hz;
return (err);
}
int
bcm2835_mbox_fb_get_w_h(struct bcm2835_fb_config *fb)
{
int err;
struct msg_fb_get_w_h msg;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
BCM2835_MBOX_INIT_TAG(&msg.physical_w_h, GET_PHYSICAL_W_H);
msg.physical_w_h.tag_hdr.val_len = 0;
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err == 0) {
fb->xres = msg.physical_w_h.body.resp.width;
fb->yres = msg.physical_w_h.body.resp.height;
}
return (err);
}
int
bcm2835_mbox_fb_init(struct bcm2835_fb_config *fb)
{
int err;
struct msg_fb_setup msg;
memset(&msg, 0, sizeof(msg));
msg.hdr.buf_size = sizeof(msg);
msg.hdr.code = BCM2835_MBOX_CODE_REQ;
BCM2835_MBOX_INIT_TAG(&msg.physical_w_h, SET_PHYSICAL_W_H);
msg.physical_w_h.body.req.width = fb->xres;
msg.physical_w_h.body.req.height = fb->yres;
BCM2835_MBOX_INIT_TAG(&msg.virtual_w_h, SET_VIRTUAL_W_H);
msg.virtual_w_h.body.req.width = fb->vxres;
msg.virtual_w_h.body.req.height = fb->vyres;
BCM2835_MBOX_INIT_TAG(&msg.offset, SET_VIRTUAL_OFFSET);
msg.offset.body.req.x = fb->xoffset;
msg.offset.body.req.y = fb->yoffset;
BCM2835_MBOX_INIT_TAG(&msg.depth, SET_DEPTH);
msg.depth.body.req.bpp = fb->bpp;
BCM2835_MBOX_INIT_TAG(&msg.alpha, SET_ALPHA_MODE);
msg.alpha.body.req.alpha = BCM2835_MBOX_ALPHA_MODE_IGNORED;
BCM2835_MBOX_INIT_TAG(&msg.buffer, ALLOCATE_BUFFER);
msg.buffer.body.req.alignment = PAGE_SIZE;
BCM2835_MBOX_INIT_TAG(&msg.pitch, GET_PITCH);
msg.end_tag = 0;
err = bcm2835_mbox_property(&msg, sizeof(msg));
if (err == 0) {
fb->xres = msg.physical_w_h.body.resp.width;
fb->yres = msg.physical_w_h.body.resp.height;
fb->vxres = msg.virtual_w_h.body.resp.width;
fb->vyres = msg.virtual_w_h.body.resp.height;
fb->xoffset = msg.offset.body.resp.x;
fb->yoffset = msg.offset.body.resp.y;
fb->pitch = msg.pitch.body.resp.pitch;
fb->base = VCBUS_TO_PHYS(msg.buffer.body.resp.fb_address);
fb->size = msg.buffer.body.resp.fb_size;
}
return (err);
}
Index: head/sys/arm64/arm64/gic_v3.c
===================================================================
--- head/sys/arm64/arm64/gic_v3.c (revision 327172)
+++ head/sys/arm64/arm64/gic_v3.c (revision 327173)
@@ -1,1248 +1,1246 @@
/*-
* Copyright (c) 2015-2016 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under
* the sponsorship of the FreeBSD Foundation.
*
* This software was developed by Semihalf under
* the sponsorship of the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#ifdef FDT
#include <dev/fdt/fdt_intr.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include "pic_if.h"
#include <arm/arm/gic_common.h>
#include "gic_v3_reg.h"
#include "gic_v3_var.h"
static bus_get_domain_t gic_v3_get_domain;
static bus_read_ivar_t gic_v3_read_ivar;
static pic_disable_intr_t gic_v3_disable_intr;
static pic_enable_intr_t gic_v3_enable_intr;
static pic_map_intr_t gic_v3_map_intr;
static pic_setup_intr_t gic_v3_setup_intr;
static pic_teardown_intr_t gic_v3_teardown_intr;
static pic_post_filter_t gic_v3_post_filter;
static pic_post_ithread_t gic_v3_post_ithread;
static pic_pre_ithread_t gic_v3_pre_ithread;
static pic_bind_intr_t gic_v3_bind_intr;
#ifdef SMP
static pic_init_secondary_t gic_v3_init_secondary;
static pic_ipi_send_t gic_v3_ipi_send;
static pic_ipi_setup_t gic_v3_ipi_setup;
#endif
static u_int gic_irq_cpu;
#ifdef SMP
static u_int sgi_to_ipi[GIC_LAST_SGI - GIC_FIRST_SGI + 1];
static u_int sgi_first_unused = GIC_FIRST_SGI;
#endif
static device_method_t gic_v3_methods[] = {
/* Device interface */
DEVMETHOD(device_detach, gic_v3_detach),
/* Bus interface */
DEVMETHOD(bus_get_domain, gic_v3_get_domain),
DEVMETHOD(bus_read_ivar, gic_v3_read_ivar),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, gic_v3_disable_intr),
DEVMETHOD(pic_enable_intr, gic_v3_enable_intr),
DEVMETHOD(pic_map_intr, gic_v3_map_intr),
DEVMETHOD(pic_setup_intr, gic_v3_setup_intr),
DEVMETHOD(pic_teardown_intr, gic_v3_teardown_intr),
DEVMETHOD(pic_post_filter, gic_v3_post_filter),
DEVMETHOD(pic_post_ithread, gic_v3_post_ithread),
DEVMETHOD(pic_pre_ithread, gic_v3_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, gic_v3_bind_intr),
DEVMETHOD(pic_init_secondary, gic_v3_init_secondary),
DEVMETHOD(pic_ipi_send, gic_v3_ipi_send),
DEVMETHOD(pic_ipi_setup, gic_v3_ipi_setup),
#endif
/* End */
DEVMETHOD_END
};
DEFINE_CLASS_0(gic, gic_v3_driver, gic_v3_methods,
sizeof(struct gic_v3_softc));
/*
* Driver-specific definitions.
*/
MALLOC_DEFINE(M_GIC_V3, "GICv3", GIC_V3_DEVSTR);
/*
* Helper functions and definitions.
*/
/* Destination registers, either Distributor or Re-Distributor */
enum gic_v3_xdist {
DIST = 0,
REDIST,
};
struct gic_v3_irqsrc {
struct intr_irqsrc gi_isrc;
uint32_t gi_irq;
enum intr_polarity gi_pol;
enum intr_trigger gi_trig;
};
/* Helper routines starting with gic_v3_ */
static int gic_v3_dist_init(struct gic_v3_softc *);
static int gic_v3_redist_alloc(struct gic_v3_softc *);
static int gic_v3_redist_find(struct gic_v3_softc *);
static int gic_v3_redist_init(struct gic_v3_softc *);
static int gic_v3_cpu_init(struct gic_v3_softc *);
static void gic_v3_wait_for_rwp(struct gic_v3_softc *, enum gic_v3_xdist);
/* A sequence of init functions for primary (boot) CPU */
typedef int (*gic_v3_initseq_t) (struct gic_v3_softc *);
/* Primary CPU initialization sequence */
static gic_v3_initseq_t gic_v3_primary_init[] = {
gic_v3_dist_init,
gic_v3_redist_alloc,
gic_v3_redist_init,
gic_v3_cpu_init,
NULL
};
#ifdef SMP
/* Secondary CPU initialization sequence */
static gic_v3_initseq_t gic_v3_secondary_init[] = {
gic_v3_redist_init,
gic_v3_cpu_init,
NULL
};
#endif
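/*
 * Accessors for the current CPU's Re-Distributor register frame.
 */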
uint32_t
gic_r_read_4(device_t dev, bus_size_t offset)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
return (bus_read_4(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset));
}
uint64_t
gic_r_read_8(device_t dev, bus_size_t offset)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
return (bus_read_8(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset));
}
void
gic_r_write_4(device_t dev, bus_size_t offset, uint32_t val)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
bus_write_4(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset, val);
}
void
gic_r_write_8(device_t dev, bus_size_t offset, uint64_t val)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
bus_write_8(sc->gic_redists.pcpu[PCPU_GET(cpuid)], offset, val);
}
/*
* Device interface.
*/
int
gic_v3_attach(device_t dev)
{
struct gic_v3_softc *sc;
gic_v3_initseq_t *init_func;
uint32_t typer;
int rid;
int err;
size_t i;
u_int irq;
const char *name;
sc = device_get_softc(dev);
sc->gic_registered = FALSE;
sc->dev = dev;
err = 0;
/* Initialize mutex */
mtx_init(&sc->gic_mtx, "GICv3 lock", NULL, MTX_SPIN);
/*
* Allocate array of struct resource.
* One entry for Distributor and all remaining for Re-Distributor.
*/
sc->gic_res = malloc(
sizeof(*sc->gic_res) * (sc->gic_redists.nregions + 1),
M_GIC_V3, M_WAITOK);
/* Now allocate corresponding resources */
for (i = 0, rid = 0; i < (sc->gic_redists.nregions + 1); i++, rid++) {
sc->gic_res[rid] = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&rid, RF_ACTIVE);
if (sc->gic_res[rid] == NULL)
return (ENXIO);
}
/*
* Distributor interface
*/
sc->gic_dist = sc->gic_res[0];
/*
* Re-Distributor interface
*/
/* Allocate space under region descriptions */
sc->gic_redists.regions = malloc(
sizeof(*sc->gic_redists.regions) * sc->gic_redists.nregions,
M_GIC_V3, M_WAITOK);
/* Fill-up bus_space information for each region. */
for (i = 0, rid = 1; i < sc->gic_redists.nregions; i++, rid++)
sc->gic_redists.regions[i] = sc->gic_res[rid];
/* Get the number of supported SPI interrupts */
typer = gic_d_read(sc, 4, GICD_TYPER);
sc->gic_nirqs = GICD_TYPER_I_NUM(typer);
if (sc->gic_nirqs > GIC_I_NUM_MAX)
sc->gic_nirqs = GIC_I_NUM_MAX;
sc->gic_irqs = malloc(sizeof(*sc->gic_irqs) * sc->gic_nirqs,
M_GIC_V3, M_WAITOK | M_ZERO);
name = device_get_nameunit(dev);
for (irq = 0; irq < sc->gic_nirqs; irq++) {
struct intr_irqsrc *isrc;
sc->gic_irqs[irq].gi_irq = irq;
sc->gic_irqs[irq].gi_pol = INTR_POLARITY_CONFORM;
sc->gic_irqs[irq].gi_trig = INTR_TRIGGER_CONFORM;
isrc = &sc->gic_irqs[irq].gi_isrc;
if (irq <= GIC_LAST_SGI) {
err = intr_isrc_register(isrc, sc->dev,
INTR_ISRCF_IPI, "%s,i%u", name, irq - GIC_FIRST_SGI);
} else if (irq <= GIC_LAST_PPI) {
err = intr_isrc_register(isrc, sc->dev,
INTR_ISRCF_PPI, "%s,p%u", name, irq - GIC_FIRST_PPI);
} else {
err = intr_isrc_register(isrc, sc->dev, 0,
"%s,s%u", name, irq - GIC_FIRST_SPI);
}
if (err != 0) {
/* XXX call intr_isrc_deregister() */
free(sc->gic_irqs, M_DEVBUF);
return (err);
}
}
/*
* Read the Peripheral ID2 register. This is an implementation
* defined register, but seems to be implemented in all GICv3
* parts and Linux expects it to be there.
*/
sc->gic_pidr2 = gic_d_read(sc, 4, GICD_PIDR2);
/* Get the number of supported interrupt identifier bits */
sc->gic_idbits = GICD_TYPER_IDBITS(typer);
if (bootverbose) {
device_printf(dev, "SPIs: %u, IDs: %u\n",
sc->gic_nirqs, (1 << sc->gic_idbits) - 1);
}
/* Train init sequence for boot CPU */
for (init_func = gic_v3_primary_init; *init_func != NULL; init_func++) {
err = (*init_func)(sc);
if (err != 0)
return (err);
}
return (0);
}
int
gic_v3_detach(device_t dev)
{
struct gic_v3_softc *sc;
size_t i;
int rid;
sc = device_get_softc(dev);
if (device_is_attached(dev)) {
/*
* XXX: We should probably deregister PIC
*/
if (sc->gic_registered)
panic("Trying to detach registered PIC");
}
for (rid = 0; rid < (sc->gic_redists.nregions + 1); rid++)
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->gic_res[rid]);
for (i = 0; i <= mp_maxid; i++)
free(sc->gic_redists.pcpu[i], M_GIC_V3);
free(sc->gic_res, M_GIC_V3);
free(sc->gic_redists.regions, M_GIC_V3);
return (0);
}
static int
gic_v3_get_domain(device_t dev, device_t child, int *domain)
{
struct gic_v3_devinfo *di;
di = device_get_ivars(child);
if (di->gic_domain < 0)
return (ENOENT);
*domain = di->gic_domain;
return (0);
}
static int
gic_v3_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct gic_v3_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case GICV3_IVAR_NIRQS:
*result = (NIRQ - sc->gic_nirqs) / sc->gic_nchildren;
return (0);
case GICV3_IVAR_REDIST_VADDR:
*result = (uintptr_t)rman_get_virtual(
sc->gic_redists.pcpu[PCPU_GET(cpuid)]);
return (0);
case GIC_IVAR_HW_REV:
KASSERT(
GICR_PIDR2_ARCH(sc->gic_pidr2) == GICR_PIDR2_ARCH_GICv3 ||
GICR_PIDR2_ARCH(sc->gic_pidr2) == GICR_PIDR2_ARCH_GICv4,
("gic_v3_read_ivar: Invalid GIC architecture: %d (%.08X)",
GICR_PIDR2_ARCH(sc->gic_pidr2), sc->gic_pidr2));
*result = GICR_PIDR2_ARCH(sc->gic_pidr2);
return (0);
case GIC_IVAR_BUS:
KASSERT(sc->gic_bus != GIC_BUS_UNKNOWN,
("gic_v3_read_ivar: Unknown bus type"));
KASSERT(sc->gic_bus <= GIC_BUS_MAX,
("gic_v3_read_ivar: Invalid bus type %u", sc->gic_bus));
*result = sc->gic_bus;
return (0);
}
return (ENOENT);
}
int
arm_gic_v3_intr(void *arg)
{
struct gic_v3_softc *sc = arg;
struct gic_v3_irqsrc *gi;
struct intr_pic *pic;
uint64_t active_irq;
struct trapframe *tf;
- bool first;
- first = true;
pic = sc->gic_pic;
while (1) {
if (CPU_MATCH_ERRATA_CAVIUM_THUNDER_1_1) {
/*
* Hardware: Cavium ThunderX
* Chip revision: Pass 1.0 (early version)
* Pass 1.1 (production)
* ERRATUM: 22978, 23154
*/
__asm __volatile(
"nop;nop;nop;nop;nop;nop;nop;nop; \n"
"mrs %0, ICC_IAR1_EL1 \n"
"nop;nop;nop;nop; \n"
"dsb sy \n"
: "=&r" (active_irq));
} else {
active_irq = gic_icc_read(IAR1);
}
if (active_irq >= GIC_FIRST_LPI) {
intr_child_irq_handler(pic, active_irq);
continue;
}
if (__predict_false(active_irq >= sc->gic_nirqs))
return (FILTER_HANDLED);
tf = curthread->td_intr_frame;
gi = &sc->gic_irqs[active_irq];
if (active_irq <= GIC_LAST_SGI) {
/* Call EOI for all IPI before dispatch. */
gic_icc_write(EOIR1, (uint64_t)active_irq);
#ifdef SMP
intr_ipi_dispatch(sgi_to_ipi[gi->gi_irq], tf);
#else
device_printf(sc->dev, "SGI %ju on UP system detected\n",
(uintmax_t)(active_irq - GIC_FIRST_SGI));
#endif
} else if (active_irq >= GIC_FIRST_PPI &&
active_irq <= GIC_LAST_SPI) {
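/*
 * Edge-triggered interrupts are EOId before dispatch so a new
 * edge arriving while the handler runs is not lost; level
 * interrupts are EOId later in the post_filter/pre_ithread hooks.
 */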
if (gi->gi_trig == INTR_TRIGGER_EDGE)
gic_icc_write(EOIR1, gi->gi_irq);
if (intr_isrc_dispatch(&gi->gi_isrc, tf) != 0) {
if (gi->gi_trig != INTR_TRIGGER_EDGE)
gic_icc_write(EOIR1, gi->gi_irq);
gic_v3_disable_intr(sc->dev, &gi->gi_isrc);
device_printf(sc->dev,
"Stray irq %lu disabled\n", active_irq);
}
}
}
}
#ifdef FDT
static int
gic_map_fdt(device_t dev, u_int ncells, pcell_t *cells, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
u_int irq;
if (ncells < 3)
return (EINVAL);
/*
* The 1st cell is the interrupt type:
* 0 = SPI
* 1 = PPI
* The 2nd cell contains the interrupt number:
* [0 - 987] for SPI
* [0 - 15] for PPI
* The 3rd cell is the flags, encoded as follows:
* bits[3:0] trigger type and level flags
* 1 = edge triggered
* 2 = edge triggered (PPI only)
* 4 = level-sensitive
* 8 = level-sensitive (PPI only)
*/
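/*
 * As an illustration, a devicetree entry such as
 * interrupts = <0 23 4> (an SPI) maps to
 * irq = GIC_FIRST_SPI + 23 = 55 with INTR_TRIGGER_LEVEL and
 * INTR_POLARITY_HIGH.
 */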
switch (cells[0]) {
case 0:
irq = GIC_FIRST_SPI + cells[1];
/* SPI irq is checked later. */
break;
case 1:
irq = GIC_FIRST_PPI + cells[1];
if (irq > GIC_LAST_PPI) {
device_printf(dev, "unsupported PPI interrupt "
"number %u\n", cells[1]);
return (EINVAL);
}
break;
default:
device_printf(dev, "unsupported interrupt type "
"configuration %u\n", cells[0]);
return (EINVAL);
}
switch (cells[2] & FDT_INTR_MASK) {
case FDT_INTR_EDGE_RISING:
*trigp = INTR_TRIGGER_EDGE;
*polp = INTR_POLARITY_HIGH;
break;
case FDT_INTR_EDGE_FALLING:
*trigp = INTR_TRIGGER_EDGE;
*polp = INTR_POLARITY_LOW;
break;
case FDT_INTR_LEVEL_HIGH:
*trigp = INTR_TRIGGER_LEVEL;
*polp = INTR_POLARITY_HIGH;
break;
case FDT_INTR_LEVEL_LOW:
*trigp = INTR_TRIGGER_LEVEL;
*polp = INTR_POLARITY_LOW;
break;
default:
device_printf(dev, "unsupported trigger/polarity "
"configuration 0x%02x\n", cells[2]);
return (EINVAL);
}
/* Check the interrupt is valid */
if (irq >= GIC_FIRST_SPI && *polp != INTR_POLARITY_HIGH)
return (EINVAL);
*irqp = irq;
return (0);
}
#endif
static int
gic_map_msi(device_t dev, struct intr_map_data_msi *msi_data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_v3_irqsrc *gi;
/* SPI-mapped MSI */
gi = (struct gic_v3_irqsrc *)msi_data->isrc;
if (gi == NULL)
return (ENXIO);
*irqp = gi->gi_irq;
/* MSI/MSI-X interrupts are always edge triggered with high polarity */
*polp = INTR_POLARITY_HIGH;
*trigp = INTR_TRIGGER_EDGE;
return (0);
}
static int
do_gic_v3_map_intr(device_t dev, struct intr_map_data *data, u_int *irqp,
enum intr_polarity *polp, enum intr_trigger *trigp)
{
struct gic_v3_softc *sc;
enum intr_polarity pol;
enum intr_trigger trig;
struct intr_map_data_msi *dam;
#ifdef FDT
struct intr_map_data_fdt *daf;
#endif
u_int irq;
sc = device_get_softc(dev);
switch (data->type) {
#ifdef FDT
case INTR_MAP_DATA_FDT:
daf = (struct intr_map_data_fdt *)data;
if (gic_map_fdt(dev, daf->ncells, daf->cells, &irq, &pol,
&trig) != 0)
return (EINVAL);
break;
#endif
case INTR_MAP_DATA_MSI:
/* SPI-mapped MSI */
dam = (struct intr_map_data_msi *)data;
if (gic_map_msi(dev, dam, &irq, &pol, &trig) != 0)
return (EINVAL);
break;
default:
return (EINVAL);
}
if (irq >= sc->gic_nirqs)
return (EINVAL);
switch (pol) {
case INTR_POLARITY_CONFORM:
case INTR_POLARITY_LOW:
case INTR_POLARITY_HIGH:
break;
default:
return (EINVAL);
}
switch (trig) {
case INTR_TRIGGER_CONFORM:
case INTR_TRIGGER_EDGE:
case INTR_TRIGGER_LEVEL:
break;
default:
return (EINVAL);
}
*irqp = irq;
if (polp != NULL)
*polp = pol;
if (trigp != NULL)
*trigp = trig;
return (0);
}
static int
gic_v3_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
struct gic_v3_softc *sc;
int error;
u_int irq;
error = do_gic_v3_map_intr(dev, data, &irq, NULL, NULL);
if (error == 0) {
sc = device_get_softc(dev);
*isrcp = GIC_INTR_ISRC(sc, irq);
}
return (error);
}
static int
gic_v3_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_v3_softc *sc = device_get_softc(dev);
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
enum intr_trigger trig;
enum intr_polarity pol;
uint32_t reg;
u_int irq;
int error;
if (data == NULL)
return (ENOTSUP);
error = do_gic_v3_map_intr(dev, data, &irq, &pol, &trig);
if (error != 0)
return (error);
if (gi->gi_irq != irq || pol == INTR_POLARITY_CONFORM ||
trig == INTR_TRIGGER_CONFORM)
return (EINVAL);
/* Compare config if this is not first setup. */
if (isrc->isrc_handlers != 0) {
if (pol != gi->gi_pol || trig != gi->gi_trig)
return (EINVAL);
else
return (0);
}
gi->gi_pol = pol;
gi->gi_trig = trig;
/*
* XXX - In case that per CPU interrupt is going to be enabled in time
* when SMP is already started, we need some IPI call which
* enables it on other CPUs. Further, it's more complicated as
* pic_enable_source() and pic_disable_source() should act on
* per CPU basis only. Thus, it should be solved here somehow.
*/
if (isrc->isrc_flags & INTR_ISRCF_PPI)
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
if (irq >= GIC_FIRST_PPI && irq <= GIC_LAST_SPI) {
mtx_lock_spin(&sc->gic_mtx);
/* Set the trigger and polarity */
if (irq <= GIC_LAST_PPI)
reg = gic_r_read(sc, 4,
GICR_SGI_BASE_SIZE + GICD_ICFGR(irq));
else
reg = gic_d_read(sc, 4, GICD_ICFGR(irq));
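/*
 * The ICFGR registers hold two configuration bits per interrupt
 * (16 interrupts per 32-bit register); bit 1 of each field is 1
 * for edge-triggered and 0 for level-sensitive, hence the
 * 2 << ((irq % 16) * 2) mask below.
 */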
if (trig == INTR_TRIGGER_LEVEL)
reg &= ~(2 << ((irq % 16) * 2));
else
reg |= 2 << ((irq % 16) * 2);
if (irq <= GIC_LAST_PPI) {
gic_r_write(sc, 4,
GICR_SGI_BASE_SIZE + GICD_ICFGR(irq), reg);
gic_v3_wait_for_rwp(sc, REDIST);
} else {
gic_d_write(sc, 4, GICD_ICFGR(irq), reg);
gic_v3_wait_for_rwp(sc, DIST);
}
mtx_unlock_spin(&sc->gic_mtx);
gic_v3_bind_intr(dev, isrc);
}
return (0);
}
static int
gic_v3_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
if (isrc->isrc_handlers == 0) {
gi->gi_pol = INTR_POLARITY_CONFORM;
gi->gi_trig = INTR_TRIGGER_CONFORM;
}
return (0);
}
static void
gic_v3_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
u_int irq;
sc = device_get_softc(dev);
gi = (struct gic_v3_irqsrc *)isrc;
irq = gi->gi_irq;
if (irq <= GIC_LAST_PPI) {
/* SGIs and PPIs in corresponding Re-Distributor */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_ICENABLER(irq),
GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, REDIST);
} else if (irq >= GIC_FIRST_SPI && irq <= GIC_LAST_SPI) {
/* SPIs in distributor */
gic_d_write(sc, 4, GICD_ICENABLER(irq), GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, DIST);
} else
panic("%s: Unsupported IRQ %u", __func__, irq);
}
static void
gic_v3_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
u_int irq;
sc = device_get_softc(dev);
gi = (struct gic_v3_irqsrc *)isrc;
irq = gi->gi_irq;
if (irq <= GIC_LAST_PPI) {
/* SGIs and PPIs in corresponding Re-Distributor */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_ISENABLER(irq),
GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, REDIST);
} else if (irq >= GIC_FIRST_SPI && irq <= GIC_LAST_SPI) {
/* SPIs in distributor */
gic_d_write(sc, 4, GICD_ISENABLER(irq), GICD_I_MASK(irq));
gic_v3_wait_for_rwp(sc, DIST);
} else
panic("%s: Unsupported IRQ %u", __func__, irq);
}
static void
gic_v3_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
gic_v3_disable_intr(dev, isrc);
gic_icc_write(EOIR1, gi->gi_irq);
}
static void
gic_v3_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
gic_v3_enable_intr(dev, isrc);
}
static void
gic_v3_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
if (gi->gi_trig == INTR_TRIGGER_EDGE)
return;
gic_icc_write(EOIR1, gi->gi_irq);
}
static int
gic_v3_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gic_v3_softc *sc;
struct gic_v3_irqsrc *gi;
int cpu;
gi = (struct gic_v3_irqsrc *)isrc;
if (gi->gi_irq <= GIC_LAST_PPI)
return (EINVAL);
KASSERT(gi->gi_irq >= GIC_FIRST_SPI && gi->gi_irq <= GIC_LAST_SPI,
("%s: Attempting to bind an invalid IRQ", __func__));
sc = device_get_softc(dev);
if (CPU_EMPTY(&isrc->isrc_cpu)) {
gic_irq_cpu = intr_irq_next_cpu(gic_irq_cpu, &all_cpus);
CPU_SETOF(gic_irq_cpu, &isrc->isrc_cpu);
gic_d_write(sc, 4, GICD_IROUTER(gi->gi_irq),
CPU_AFFINITY(gic_irq_cpu));
} else {
/*
* We can only bind to a single CPU so select
* the first CPU found.
*/
cpu = CPU_FFS(&isrc->isrc_cpu) - 1;
gic_d_write(sc, 4, GICD_IROUTER(gi->gi_irq), CPU_AFFINITY(cpu));
}
return (0);
}
#ifdef SMP
static void
gic_v3_init_secondary(device_t dev)
{
device_t child;
struct gic_v3_softc *sc;
gic_v3_initseq_t *init_func;
struct intr_irqsrc *isrc;
u_int cpu, irq;
int err, i;
sc = device_get_softc(dev);
cpu = PCPU_GET(cpuid);
/* Train init sequence for this secondary CPU */
for (init_func = gic_v3_secondary_init; *init_func != NULL;
init_func++) {
err = (*init_func)(sc);
if (err != 0) {
device_printf(dev,
"Could not initialize GIC for CPU%u\n", cpu);
return;
}
}
/* Unmask attached SGI interrupts. */
for (irq = GIC_FIRST_SGI; irq <= GIC_LAST_SGI; irq++) {
isrc = GIC_INTR_ISRC(sc, irq);
if (intr_isrc_init_on_cpu(isrc, cpu))
gic_v3_enable_intr(dev, isrc);
}
/* Unmask attached PPI interrupts. */
for (irq = GIC_FIRST_PPI; irq <= GIC_LAST_PPI; irq++) {
isrc = GIC_INTR_ISRC(sc, irq);
if (intr_isrc_init_on_cpu(isrc, cpu))
gic_v3_enable_intr(dev, isrc);
}
for (i = 0; i < sc->gic_nchildren; i++) {
child = sc->gic_children[i];
PIC_INIT_SECONDARY(child);
}
}
static void
gic_v3_ipi_send(device_t dev, struct intr_irqsrc *isrc, cpuset_t cpus,
u_int ipi)
{
struct gic_v3_irqsrc *gi = (struct gic_v3_irqsrc *)isrc;
uint64_t aff, val, irq;
int i;
#define GIC_AFF_MASK (CPU_AFF3_MASK | CPU_AFF2_MASK | CPU_AFF1_MASK)
#define GIC_AFFINITY(i) (CPU_AFFINITY(i) & GIC_AFF_MASK)
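/*
 * ICC_SGI1R_EL1 takes one Aff3.Aff2.Aff1 group plus a 16-bit
 * target list of Aff0 values, so the CPU set is walked per
 * affinity group and a single register write is issued for each
 * group.
 */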
aff = GIC_AFFINITY(0);
irq = gi->gi_irq;
val = 0;
/* Iterate through all CPUs in set */
for (i = 0; i <= mp_maxid; i++) {
/* Move to the next affinity group */
if (aff != GIC_AFFINITY(i)) {
/* Send the IPI */
if (val != 0) {
gic_icc_write(SGI1R, val);
val = 0;
}
aff = GIC_AFFINITY(i);
}
/* Send the IPI to this cpu */
if (CPU_ISSET(i, &cpus)) {
#define ICC_SGI1R_AFFINITY(aff) \
(((uint64_t)CPU_AFF3(aff) << ICC_SGI1R_EL1_AFF3_SHIFT) | \
((uint64_t)CPU_AFF2(aff) << ICC_SGI1R_EL1_AFF2_SHIFT) | \
((uint64_t)CPU_AFF1(aff) << ICC_SGI1R_EL1_AFF1_SHIFT))
/* Set the affinity when the first at this level */
if (val == 0)
val = ICC_SGI1R_AFFINITY(aff) |
irq << ICC_SGI1R_EL1_SGIID_SHIFT;
/* Set the bit to send the IPI to the CPU */
val |= 1 << CPU_AFF0(CPU_AFFINITY(i));
}
}
/* Send the IPI to the last cpu affinity group */
if (val != 0)
gic_icc_write(SGI1R, val);
#undef GIC_AFF_MASK
#undef GIC_AFFINITY
}
static int
gic_v3_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc **isrcp)
{
struct intr_irqsrc *isrc;
struct gic_v3_softc *sc = device_get_softc(dev);
if (sgi_first_unused > GIC_LAST_SGI)
return (ENOSPC);
isrc = GIC_INTR_ISRC(sc, sgi_first_unused);
sgi_to_ipi[sgi_first_unused++] = ipi;
CPU_SET(PCPU_GET(cpuid), &isrc->isrc_cpu);
*isrcp = isrc;
return (0);
}
#endif /* SMP */
/*
* Helper routines
*/
static void
gic_v3_wait_for_rwp(struct gic_v3_softc *sc, enum gic_v3_xdist xdist)
{
struct resource *res;
u_int cpuid;
size_t us_left = 1000000;
cpuid = PCPU_GET(cpuid);
switch (xdist) {
case DIST:
res = sc->gic_dist;
break;
case REDIST:
res = sc->gic_redists.pcpu[cpuid];
break;
default:
KASSERT(0, ("%s: Attempt to wait for unknown RWP", __func__));
return;
}
while ((bus_read_4(res, GICD_CTLR) & GICD_CTLR_RWP) != 0) {
DELAY(1);
if (us_left-- == 0)
panic("GICD Register write pending for too long");
}
}
/* CPU interface. */
static __inline void
gic_v3_cpu_priority(uint64_t mask)
{
/* Set priority mask */
gic_icc_write(PMR, mask & ICC_PMR_EL1_PRIO_MASK);
}
static int
gic_v3_cpu_enable_sre(struct gic_v3_softc *sc)
{
uint64_t sre;
u_int cpuid;
cpuid = PCPU_GET(cpuid);
/*
* Set the SRE bit to enable access to GIC CPU interface
* via system registers.
*/
sre = READ_SPECIALREG(icc_sre_el1);
sre |= ICC_SRE_EL1_SRE;
WRITE_SPECIALREG(icc_sre_el1, sre);
isb();
/*
* Now ensure that the bit is set.
*/
sre = READ_SPECIALREG(icc_sre_el1);
if ((sre & ICC_SRE_EL1_SRE) == 0) {
/* We are done. This was disabled in EL2 */
device_printf(sc->dev, "ERROR: CPU%u cannot enable CPU interface "
"via system registers\n", cpuid);
return (ENXIO);
} else if (bootverbose) {
device_printf(sc->dev,
"CPU%u enabled CPU interface via system registers\n",
cpuid);
}
return (0);
}
static int
gic_v3_cpu_init(struct gic_v3_softc *sc)
{
int err;
/* Enable access to CPU interface via system registers */
err = gic_v3_cpu_enable_sre(sc);
if (err != 0)
return (err);
/* Priority mask to minimum - accept all interrupts */
gic_v3_cpu_priority(GIC_PRIORITY_MIN);
/* Disable EOI mode */
gic_icc_clear(CTLR, ICC_CTLR_EL1_EOIMODE);
/* Enable group 1 (Non-secure) interrupts */
gic_icc_set(IGRPEN1, ICC_IGRPEN0_EL1_EN);
return (0);
}
/* Distributor */
static int
gic_v3_dist_init(struct gic_v3_softc *sc)
{
uint64_t aff;
u_int i;
/*
* 1. Disable the Distributor
*/
gic_d_write(sc, 4, GICD_CTLR, 0);
gic_v3_wait_for_rwp(sc, DIST);
/*
* 2. Configure the Distributor
*/
/* Set all SPIs to be Group 1 Non-secure */
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_IGROUPRn)
gic_d_write(sc, 4, GICD_IGROUPR(i), 0xFFFFFFFF);
/* Set all global interrupts to be level triggered, active low. */
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_ICFGRn)
gic_d_write(sc, 4, GICD_ICFGR(i), 0x00000000);
/* Set priority to all shared interrupts */
for (i = GIC_FIRST_SPI;
i < sc->gic_nirqs; i += GICD_I_PER_IPRIORITYn) {
/* Set highest priority */
gic_d_write(sc, 4, GICD_IPRIORITYR(i), GIC_PRIORITY_MAX);
}
/*
* Disable all SPIs. PPIs and SGIs are left alone, as they are
* enabled via the Re-Distributor registers.
*/
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i += GICD_I_PER_ISENABLERn)
gic_d_write(sc, 4, GICD_ICENABLER(i), 0xFFFFFFFF);
gic_v3_wait_for_rwp(sc, DIST);
/*
* 3. Enable Distributor
*/
/* Enable Distributor with ARE, Group 1 */
gic_d_write(sc, 4, GICD_CTLR, GICD_CTLR_ARE_NS | GICD_CTLR_G1A |
GICD_CTLR_G1);
/*
* 4. Route all interrupts to boot CPU.
*/
aff = CPU_AFFINITY(0);
for (i = GIC_FIRST_SPI; i < sc->gic_nirqs; i++)
gic_d_write(sc, 4, GICD_IROUTER(i), aff);
return (0);
}
/* Re-Distributor */
static int
gic_v3_redist_alloc(struct gic_v3_softc *sc)
{
u_int cpuid;
/* Allocate struct resource for all CPU's Re-Distributor registers */
for (cpuid = 0; cpuid <= mp_maxid; cpuid++)
if (CPU_ISSET(cpuid, &all_cpus) != 0)
sc->gic_redists.pcpu[cpuid] =
malloc(sizeof(*sc->gic_redists.pcpu[0]),
M_GIC_V3, M_WAITOK);
else
sc->gic_redists.pcpu[cpuid] = NULL;
return (0);
}
static int
gic_v3_redist_find(struct gic_v3_softc *sc)
{
struct resource r_res;
bus_space_handle_t r_bsh;
uint64_t aff;
uint64_t typer;
uint32_t pidr2;
u_int cpuid;
size_t i;
cpuid = PCPU_GET(cpuid);
aff = CPU_AFFINITY(cpuid);
/* Affinity in format for comparison with typer */
aff = (CPU_AFF3(aff) << 24) | (CPU_AFF2(aff) << 16) |
(CPU_AFF1(aff) << 8) | CPU_AFF0(aff);
if (bootverbose) {
device_printf(sc->dev,
"Start searching for Re-Distributor\n");
}
/* Iterate through Re-Distributor regions */
for (i = 0; i < sc->gic_redists.nregions; i++) {
/* Take a copy of the region's resource */
r_res = *sc->gic_redists.regions[i];
r_bsh = rman_get_bushandle(&r_res);
pidr2 = bus_read_4(&r_res, GICR_PIDR2);
switch (GICR_PIDR2_ARCH(pidr2)) {
case GICR_PIDR2_ARCH_GICv3: /* fall through */
case GICR_PIDR2_ARCH_GICv4:
break;
default:
device_printf(sc->dev,
"No Re-Distributor found for CPU%u\n", cpuid);
return (ENODEV);
}
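/*
 * Each Re-Distributor occupies an RD_BASE frame followed by an
 * SGI_BASE frame (plus two further frames when GICR_TYPER.VLPIS
 * is set), so step through the region frame by frame until
 * GICR_TYPER.Last marks the final Re-Distributor.
 */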
do {
typer = bus_read_8(&r_res, GICR_TYPER);
if ((typer >> GICR_TYPER_AFF_SHIFT) == aff) {
KASSERT(sc->gic_redists.pcpu[cpuid] != NULL,
("Invalid pointer to per-CPU redistributor"));
/* Copy res contents to its final destination */
*sc->gic_redists.pcpu[cpuid] = r_res;
if (bootverbose) {
device_printf(sc->dev,
"CPU%u Re-Distributor has been found\n",
cpuid);
}
return (0);
}
r_bsh += (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE);
if ((typer & GICR_TYPER_VLPIS) != 0) {
r_bsh +=
(GICR_VLPI_BASE_SIZE + GICR_RESERVED_SIZE);
}
rman_set_bushandle(&r_res, r_bsh);
} while ((typer & GICR_TYPER_LAST) == 0);
}
device_printf(sc->dev, "No Re-Distributor found for CPU%u\n", cpuid);
return (ENXIO);
}
static int
gic_v3_redist_wake(struct gic_v3_softc *sc)
{
uint32_t waker;
size_t us_left = 1000000;
waker = gic_r_read(sc, 4, GICR_WAKER);
/* Wake up Re-Distributor for this CPU */
waker &= ~GICR_WAKER_PS;
gic_r_write(sc, 4, GICR_WAKER, waker);
/*
* When clearing ProcessorSleep bit it is required to wait for
* ChildrenAsleep to become zero following the processor power-on.
*/
while ((gic_r_read(sc, 4, GICR_WAKER) & GICR_WAKER_CA) != 0) {
DELAY(1);
if (us_left-- == 0) {
panic("Could not wake Re-Distributor for CPU%u",
PCPU_GET(cpuid));
}
}
if (bootverbose) {
device_printf(sc->dev, "CPU%u Re-Distributor woke up\n",
PCPU_GET(cpuid));
}
return (0);
}
static int
gic_v3_redist_init(struct gic_v3_softc *sc)
{
int err;
size_t i;
err = gic_v3_redist_find(sc);
if (err != 0)
return (err);
err = gic_v3_redist_wake(sc);
if (err != 0)
return (err);
/* Configure SGIs and PPIs to be Group1 Non-secure */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_IGROUPR0,
0xFFFFFFFF);
/* Disable PPIs */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_ICENABLER0,
GICR_I_ENABLER_PPI_MASK);
/* Enable SGIs */
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICR_ISENABLER0,
GICR_I_ENABLER_SGI_MASK);
/* Set priority for SGIs and PPIs */
for (i = 0; i <= GIC_LAST_PPI; i += GICR_I_PER_IPRIORITYn) {
gic_r_write(sc, 4, GICR_SGI_BASE_SIZE + GICD_IPRIORITYR(i),
GIC_PRIORITY_MAX);
}
gic_v3_wait_for_rwp(sc, REDIST);
return (0);
}
Index: head/sys/arm64/arm64/gicv3_its.c
===================================================================
--- head/sys/arm64/arm64/gicv3_its.c (revision 327172)
+++ head/sys/arm64/arm64/gicv3_its.c (revision 327173)
@@ -1,1694 +1,1688 @@
/*-
* Copyright (c) 2015-2016 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under
* the sponsorship of the FreeBSD Foundation.
*
* This software was developed by Semihalf under
* the sponsorship of the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/smp.h>
#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/intr.h>
#include <arm/arm/gic_common.h>
#include <arm64/arm64/gic_v3_reg.h>
#include <arm64/arm64/gic_v3_var.h>
#ifdef FDT
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#endif
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include "pcib_if.h"
#include "pic_if.h"
#include "msi_if.h"
MALLOC_DEFINE(M_GICV3_ITS, "GICv3 ITS",
"ARM GICv3 Interrupt Translation Service");
#define LPI_NIRQS (64 * 1024)
/* The size and alignment of the command circular buffer */
#define ITS_CMDQ_SIZE (64 * 1024) /* Must be a multiple of 4K */
#define ITS_CMDQ_ALIGN (64 * 1024)
#define LPI_CONFTAB_SIZE LPI_NIRQS
#define LPI_CONFTAB_ALIGN (64 * 1024)
#define LPI_CONFTAB_MAX_ADDR ((1ul << 48) - 1) /* We need a 47 bit PA */
/* 1 bit per SPI, PPI, and SGI (8k), and 1 bit per LPI (LPI_CONFTAB_SIZE) */
#define LPI_PENDTAB_SIZE ((LPI_NIRQS + GIC_FIRST_LPI) / 8)
#define LPI_PENDTAB_ALIGN (64 * 1024)
#define LPI_PENDTAB_MAX_ADDR ((1ul << 48) - 1) /* We need a 47 bit PA */
#define LPI_INT_TRANS_TAB_ALIGN 256
#define LPI_INT_TRANS_TAB_MAX_ADDR ((1ul << 48) - 1)
/* ITS commands encoding */
#define ITS_CMD_MOVI (0x01)
#define ITS_CMD_SYNC (0x05)
#define ITS_CMD_MAPD (0x08)
#define ITS_CMD_MAPC (0x09)
#define ITS_CMD_MAPTI (0x0a)
#define ITS_CMD_MAPI (0x0b)
#define ITS_CMD_INV (0x0c)
#define ITS_CMD_INVALL (0x0d)
/* Command */
#define CMD_COMMAND_MASK (0xFFUL)
/* PCI device ID */
#define CMD_DEVID_SHIFT (32)
#define CMD_DEVID_MASK (0xFFFFFFFFUL << CMD_DEVID_SHIFT)
/* Size of IRQ ID bitfield */
#define CMD_SIZE_MASK (0xFFUL)
/* Virtual LPI ID */
#define CMD_ID_MASK (0xFFFFFFFFUL)
/* Physical LPI ID */
#define CMD_PID_SHIFT (32)
#define CMD_PID_MASK (0xFFFFFFFFUL << CMD_PID_SHIFT)
/* Collection */
#define CMD_COL_MASK (0xFFFFUL)
/* Target (CPU or Re-Distributor) */
#define CMD_TARGET_SHIFT (16)
#define CMD_TARGET_MASK (0xFFFFFFFFUL << CMD_TARGET_SHIFT)
/* Interrupt Translation Table address */
#define CMD_ITT_MASK (0xFFFFFFFFFF00UL)
/* Valid command bit */
#define CMD_VALID_SHIFT (63)
#define CMD_VALID_MASK (1UL << CMD_VALID_SHIFT)
#define ITS_TARGET_NONE 0xFBADBEEF
/* LPI chunk owned by ITS device */
struct lpi_chunk {
u_int lpi_base;
u_int lpi_free; /* First free LPI in set */
u_int lpi_num; /* Total number of LPIs in chunk */
u_int lpi_busy; /* Number of busy LPIs in chunk */
};
/* ITS device */
struct its_dev {
TAILQ_ENTRY(its_dev) entry;
/* PCI device */
device_t pci_dev;
/* Device ID (i.e. PCI device ID) */
uint32_t devid;
/* List of assigned LPIs */
struct lpi_chunk lpis;
/* Virtual address of ITT */
vm_offset_t itt;
size_t itt_size;
};
/*
* ITS command descriptor.
* Idea for command description passing taken from Linux.
*/
struct its_cmd_desc {
uint8_t cmd_type;
union {
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t id;
} cmd_desc_movi;
struct {
struct its_col *col;
} cmd_desc_sync;
struct {
struct its_col *col;
uint8_t valid;
} cmd_desc_mapc;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
uint32_t id;
} cmd_desc_mapvi;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
} cmd_desc_mapi;
struct {
struct its_dev *its_dev;
uint8_t valid;
} cmd_desc_mapd;
struct {
struct its_dev *its_dev;
struct its_col *col;
uint32_t pid;
} cmd_desc_inv;
struct {
struct its_col *col;
} cmd_desc_invall;
};
};
/* ITS command. Each command is 32 bytes long */
struct its_cmd {
uint64_t cmd_dword[4]; /* ITS command double word */
};
/* An ITS private table */
struct its_ptable {
vm_offset_t ptab_vaddr;
unsigned long ptab_size;
};
/* ITS collection description. */
struct its_col {
uint64_t col_target; /* Target Re-Distributor */
uint64_t col_id; /* Collection ID */
};
struct gicv3_its_irqsrc {
struct intr_irqsrc gi_isrc;
u_int gi_irq;
struct its_dev *gi_its_dev;
};
struct gicv3_its_softc {
struct intr_pic *sc_pic;
struct resource *sc_its_res;
cpuset_t sc_cpus;
u_int gic_irq_cpu;
struct its_ptable sc_its_ptab[GITS_BASER_NUM];
struct its_col *sc_its_cols[MAXCPU]; /* Per-CPU collections */
/*
* TODO: We should get these from the parent as we only want a
* single copy of each across the interrupt controller.
*/
vm_offset_t sc_conf_base;
vm_offset_t sc_pend_base[MAXCPU];
/* Command handling */
struct mtx sc_its_cmd_lock;
struct its_cmd *sc_its_cmd_base; /* Command circular buffer address */
size_t sc_its_cmd_next_idx;
vmem_t *sc_irq_alloc;
struct gicv3_its_irqsrc *sc_irqs;
u_int sc_irq_base;
u_int sc_irq_length;
struct mtx sc_its_dev_lock;
TAILQ_HEAD(its_dev_list, its_dev) sc_its_dev_list;
#define ITS_FLAGS_CMDQ_FLUSH 0x00000001
#define ITS_FLAGS_LPI_CONF_FLUSH 0x00000002
#define ITS_FLAGS_ERRATA_CAVIUM_22375 0x00000004
u_int sc_its_flags;
};
typedef void (its_quirk_func_t)(device_t);
static its_quirk_func_t its_quirk_cavium_22375;
static const struct {
const char *desc;
uint32_t iidr;
uint32_t iidr_mask;
its_quirk_func_t *func;
} its_quirks[] = {
{
/* Cavium ThunderX Pass 1.x */
.desc = "Cavoum ThunderX errata: 22375, 24313",
.iidr = GITS_IIDR_RAW(GITS_IIDR_IMPL_CAVIUM,
GITS_IIDR_PROD_THUNDER, GITS_IIDR_VAR_THUNDER_1, 0),
.iidr_mask = ~GITS_IIDR_REVISION_MASK,
.func = its_quirk_cavium_22375,
},
};
#define gic_its_read_4(sc, reg) \
bus_read_4((sc)->sc_its_res, (reg))
#define gic_its_read_8(sc, reg) \
bus_read_8((sc)->sc_its_res, (reg))
#define gic_its_write_4(sc, reg, val) \
bus_write_4((sc)->sc_its_res, (reg), (val))
#define gic_its_write_8(sc, reg, val) \
bus_write_8((sc)->sc_its_res, (reg), (val))
static device_attach_t gicv3_its_attach;
static device_detach_t gicv3_its_detach;
static pic_disable_intr_t gicv3_its_disable_intr;
static pic_enable_intr_t gicv3_its_enable_intr;
static pic_map_intr_t gicv3_its_map_intr;
static pic_setup_intr_t gicv3_its_setup_intr;
static pic_post_filter_t gicv3_its_post_filter;
static pic_post_ithread_t gicv3_its_post_ithread;
static pic_pre_ithread_t gicv3_its_pre_ithread;
static pic_bind_intr_t gicv3_its_bind_intr;
#ifdef SMP
static pic_init_secondary_t gicv3_its_init_secondary;
#endif
static msi_alloc_msi_t gicv3_its_alloc_msi;
static msi_release_msi_t gicv3_its_release_msi;
static msi_alloc_msix_t gicv3_its_alloc_msix;
static msi_release_msix_t gicv3_its_release_msix;
static msi_map_msi_t gicv3_its_map_msi;
static void its_cmd_movi(device_t, struct gicv3_its_irqsrc *);
static void its_cmd_mapc(device_t, struct its_col *, uint8_t);
static void its_cmd_mapti(device_t, struct gicv3_its_irqsrc *);
static void its_cmd_mapd(device_t, struct its_dev *, uint8_t);
static void its_cmd_inv(device_t, struct its_dev *, struct gicv3_its_irqsrc *);
static void its_cmd_invall(device_t, struct its_col *);
static device_method_t gicv3_its_methods[] = {
/* Device interface */
DEVMETHOD(device_detach, gicv3_its_detach),
/* Interrupt controller interface */
DEVMETHOD(pic_disable_intr, gicv3_its_disable_intr),
DEVMETHOD(pic_enable_intr, gicv3_its_enable_intr),
DEVMETHOD(pic_map_intr, gicv3_its_map_intr),
DEVMETHOD(pic_setup_intr, gicv3_its_setup_intr),
DEVMETHOD(pic_post_filter, gicv3_its_post_filter),
DEVMETHOD(pic_post_ithread, gicv3_its_post_ithread),
DEVMETHOD(pic_pre_ithread, gicv3_its_pre_ithread),
#ifdef SMP
DEVMETHOD(pic_bind_intr, gicv3_its_bind_intr),
DEVMETHOD(pic_init_secondary, gicv3_its_init_secondary),
#endif
/* MSI/MSI-X */
DEVMETHOD(msi_alloc_msi, gicv3_its_alloc_msi),
DEVMETHOD(msi_release_msi, gicv3_its_release_msi),
DEVMETHOD(msi_alloc_msix, gicv3_its_alloc_msix),
DEVMETHOD(msi_release_msix, gicv3_its_release_msix),
DEVMETHOD(msi_map_msi, gicv3_its_map_msi),
/* End */
DEVMETHOD_END
};
static DEFINE_CLASS_0(gic, gicv3_its_driver, gicv3_its_methods,
sizeof(struct gicv3_its_softc));
static void
gicv3_its_cmdq_init(struct gicv3_its_softc *sc)
{
vm_paddr_t cmd_paddr;
uint64_t reg, tmp;
/* Set up the command circular buffer */
sc->sc_its_cmd_base = contigmalloc(ITS_CMDQ_SIZE, M_GICV3_ITS,
M_WAITOK | M_ZERO, 0, (1ul << 48) - 1, ITS_CMDQ_ALIGN, 0);
sc->sc_its_cmd_next_idx = 0;
cmd_paddr = vtophys(sc->sc_its_cmd_base);
/* Set the base of the command buffer */
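/*
 * The low bits of GITS_CBASER encode the queue size as the number
 * of 4KB pages minus one: 64KB / 4KB - 1 = 15 here.
 */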
reg = GITS_CBASER_VALID |
(GITS_CBASER_CACHE_NIWAWB << GITS_CBASER_CACHE_SHIFT) |
cmd_paddr | (GITS_CBASER_SHARE_IS << GITS_CBASER_SHARE_SHIFT) |
(ITS_CMDQ_SIZE / 4096 - 1);
gic_its_write_8(sc, GITS_CBASER, reg);
/* Read back to check for fixed value fields */
tmp = gic_its_read_8(sc, GITS_CBASER);
if ((tmp & GITS_CBASER_SHARE_MASK) !=
(GITS_CBASER_SHARE_IS << GITS_CBASER_SHARE_SHIFT)) {
/* Check if the hardware reported non-shareable */
if ((tmp & GITS_CBASER_SHARE_MASK) ==
(GITS_CBASER_SHARE_NS << GITS_CBASER_SHARE_SHIFT)) {
/* If so remove the cache attribute */
reg &= ~GITS_CBASER_CACHE_MASK;
reg &= ~GITS_CBASER_SHARE_MASK;
/* Set to Non-cacheable, Non-shareable */
reg |= GITS_CBASER_CACHE_NIN << GITS_CBASER_CACHE_SHIFT;
reg |= GITS_CBASER_SHARE_NS << GITS_CBASER_SHARE_SHIFT;
gic_its_write_8(sc, GITS_CBASER, reg);
}
/* The command queue has to be flushed after each command */
sc->sc_its_flags |= ITS_FLAGS_CMDQ_FLUSH;
}
/* Get the next command from the start of the buffer */
gic_its_write_8(sc, GITS_CWRITER, 0x0);
}
static int
gicv3_its_table_init(device_t dev, struct gicv3_its_softc *sc)
{
vm_offset_t table;
vm_paddr_t paddr;
uint64_t cache, reg, share, tmp, type;
size_t esize, its_tbl_size, nidents, nitspages, npages;
int i, page_size;
int devbits;
if ((sc->sc_its_flags & ITS_FLAGS_ERRATA_CAVIUM_22375) != 0) {
/*
* GITS_TYPER[17:13] of ThunderX reports that device IDs
* are to be 21 bits in length. The entry size of the ITS
* table can be read from GITS_BASERn[52:48] and on ThunderX
* is supposed to be 8 bytes in length (for device table).
* Finally the page size that is to be used by ITS to access
* this table will be set to 64KB.
*
* This gives 0x200000 entries of size 0x8 bytes covered by
* 256 pages, each of which is 64KB in size. The number of pages
* (minus 1) should then be written to GITS_BASERn[7:0]. In
* that case this value would be 0xFF but on ThunderX the
* maximum value that HW accepts is 0xFD.
*
* Set an arbitrary number of device ID bits to 20 in order
* to limit the number of entries in ITS device table to
* 0x100000 and the table size to 8MB.
*/
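/*
 * With 20 device ID bits the device table takes
 * 2^20 entries * 8 bytes = 8MB, i.e. 128 64KB pages, well below
 * the 0xFD limit mentioned above.
 */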
devbits = 20;
cache = 0;
} else {
devbits = GITS_TYPER_DEVB(gic_its_read_8(sc, GITS_TYPER));
cache = GITS_BASER_CACHE_WAWB;
}
share = GITS_BASER_SHARE_IS;
page_size = PAGE_SIZE_64K;
for (i = 0; i < GITS_BASER_NUM; i++) {
reg = gic_its_read_8(sc, GITS_BASER(i));
/* The type of table */
type = GITS_BASER_TYPE(reg);
/* The table entry size */
esize = GITS_BASER_ESIZE(reg);
switch(type) {
case GITS_BASER_TYPE_DEV:
nidents = (1 << devbits);
its_tbl_size = esize * nidents;
its_tbl_size = roundup2(its_tbl_size, PAGE_SIZE_64K);
break;
case GITS_BASER_TYPE_VP:
case GITS_BASER_TYPE_PP: /* Undocumented? */
case GITS_BASER_TYPE_IC:
its_tbl_size = page_size;
break;
default:
continue;
}
npages = howmany(its_tbl_size, PAGE_SIZE);
/* Allocate the table */
table = (vm_offset_t)contigmalloc(npages * PAGE_SIZE,
M_GICV3_ITS, M_WAITOK | M_ZERO, 0, (1ul << 48) - 1,
PAGE_SIZE_64K, 0);
sc->sc_its_ptab[i].ptab_vaddr = table;
sc->sc_its_ptab[i].ptab_size = npages * PAGE_SIZE;
paddr = vtophys(table);
while (1) {
nitspages = howmany(its_tbl_size, page_size);
/* Clear the fields we will be setting */
reg &= ~(GITS_BASER_VALID |
GITS_BASER_CACHE_MASK | GITS_BASER_TYPE_MASK |
GITS_BASER_ESIZE_MASK | GITS_BASER_PA_MASK |
GITS_BASER_SHARE_MASK | GITS_BASER_PSZ_MASK |
GITS_BASER_SIZE_MASK);
/* Set the new values */
reg |= GITS_BASER_VALID |
(cache << GITS_BASER_CACHE_SHIFT) |
(type << GITS_BASER_TYPE_SHIFT) |
((esize - 1) << GITS_BASER_ESIZE_SHIFT) |
paddr | (share << GITS_BASER_SHARE_SHIFT) |
(nitspages - 1);
switch (page_size) {
case PAGE_SIZE: /* 4KB */
reg |=
GITS_BASER_PSZ_4K << GITS_BASER_PSZ_SHIFT;
break;
case PAGE_SIZE_16K: /* 16KB */
reg |=
GITS_BASER_PSZ_16K << GITS_BASER_PSZ_SHIFT;
break;
case PAGE_SIZE_64K: /* 64KB */
reg |=
GITS_BASER_PSZ_64K << GITS_BASER_PSZ_SHIFT;
break;
}
gic_its_write_8(sc, GITS_BASER(i), reg);
/* Read back to check */
tmp = gic_its_read_8(sc, GITS_BASER(i));
/* Do the shareability masks line up? */
if ((tmp & GITS_BASER_SHARE_MASK) !=
(reg & GITS_BASER_SHARE_MASK)) {
share = (tmp & GITS_BASER_SHARE_MASK) >>
GITS_BASER_SHARE_SHIFT;
continue;
}
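/*
 * If the requested page size did not stick either, retry with the
 * next smaller size: 64KB -> 16KB -> 4KB.
 */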
if ((tmp & GITS_BASER_PSZ_MASK) !=
(reg & GITS_BASER_PSZ_MASK)) {
switch (page_size) {
case PAGE_SIZE_16K:
page_size = PAGE_SIZE;
continue;
case PAGE_SIZE_64K:
page_size = PAGE_SIZE_16K;
continue;
}
}
if (tmp != reg) {
device_printf(dev, "GITS_BASER%d: "
"unable to be updated: %lx != %lx\n",
i, reg, tmp);
return (ENXIO);
}
/* We should have made all needed changes */
break;
}
}
return (0);
}
static void
gicv3_its_conftable_init(struct gicv3_its_softc *sc)
{
sc->sc_conf_base = (vm_offset_t)contigmalloc(LPI_CONFTAB_SIZE,
M_GICV3_ITS, M_WAITOK, 0, LPI_CONFTAB_MAX_ADDR, LPI_CONFTAB_ALIGN,
0);
/* Set the default configuration */
memset((void *)sc->sc_conf_base, GIC_PRIORITY_MAX | LPI_CONF_GROUP1,
LPI_CONFTAB_SIZE);
/* Flush the table to memory */
cpu_dcache_wb_range(sc->sc_conf_base, LPI_CONFTAB_SIZE);
}
static void
gicv3_its_pendtables_init(struct gicv3_its_softc *sc)
{
int i;
for (i = 0; i <= mp_maxid; i++) {
if (CPU_ISSET(i, &sc->sc_cpus) == 0)
continue;
sc->sc_pend_base[i] = (vm_offset_t)contigmalloc(
LPI_PENDTAB_SIZE, M_GICV3_ITS, M_WAITOK | M_ZERO,
0, LPI_PENDTAB_MAX_ADDR, LPI_PENDTAB_ALIGN, 0);
/* Flush so the ITS can see the memory */
cpu_dcache_wb_range(sc->sc_pend_base[i],
LPI_PENDTAB_SIZE);
}
}
static int
its_init_cpu(device_t dev, struct gicv3_its_softc *sc)
{
device_t gicv3;
vm_paddr_t target;
uint64_t xbaser, tmp;
uint32_t ctlr;
u_int cpuid;
int domain;
if (!CPU_ISSET(PCPU_GET(cpuid), &sc->sc_cpus))
return (0);
if (bus_get_domain(dev, &domain) == 0) {
if (PCPU_GET(domain) != domain)
return (0);
}
gicv3 = device_get_parent(dev);
cpuid = PCPU_GET(cpuid);
/* Check if the ITS is enabled on this CPU */
if ((gic_r_read_4(gicv3, GICR_TYPER) & GICR_TYPER_PLPIS) == 0) {
return (ENXIO);
}
/* Disable LPIs */
ctlr = gic_r_read_4(gicv3, GICR_CTLR);
ctlr &= ~GICR_CTLR_LPI_ENABLE;
gic_r_write_4(gicv3, GICR_CTLR, ctlr);
/* Make sure changes are observable by the GIC */
dsb(sy);
/*
* Set the LPI configuration table base (GICR_PROPBASER)
*/
xbaser = vtophys(sc->sc_conf_base) |
(GICR_PROPBASER_SHARE_IS << GICR_PROPBASER_SHARE_SHIFT) |
(GICR_PROPBASER_CACHE_NIWAWB << GICR_PROPBASER_CACHE_SHIFT) |
(flsl(LPI_CONFTAB_SIZE | GIC_FIRST_LPI) - 1);
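/*
 * The IDbits field programmed above is flsl(...) - 1 = 16, i.e.
 * 17-bit interrupt IDs, enough to cover GIC_FIRST_LPI plus the
 * 64K LPIs described by the configuration table.
 */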
gic_r_write_8(gicv3, GICR_PROPBASER, xbaser);
/* Check the cache attributes we set */
tmp = gic_r_read_8(gicv3, GICR_PROPBASER);
if ((tmp & GICR_PROPBASER_SHARE_MASK) !=
(xbaser & GICR_PROPBASER_SHARE_MASK)) {
if ((tmp & GICR_PROPBASER_SHARE_MASK) ==
(GICR_PROPBASER_SHARE_NS << GICR_PROPBASER_SHARE_SHIFT)) {
/* We need to mark as non-cacheable */
xbaser &= ~(GICR_PROPBASER_SHARE_MASK |
GICR_PROPBASER_CACHE_MASK);
/* Non-cacheable */
xbaser |= GICR_PROPBASER_CACHE_NIN <<
GICR_PROPBASER_CACHE_SHIFT;
/* Non-shareable */
xbaser |= GICR_PROPBASER_SHARE_NS <<
GICR_PROPBASER_SHARE_SHIFT;
gic_r_write_8(gicv3, GICR_PROPBASER, xbaser);
}
sc->sc_its_flags |= ITS_FLAGS_LPI_CONF_FLUSH;
}
/*
* Set the LPI pending table base
*/
xbaser = vtophys(sc->sc_pend_base[cpuid]) |
(GICR_PENDBASER_CACHE_NIWAWB << GICR_PENDBASER_CACHE_SHIFT) |
(GICR_PENDBASER_SHARE_IS << GICR_PENDBASER_SHARE_SHIFT);
gic_r_write_8(gicv3, GICR_PENDBASER, xbaser);
tmp = gic_r_read_8(gicv3, GICR_PENDBASER);
if ((tmp & GICR_PENDBASER_SHARE_MASK) ==
(GICR_PENDBASER_SHARE_NS << GICR_PENDBASER_SHARE_SHIFT)) {
/* Clear the cache and shareability bits */
xbaser &= ~(GICR_PENDBASER_CACHE_MASK |
GICR_PENDBASER_SHARE_MASK);
/* Mark as non-shareable */
xbaser |= GICR_PENDBASER_SHARE_NS << GICR_PENDBASER_SHARE_SHIFT;
/* And non-cacheable */
xbaser |= GICR_PENDBASER_CACHE_NIN <<
GICR_PENDBASER_CACHE_SHIFT;
}
/* Enable LPIs */
ctlr = gic_r_read_4(gicv3, GICR_CTLR);
ctlr |= GICR_CTLR_LPI_ENABLE;
gic_r_write_4(gicv3, GICR_CTLR, ctlr);
/* Make sure the GIC has seen everything */
dsb(sy);
if ((gic_its_read_8(sc, GITS_TYPER) & GITS_TYPER_PTA) != 0) {
/* This ITS wants the redistributor physical address */
target = vtophys(gicv3_get_redist_vaddr(dev));
} else {
/* This ITS wants the unique processor number */
target = GICR_TYPER_CPUNUM(gic_r_read_8(gicv3, GICR_TYPER));
}
sc->sc_its_cols[cpuid]->col_target = target;
sc->sc_its_cols[cpuid]->col_id = cpuid;
its_cmd_mapc(dev, sc->sc_its_cols[cpuid], 1);
its_cmd_invall(dev, sc->sc_its_cols[cpuid]);
return (0);
}
static int
gicv3_its_attach(device_t dev)
{
struct gicv3_its_softc *sc;
const char *name;
uint32_t iidr;
int domain, err, i, rid;
sc = device_get_softc(dev);
rid = 0;
sc->sc_its_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (sc->sc_its_res == NULL) {
device_printf(dev, "Could not allocate memory\n");
return (ENXIO);
}
iidr = gic_its_read_4(sc, GITS_IIDR);
for (i = 0; i < nitems(its_quirks); i++) {
if ((iidr & its_quirks[i].iidr_mask) == its_quirks[i].iidr) {
if (bootverbose) {
device_printf(dev, "Applying %s\n",
its_quirks[i].desc);
}
its_quirks[i].func(dev);
break;
}
}
/* Allocate the private tables */
err = gicv3_its_table_init(dev, sc);
if (err != 0)
return (err);
/* Protects access to the device list */
mtx_init(&sc->sc_its_dev_lock, "ITS device lock", NULL, MTX_SPIN);
/* Protects access to the ITS command circular buffer. */
mtx_init(&sc->sc_its_cmd_lock, "ITS cmd lock", NULL, MTX_SPIN);
if (bus_get_domain(dev, &domain) == 0) {
CPU_ZERO(&sc->sc_cpus);
if (domain < MAXMEMDOM)
CPU_COPY(&cpuset_domain[domain], &sc->sc_cpus);
} else {
CPU_COPY(&all_cpus, &sc->sc_cpus);
}
/* Allocate the command circular buffer */
gicv3_its_cmdq_init(sc);
/* Allocate the per-CPU collections */
for (int cpu = 0; cpu <= mp_maxid; cpu++)
if (CPU_ISSET(cpu, &sc->sc_cpus) != 0)
sc->sc_its_cols[cpu] = malloc(
sizeof(*sc->sc_its_cols[0]), M_GICV3_ITS,
M_WAITOK | M_ZERO);
else
sc->sc_its_cols[cpu] = NULL;
/* Enable the ITS */
gic_its_write_4(sc, GITS_CTLR,
gic_its_read_4(sc, GITS_CTLR) | GITS_CTLR_EN);
/* Create the LPI configuration table */
gicv3_its_conftable_init(sc);
/* And the pending tables */
gicv3_its_pendtables_init(sc);
/* Enable LPIs on this CPU */
its_init_cpu(dev, sc);
TAILQ_INIT(&sc->sc_its_dev_list);
/*
* Create the vmem object to allocate INTRNG IRQs from. We try to
* use all IRQs not already used by the GICv3.
* XXX: This assumes there are no other interrupt controllers in the
* system.
*/
sc->sc_irq_alloc = vmem_create("GICv3 ITS IRQs", 0,
gicv3_get_nirqs(dev), 1, 1, M_FIRSTFIT | M_WAITOK);
sc->sc_irqs = malloc(sizeof(*sc->sc_irqs) * sc->sc_irq_length,
M_GICV3_ITS, M_WAITOK | M_ZERO);
name = device_get_nameunit(dev);
for (i = 0; i < sc->sc_irq_length; i++) {
sc->sc_irqs[i].gi_irq = i;
err = intr_isrc_register(&sc->sc_irqs[i].gi_isrc, dev, 0,
"%s,%u", name, i);
}
return (0);
}
static int
gicv3_its_detach(device_t dev)
{
return (ENXIO);
}
static void
its_quirk_cavium_22375(device_t dev)
{
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
sc->sc_its_flags |= ITS_FLAGS_ERRATA_CAVIUM_22375;
}
static void
gicv3_its_disable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
uint8_t *conf;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
conf = (uint8_t *)sc->sc_conf_base;
conf[girq->gi_irq] &= ~LPI_CONF_ENABLE;
if ((sc->sc_its_flags & ITS_FLAGS_LPI_CONF_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)&conf[girq->gi_irq], 1);
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
its_cmd_inv(dev, girq->gi_its_dev, girq);
}
static void
gicv3_its_enable_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
uint8_t *conf;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
conf = (uint8_t *)sc->sc_conf_base;
conf[girq->gi_irq] |= LPI_CONF_ENABLE;
if ((sc->sc_its_flags & ITS_FLAGS_LPI_CONF_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)&conf[girq->gi_irq], 1);
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
its_cmd_inv(dev, girq->gi_its_dev, girq);
}
static int
gicv3_its_intr(void *arg, uintptr_t irq)
{
struct gicv3_its_softc *sc = arg;
struct gicv3_its_irqsrc *girq;
struct trapframe *tf;
irq -= sc->sc_irq_base;
girq = &sc->sc_irqs[irq];
if (girq == NULL)
panic("gicv3_its_intr: Invalid interrupt %ld",
irq + sc->sc_irq_base);
tf = curthread->td_intr_frame;
intr_isrc_dispatch(&girq->gi_isrc, tf);
return (FILTER_HANDLED);
}
static void
gicv3_its_pre_ithread(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
gicv3_its_disable_intr(dev, isrc);
gic_icc_write(EOIR1, girq->gi_irq + sc->sc_irq_base);
}
static void
gicv3_its_post_ithread(device_t dev, struct intr_irqsrc *isrc)
{
gicv3_its_enable_intr(dev, isrc);
}
static void
gicv3_its_post_filter(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
gic_icc_write(EOIR1, girq->gi_irq + sc->sc_irq_base);
}
static int
gicv3_its_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
struct gicv3_its_irqsrc *girq;
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
if (CPU_EMPTY(&isrc->isrc_cpu)) {
sc->gic_irq_cpu = intr_irq_next_cpu(sc->gic_irq_cpu,
&sc->sc_cpus);
CPU_SETOF(sc->gic_irq_cpu, &isrc->isrc_cpu);
}
its_cmd_movi(dev, girq);
return (0);
}
static int
gicv3_its_map_intr(device_t dev, struct intr_map_data *data,
struct intr_irqsrc **isrcp)
{
/*
* This should never happen, we only call this function to map
* interrupts found before the controller driver is ready.
*/
panic("gicv3_its_map_intr: Unable to map a MSI interrupt");
}
static int
gicv3_its_setup_intr(device_t dev, struct intr_irqsrc *isrc,
struct resource *res, struct intr_map_data *data)
{
/* Bind the interrupt to a CPU */
gicv3_its_bind_intr(dev, isrc);
return (0);
}
#ifdef SMP
static void
gicv3_its_init_secondary(device_t dev)
{
struct gicv3_its_softc *sc;
sc = device_get_softc(dev);
/*
* This is fatal as otherwise we may bind interrupts to this CPU.
* We need a way to tell the interrupt framework to only bind to a
* subset of given CPUs when it performs the shuffle.
*/
if (its_init_cpu(dev, sc) != 0)
panic("gicv3_its_init_secondary: No usable ITS on CPU%d",
PCPU_GET(cpuid));
}
#endif
static uint32_t
its_get_devid(device_t pci_dev)
{
uintptr_t id;
if (pci_get_id(pci_dev, PCI_ID_MSI, &id) != 0)
panic("its_get_devid: Unable to get the MSI DeviceID");
return (id);
}
static struct its_dev *
its_device_find(device_t dev, device_t child)
{
struct gicv3_its_softc *sc;
struct its_dev *its_dev = NULL;
sc = device_get_softc(dev);
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_FOREACH(its_dev, &sc->sc_its_dev_list, entry) {
if (its_dev->pci_dev == child)
break;
}
mtx_unlock_spin(&sc->sc_its_dev_lock);
return (its_dev);
}
static struct its_dev *
its_device_get(device_t dev, device_t child, u_int nvecs)
{
struct gicv3_its_softc *sc;
struct its_dev *its_dev;
vmem_addr_t irq_base;
size_t esize;
sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
if (its_dev != NULL)
return (its_dev);
its_dev = malloc(sizeof(*its_dev), M_GICV3_ITS, M_NOWAIT | M_ZERO);
if (its_dev == NULL)
return (NULL);
its_dev->pci_dev = child;
its_dev->devid = its_get_devid(child);
its_dev->lpis.lpi_busy = 0;
its_dev->lpis.lpi_num = nvecs;
its_dev->lpis.lpi_free = nvecs;
if (vmem_alloc(sc->sc_irq_alloc, nvecs, M_FIRSTFIT | M_NOWAIT,
&irq_base) != 0) {
free(its_dev, M_GICV3_ITS);
return (NULL);
}
its_dev->lpis.lpi_base = irq_base;
/* Get ITT entry size */
esize = GITS_TYPER_ITTES(gic_its_read_8(sc, GITS_TYPER));
/*
* Allocate ITT for this device.
* PA has to be 256 B aligned. At least two entries for device.
*/
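/*
 * As an example, assuming the common 8-byte ITT entry size, a
 * two-vector device still gets roundup2(2 * 8, 256) = 256 bytes.
 */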
its_dev->itt_size = roundup2(MAX(nvecs, 2) * esize, 256);
its_dev->itt = (vm_offset_t)contigmalloc(its_dev->itt_size,
M_GICV3_ITS, M_NOWAIT | M_ZERO, 0, LPI_INT_TRANS_TAB_MAX_ADDR,
LPI_INT_TRANS_TAB_ALIGN, 0);
if (its_dev->itt == 0) {
vmem_free(sc->sc_irq_alloc, its_dev->lpis.lpi_base, nvecs);
free(its_dev, M_GICV3_ITS);
return (NULL);
}
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_INSERT_TAIL(&sc->sc_its_dev_list, its_dev, entry);
mtx_unlock_spin(&sc->sc_its_dev_lock);
/* Map device to its ITT */
its_cmd_mapd(dev, its_dev, 1);
return (its_dev);
}
static void
its_device_release(device_t dev, struct its_dev *its_dev)
{
struct gicv3_its_softc *sc;
KASSERT(its_dev->lpis.lpi_busy == 0,
("its_device_release: Trying to release an inuse ITS device"));
/* Unmap device in ITS */
its_cmd_mapd(dev, its_dev, 0);
sc = device_get_softc(dev);
/* Remove the device from the list of devices */
mtx_lock_spin(&sc->sc_its_dev_lock);
TAILQ_REMOVE(&sc->sc_its_dev_list, its_dev, entry);
mtx_unlock_spin(&sc->sc_its_dev_lock);
/* Free ITT */
KASSERT(its_dev->itt != 0, ("Invalid ITT in valid ITS device"));
contigfree((void *)its_dev->itt, its_dev->itt_size, M_GICV3_ITS);
/* Free the IRQ allocation */
vmem_free(sc->sc_irq_alloc, its_dev->lpis.lpi_base,
its_dev->lpis.lpi_num);
free(its_dev, M_GICV3_ITS);
}
static int
gicv3_its_alloc_msi(device_t dev, device_t child, int count, int maxcount,
device_t *pic, struct intr_irqsrc **srcs)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
u_int irq;
int i;
its_dev = its_device_get(dev, child, count);
if (its_dev == NULL)
return (ENXIO);
KASSERT(its_dev->lpis.lpi_free >= count,
("gicv3_its_alloc_msi: No free LPIs"));
sc = device_get_softc(dev);
irq = its_dev->lpis.lpi_base + its_dev->lpis.lpi_num -
its_dev->lpis.lpi_free;
for (i = 0; i < count; i++, irq++) {
its_dev->lpis.lpi_free--;
girq = &sc->sc_irqs[irq];
girq->gi_its_dev = its_dev;
srcs[i] = (struct intr_irqsrc *)girq;
}
its_dev->lpis.lpi_busy += count;
*pic = dev;
return (0);
}
static int
gicv3_its_release_msi(device_t dev, device_t child, int count,
struct intr_irqsrc **isrc)
{
- struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
int i;
- sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
KASSERT(its_dev != NULL,
("gicv3_its_release_msi: Releasing a MSI interrupt with "
"no ITS device"));
KASSERT(its_dev->lpis.lpi_busy >= count,
("gicv3_its_release_msi: Releasing more interrupts than "
"were allocated: releasing %d, allocated %d", count,
its_dev->lpis.lpi_busy));
for (i = 0; i < count; i++) {
girq = (struct gicv3_its_irqsrc *)isrc[i];
girq->gi_its_dev = NULL;
}
its_dev->lpis.lpi_busy -= count;
if (its_dev->lpis.lpi_busy == 0)
its_device_release(dev, its_dev);
return (0);
}
static int
gicv3_its_alloc_msix(device_t dev, device_t child, device_t *pic,
struct intr_irqsrc **isrcp)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
u_int nvecs, irq;
nvecs = pci_msix_count(child);
its_dev = its_device_get(dev, child, nvecs);
if (its_dev == NULL)
return (ENXIO);
KASSERT(its_dev->lpis.lpi_free > 0,
("gicv3_its_alloc_msix: No free LPIs"));
sc = device_get_softc(dev);
irq = its_dev->lpis.lpi_base + its_dev->lpis.lpi_num -
its_dev->lpis.lpi_free;
its_dev->lpis.lpi_free--;
its_dev->lpis.lpi_busy++;
girq = &sc->sc_irqs[irq];
girq->gi_its_dev = its_dev;
*pic = dev;
*isrcp = (struct intr_irqsrc *)girq;
return (0);
}
static int
gicv3_its_release_msix(device_t dev, device_t child, struct intr_irqsrc *isrc)
{
- struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
struct its_dev *its_dev;
- sc = device_get_softc(dev);
its_dev = its_device_find(dev, child);
KASSERT(its_dev != NULL,
("gicv3_its_release_msix: Releasing a MSI-X interrupt with "
"no ITS device"));
KASSERT(its_dev->lpis.lpi_busy > 0,
("gicv3_its_release_msix: Releasing more interrupts than "
"were allocated: allocated %d", its_dev->lpis.lpi_busy));
girq = (struct gicv3_its_irqsrc *)isrc;
girq->gi_its_dev = NULL;
its_dev->lpis.lpi_busy--;
if (its_dev->lpis.lpi_busy == 0)
its_device_release(dev, its_dev);
return (0);
}
static int
gicv3_its_map_msi(device_t dev, device_t child, struct intr_irqsrc *isrc,
uint64_t *addr, uint32_t *data)
{
struct gicv3_its_softc *sc;
struct gicv3_its_irqsrc *girq;
sc = device_get_softc(dev);
girq = (struct gicv3_its_irqsrc *)isrc;
/* Map the message to the given IRQ */
its_cmd_mapti(dev, girq);
*addr = vtophys(rman_get_virtual(sc->sc_its_res)) + GITS_TRANSLATER;
*data = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
return (0);
}
/*
* Commands handling.
*/
static __inline void
cmd_format_command(struct its_cmd *cmd, uint8_t cmd_type)
{
/* Command field: DW0 [7:0] */
cmd->cmd_dword[0] &= htole64(~CMD_COMMAND_MASK);
cmd->cmd_dword[0] |= htole64(cmd_type);
}
static __inline void
cmd_format_devid(struct its_cmd *cmd, uint32_t devid)
{
/* Device ID field: DW0 [63:32] */
cmd->cmd_dword[0] &= htole64(~CMD_DEVID_MASK);
cmd->cmd_dword[0] |= htole64((uint64_t)devid << CMD_DEVID_SHIFT);
}
static __inline void
cmd_format_size(struct its_cmd *cmd, uint16_t size)
{
/* Size field: DW1 [4:0] */
cmd->cmd_dword[1] &= htole64(~CMD_SIZE_MASK);
cmd->cmd_dword[1] |= htole64((size & CMD_SIZE_MASK));
}
static __inline void
cmd_format_id(struct its_cmd *cmd, uint32_t id)
{
/* ID field: DW1 [31:0] */
cmd->cmd_dword[1] &= htole64(~CMD_ID_MASK);
cmd->cmd_dword[1] |= htole64(id);
}
static __inline void
cmd_format_pid(struct its_cmd *cmd, uint32_t pid)
{
/* Physical ID field: DW1 [63:32] */
cmd->cmd_dword[1] &= htole64(~CMD_PID_MASK);
cmd->cmd_dword[1] |= htole64((uint64_t)pid << CMD_PID_SHIFT);
}
static __inline void
cmd_format_col(struct its_cmd *cmd, uint16_t col_id)
{
/* Collection field: DW2 [16:0] */
cmd->cmd_dword[2] &= htole64(~CMD_COL_MASK);
cmd->cmd_dword[2] |= htole64(col_id);
}
static __inline void
cmd_format_target(struct its_cmd *cmd, uint64_t target)
{
/* Target Address field: DW2 [47:16] */
cmd->cmd_dword[2] &= htole64(~CMD_TARGET_MASK);
cmd->cmd_dword[2] |= htole64(target & CMD_TARGET_MASK);
}
static __inline void
cmd_format_itt(struct its_cmd *cmd, uint64_t itt)
{
/* ITT Address field: DW2 [47:8] */
cmd->cmd_dword[2] &= htole64(~CMD_ITT_MASK);
cmd->cmd_dword[2] |= htole64(itt & CMD_ITT_MASK);
}
static __inline void
cmd_format_valid(struct its_cmd *cmd, uint8_t valid)
{
/* Valid field: DW2 [63] */
cmd->cmd_dword[2] &= htole64(~CMD_VALID_MASK);
cmd->cmd_dword[2] |= htole64((uint64_t)valid << CMD_VALID_SHIFT);
}
static inline bool
its_cmd_queue_full(struct gicv3_its_softc *sc)
{
size_t read_idx, next_write_idx;
/* Get the index of the next command */
next_write_idx = (sc->sc_its_cmd_next_idx + 1) %
(ITS_CMDQ_SIZE / sizeof(struct its_cmd));
/* And the index of the current command being read */
read_idx = gic_its_read_4(sc, GITS_CREADR) / sizeof(struct its_cmd);
/*
* The queue is full when the write offset points
* at the command before the current read offset.
*/
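/*
 * With a 64KB queue of 32-byte commands that leaves 2048 slots, of
 * which at most 2047 can be outstanding at any time.
 */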
return (next_write_idx == read_idx);
}
static inline void
its_cmd_sync(struct gicv3_its_softc *sc, struct its_cmd *cmd)
{
if ((sc->sc_its_flags & ITS_FLAGS_CMDQ_FLUSH) != 0) {
/* Clean D-cache under command. */
cpu_dcache_wb_range((vm_offset_t)cmd, sizeof(*cmd));
} else {
/* DSB inner shareable, store */
dsb(ishst);
}
}
static inline uint64_t
its_cmd_cwriter_offset(struct gicv3_its_softc *sc, struct its_cmd *cmd)
{
uint64_t off;
off = (cmd - sc->sc_its_cmd_base) * sizeof(*cmd);
return (off);
}
static void
its_cmd_wait_completion(device_t dev, struct its_cmd *cmd_first,
struct its_cmd *cmd_last)
{
struct gicv3_its_softc *sc;
uint64_t first, last, read;
size_t us_left;
sc = device_get_softc(dev);
/*
* XXX ARM64TODO: This is obviously a significant delay.
* The reason for that is that currently the time frames for
* the command to complete are not known.
*/
us_left = 1000000;
first = its_cmd_cwriter_offset(sc, cmd_first);
last = its_cmd_cwriter_offset(sc, cmd_last);
for (;;) {
read = gic_its_read_8(sc, GITS_CREADR);
if (first < last) {
if (read < first || read >= last)
break;
} else if (read < first && read >= last)
break;
if (us_left-- == 0) {
/* This means timeout */
device_printf(dev,
"Timeout while waiting for CMD completion.\n");
return;
}
DELAY(1);
}
}
static struct its_cmd *
its_cmd_alloc_locked(device_t dev)
{
struct gicv3_its_softc *sc;
struct its_cmd *cmd;
size_t us_left;
sc = device_get_softc(dev);
/*
* XXX ARM64TODO: This is obviously a significant delay.
* The reason for that is that currently the time frames for
* the command to complete (and therefore free the descriptor)
* are not known.
*/
us_left = 1000000;
mtx_assert(&sc->sc_its_cmd_lock, MA_OWNED);
while (its_cmd_queue_full(sc)) {
if (us_left-- == 0) {
/* Timeout while waiting for free command */
device_printf(dev,
"Timeout while waiting for free command\n");
return (NULL);
}
DELAY(1);
}
cmd = &sc->sc_its_cmd_base[sc->sc_its_cmd_next_idx];
sc->sc_its_cmd_next_idx++;
sc->sc_its_cmd_next_idx %= ITS_CMDQ_SIZE / sizeof(struct its_cmd);
return (cmd);
}
static uint64_t
its_cmd_prepare(struct its_cmd *cmd, struct its_cmd_desc *desc)
{
uint64_t target;
uint8_t cmd_type;
u_int size;
- boolean_t error;
- error = FALSE;
cmd_type = desc->cmd_type;
target = ITS_TARGET_NONE;
switch (cmd_type) {
case ITS_CMD_MOVI: /* Move interrupt ID to another collection */
target = desc->cmd_desc_movi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MOVI);
cmd_format_id(cmd, desc->cmd_desc_movi.id);
cmd_format_col(cmd, desc->cmd_desc_movi.col->col_id);
cmd_format_devid(cmd, desc->cmd_desc_movi.its_dev->devid);
break;
case ITS_CMD_SYNC: /* Wait for previous commands completion */
target = desc->cmd_desc_sync.col->col_target;
cmd_format_command(cmd, ITS_CMD_SYNC);
cmd_format_target(cmd, target);
break;
case ITS_CMD_MAPD: /* Assign ITT to device */
cmd_format_command(cmd, ITS_CMD_MAPD);
cmd_format_itt(cmd, vtophys(desc->cmd_desc_mapd.its_dev->itt));
/*
* Size describes number of bits to encode interrupt IDs
* supported by the device minus one.
* When V (valid) bit is zero, this field should be written
* as zero.
*/
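/*
 * For example, a device that was allocated 32 LPIs ends up with
 * fls(32) - 1 = 5 below, i.e. a 6-bit EventID space.
 */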
if (desc->cmd_desc_mapd.valid != 0) {
size = fls(desc->cmd_desc_mapd.its_dev->lpis.lpi_num);
size = MAX(1, size) - 1;
} else
size = 0;
cmd_format_size(cmd, size);
cmd_format_devid(cmd, desc->cmd_desc_mapd.its_dev->devid);
cmd_format_valid(cmd, desc->cmd_desc_mapd.valid);
break;
case ITS_CMD_MAPC: /* Map collection to Re-Distributor */
target = desc->cmd_desc_mapc.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPC);
cmd_format_col(cmd, desc->cmd_desc_mapc.col->col_id);
cmd_format_valid(cmd, desc->cmd_desc_mapc.valid);
cmd_format_target(cmd, target);
break;
case ITS_CMD_MAPTI:
target = desc->cmd_desc_mapvi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPTI);
cmd_format_devid(cmd, desc->cmd_desc_mapvi.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_mapvi.id);
cmd_format_pid(cmd, desc->cmd_desc_mapvi.pid);
cmd_format_col(cmd, desc->cmd_desc_mapvi.col->col_id);
break;
case ITS_CMD_MAPI:
target = desc->cmd_desc_mapi.col->col_target;
cmd_format_command(cmd, ITS_CMD_MAPI);
cmd_format_devid(cmd, desc->cmd_desc_mapi.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_mapi.pid);
cmd_format_col(cmd, desc->cmd_desc_mapi.col->col_id);
break;
case ITS_CMD_INV:
target = desc->cmd_desc_inv.col->col_target;
cmd_format_command(cmd, ITS_CMD_INV);
cmd_format_devid(cmd, desc->cmd_desc_inv.its_dev->devid);
cmd_format_id(cmd, desc->cmd_desc_inv.pid);
break;
case ITS_CMD_INVALL:
cmd_format_command(cmd, ITS_CMD_INVALL);
cmd_format_col(cmd, desc->cmd_desc_invall.col->col_id);
break;
default:
panic("its_cmd_prepare: Invalid command: %x", cmd_type);
}
return (target);
}
static int
its_cmd_send(device_t dev, struct its_cmd_desc *desc)
{
struct gicv3_its_softc *sc;
struct its_cmd *cmd, *cmd_sync, *cmd_write;
struct its_col col_sync;
struct its_cmd_desc desc_sync;
uint64_t target, cwriter;
sc = device_get_softc(dev);
mtx_lock_spin(&sc->sc_its_cmd_lock);
cmd = its_cmd_alloc_locked(dev);
if (cmd == NULL) {
device_printf(dev, "could not allocate ITS command\n");
mtx_unlock_spin(&sc->sc_its_cmd_lock);
return (EBUSY);
}
target = its_cmd_prepare(cmd, desc);
its_cmd_sync(sc, cmd);
if (target != ITS_TARGET_NONE) {
cmd_sync = its_cmd_alloc_locked(dev);
if (cmd_sync != NULL) {
desc_sync.cmd_type = ITS_CMD_SYNC;
col_sync.col_target = target;
desc_sync.cmd_desc_sync.col = &col_sync;
its_cmd_prepare(cmd_sync, &desc_sync);
its_cmd_sync(sc, cmd_sync);
}
}
/* Update GITS_CWRITER */
cwriter = sc->sc_its_cmd_next_idx * sizeof(struct its_cmd);
gic_its_write_8(sc, GITS_CWRITER, cwriter);
cmd_write = &sc->sc_its_cmd_base[sc->sc_its_cmd_next_idx];
mtx_unlock_spin(&sc->sc_its_cmd_lock);
its_cmd_wait_completion(dev, cmd, cmd_write);
return (0);
}
/* Handlers to send commands */
static void
its_cmd_movi(device_t dev, struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
sc = device_get_softc(dev);
col = sc->sc_its_cols[CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1];
desc.cmd_type = ITS_CMD_MOVI;
desc.cmd_desc_movi.its_dev = girq->gi_its_dev;
desc.cmd_desc_movi.col = col;
desc.cmd_desc_movi.id = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapc(device_t dev, struct its_col *col, uint8_t valid)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_MAPC;
desc.cmd_desc_mapc.col = col;
/*
* Valid bit set - map the collection.
* Valid bit cleared - unmap the collection.
*/
desc.cmd_desc_mapc.valid = valid;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapti(device_t dev, struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
u_int col_id;
sc = device_get_softc(dev);
col_id = CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1;
col = sc->sc_its_cols[col_id];
desc.cmd_type = ITS_CMD_MAPTI;
desc.cmd_desc_mapvi.its_dev = girq->gi_its_dev;
desc.cmd_desc_mapvi.col = col;
/* The EventID sent to the device */
desc.cmd_desc_mapvi.id = girq->gi_irq - girq->gi_its_dev->lpis.lpi_base;
/* The physical interrupt presented to software */
desc.cmd_desc_mapvi.pid = girq->gi_irq + sc->sc_irq_base;
its_cmd_send(dev, &desc);
}
static void
its_cmd_mapd(device_t dev, struct its_dev *its_dev, uint8_t valid)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_MAPD;
desc.cmd_desc_mapd.its_dev = its_dev;
desc.cmd_desc_mapd.valid = valid;
its_cmd_send(dev, &desc);
}
static void
its_cmd_inv(device_t dev, struct its_dev *its_dev,
struct gicv3_its_irqsrc *girq)
{
struct gicv3_its_softc *sc;
struct its_cmd_desc desc;
struct its_col *col;
sc = device_get_softc(dev);
col = sc->sc_its_cols[CPU_FFS(&girq->gi_isrc.isrc_cpu) - 1];
desc.cmd_type = ITS_CMD_INV;
/* The EventID sent to the device */
desc.cmd_desc_inv.pid = girq->gi_irq - its_dev->lpis.lpi_base;
desc.cmd_desc_inv.its_dev = its_dev;
desc.cmd_desc_inv.col = col;
its_cmd_send(dev, &desc);
}
static void
its_cmd_invall(device_t dev, struct its_col *col)
{
struct its_cmd_desc desc;
desc.cmd_type = ITS_CMD_INVALL;
desc.cmd_desc_invall.col = col;
its_cmd_send(dev, &desc);
}
#ifdef FDT
static device_probe_t gicv3_its_fdt_probe;
static device_attach_t gicv3_its_fdt_attach;
static device_method_t gicv3_its_fdt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, gicv3_its_fdt_probe),
DEVMETHOD(device_attach, gicv3_its_fdt_attach),
/* End */
DEVMETHOD_END
};
#define its_baseclasses its_fdt_baseclasses
DEFINE_CLASS_1(its, gicv3_its_fdt_driver, gicv3_its_fdt_methods,
sizeof(struct gicv3_its_softc), gicv3_its_driver);
#undef its_baseclasses
static devclass_t gicv3_its_fdt_devclass;
EARLY_DRIVER_MODULE(its, gic, gicv3_its_fdt_driver,
gicv3_its_fdt_devclass, 0, 0, BUS_PASS_INTERRUPT + BUS_PASS_ORDER_MIDDLE);
static int
gicv3_its_fdt_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (!ofw_bus_is_compatible(dev, "arm,gic-v3-its"))
return (ENXIO);
device_set_desc(dev, "ARM GIC Interrupt Translation Service");
return (BUS_PROBE_DEFAULT);
}
static int
gicv3_its_fdt_attach(device_t dev)
{
struct gicv3_its_softc *sc;
phandle_t xref;
int err;
sc = device_get_softc(dev);
sc->sc_irq_length = gicv3_get_nirqs(dev);
sc->sc_irq_base = GIC_FIRST_LPI;
sc->sc_irq_base += device_get_unit(dev) * sc->sc_irq_length;
err = gicv3_its_attach(dev);
if (err != 0)
return (err);
/* Register this device as an interrupt controller */
xref = OF_xref_from_node(ofw_bus_get_node(dev));
sc->sc_pic = intr_pic_register(dev, xref);
intr_pic_add_handler(device_get_parent(dev), sc->sc_pic,
gicv3_its_intr, sc, sc->sc_irq_base, sc->sc_irq_length);
/* Register this device to handle MSI interrupts */
intr_msi_register(dev, xref);
return (0);
}
#endif
Index: head/sys/arm64/arm64/machdep.c
===================================================================
--- head/sys/arm64/arm64/machdep.c (revision 327172)
+++ head/sys/arm64/arm64/machdep.c (revision 327173)
@@ -1,1235 +1,1234 @@
/*-
* Copyright (c) 2014 Andrew Turner
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include "opt_acpi.h"
#include "opt_compat.h"
#include "opt_platform.h"
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/devmap.h>
#include <sys/efi.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vdso.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/debug_monitor.h>
#include <machine/kdb.h>
#include <machine/machdep.h>
#include <machine/metadata.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/undefined.h>
#include <machine/vmparam.h>
#ifdef VFP
#include <machine/vfp.h>
#endif
#ifdef DEV_ACPI
#include <contrib/dev/acpica/include/acpi.h>
#include <machine/acpica_machdep.h>
#endif
#ifdef FDT
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#endif
enum arm64_bus arm64_bus_method = ARM64_BUS_NONE;
struct pcpu __pcpu[MAXCPU];
static struct trapframe proc0_tf;
vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2];
vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2];
int early_boot = 1;
int cold = 1;
long realmem = 0;
long Maxmem = 0;
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t physmap[PHYSMAP_SIZE];
u_int physmap_idx;
struct kva_md_info kmi;
int64_t dcache_line_size; /* The minimum D cache line size */
int64_t icache_line_size; /* The minimum I cache line size */
int64_t idcache_line_size; /* The minimum cache line size */
int64_t dczva_line_size; /* The size of cache line the dc zva zeroes */
int has_pan;
/*
* Physical address of the EFI System Table. Stashed from the metadata hints
* passed into the kernel and used by the EFI code to call runtime services.
*/
vm_paddr_t efi_systbl_phys;
/* pagezero_* implementations are provided in support.S */
void pagezero_simple(void *);
void pagezero_cache(void *);
/* pagezero_simple is default pagezero */
void (*pagezero)(void *p) = pagezero_simple;
static void
pan_setup(void)
{
uint64_t id_aa64mfr1;
id_aa64mfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
if (ID_AA64MMFR1_PAN(id_aa64mfr1) != ID_AA64MMFR1_PAN_NONE)
has_pan = 1;
}
void
pan_enable(void)
{
/*
* The LLVM integrated assembler doesn't understand the PAN
* PSTATE field. Because of this we need to manually create
* the instruction in an asm block. This is equivalent to:
* msr pan, #1
*
* This sets the PAN bit, stopping the kernel from accessing
* memory when userspace can also access it unless the kernel
* uses the userspace load/store instructions.
*/
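/*
* For illustration: the OR below places the immediate in the
* instruction's CRm field, so the word actually emitted is
* 0xd500409f | (0x1 << 8) == 0xd500419f.
*/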
if (has_pan) {
WRITE_SPECIALREG(sctlr_el1,
READ_SPECIALREG(sctlr_el1) & ~SCTLR_SPAN);
__asm __volatile(".inst 0xd500409f | (0x1 << 8)");
}
}
static void
cpu_startup(void *dummy)
{
undef_init();
identify_cpu();
vm_ksubmap_init(&kmi);
bufinit();
vm_pager_bufferinit();
}
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *frame;
frame = td->td_frame;
regs->sp = frame->tf_sp;
regs->lr = frame->tf_lr;
regs->elr = frame->tf_elr;
regs->spsr = frame->tf_spsr;
memcpy(regs->x, frame->tf_x, sizeof(regs->x));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *frame;
frame = td->td_frame;
frame->tf_sp = regs->sp;
frame->tf_lr = regs->lr;
frame->tf_elr = regs->elr;
frame->tf_spsr &= ~PSR_FLAGS;
frame->tf_spsr |= regs->spsr & PSR_FLAGS;
memcpy(frame->tf_x, regs->x, sizeof(frame->tf_x));
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *regs)
{
#ifdef VFP
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
/*
* If we have just been running VFP instructions we will
* need to save the state to memcpy it below.
*/
if (td == curthread)
vfp_save_state(td, pcb);
KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate,
("Called fill_fpregs while the kernel is using the VFP"));
memcpy(regs->fp_q, pcb->pcb_fpustate.vfp_regs,
sizeof(regs->fp_q));
regs->fp_cr = pcb->pcb_fpustate.vfp_fpcr;
regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr;
} else
#endif
memset(regs->fp_q, 0, sizeof(regs->fp_q));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *regs)
{
#ifdef VFP
struct pcb *pcb;
pcb = td->td_pcb;
KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate,
("Called set_fpregs while the kernel is using the VFP"));
memcpy(pcb->pcb_fpustate.vfp_regs, regs->fp_q, sizeof(regs->fp_q));
pcb->pcb_fpustate.vfp_fpcr = regs->fp_cr;
pcb->pcb_fpustate.vfp_fpsr = regs->fp_sr;
#endif
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *regs)
{
printf("ARM64TODO: fill_dbregs");
return (EDOOFUS);
}
int
set_dbregs(struct thread *td, struct dbreg *regs)
{
printf("ARM64TODO: set_dbregs");
return (EDOOFUS);
}
#ifdef COMPAT_FREEBSD32
int
fill_regs32(struct thread *td, struct reg32 *regs)
{
printf("ARM64TODO: fill_regs32");
return (EDOOFUS);
}
int
set_regs32(struct thread *td, struct reg32 *regs)
{
printf("ARM64TODO: set_regs32");
return (EDOOFUS);
}
int
fill_fpregs32(struct thread *td, struct fpreg32 *regs)
{
printf("ARM64TODO: fill_fpregs32");
return (EDOOFUS);
}
int
set_fpregs32(struct thread *td, struct fpreg32 *regs)
{
printf("ARM64TODO: set_fpregs32");
return (EDOOFUS);
}
int
fill_dbregs32(struct thread *td, struct dbreg32 *regs)
{
printf("ARM64TODO: fill_dbregs32");
return (EDOOFUS);
}
int
set_dbregs32(struct thread *td, struct dbreg32 *regs)
{
printf("ARM64TODO: set_dbregs32");
return (EDOOFUS);
}
#endif
int
ptrace_set_pc(struct thread *td, u_long addr)
{
printf("ARM64TODO: ptrace_set_pc");
return (EDOOFUS);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_spsr |= PSR_SS;
td->td_pcb->pcb_flags |= PCB_SINGLE_STEP;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_spsr &= ~PSR_SS;
td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP;
return (0);
}
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
memset(tf, 0, sizeof(struct trapframe));
tf->tf_x[0] = stack;
tf->tf_sp = STACKALIGN(stack);
tf->tf_lr = imgp->entry_addr;
tf->tf_elr = imgp->entry_addr;
}
/* Sanity check these are the same size, they will be memcpy'd to and fro */
CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
sizeof((struct gpregs *)0)->gp_x);
CTASSERT(sizeof(((struct trapframe *)0)->tf_x) ==
sizeof((struct reg *)0)->x);
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
struct trapframe *tf = td->td_frame;
if (clear_ret & GET_MC_CLEAR_RET) {
mcp->mc_gpregs.gp_x[0] = 0;
mcp->mc_gpregs.gp_spsr = tf->tf_spsr & ~PSR_C;
} else {
mcp->mc_gpregs.gp_x[0] = tf->tf_x[0];
mcp->mc_gpregs.gp_spsr = tf->tf_spsr;
}
memcpy(&mcp->mc_gpregs.gp_x[1], &tf->tf_x[1],
sizeof(mcp->mc_gpregs.gp_x[1]) * (nitems(mcp->mc_gpregs.gp_x) - 1));
mcp->mc_gpregs.gp_sp = tf->tf_sp;
mcp->mc_gpregs.gp_lr = tf->tf_lr;
mcp->mc_gpregs.gp_elr = tf->tf_elr;
return (0);
}
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
struct trapframe *tf = td->td_frame;
uint32_t spsr;
spsr = mcp->mc_gpregs.gp_spsr;
if ((spsr & PSR_M_MASK) != PSR_M_EL0t ||
(spsr & (PSR_AARCH32 | PSR_F | PSR_I | PSR_A | PSR_D)) != 0)
return (EINVAL);
memcpy(tf->tf_x, mcp->mc_gpregs.gp_x, sizeof(tf->tf_x));
tf->tf_sp = mcp->mc_gpregs.gp_sp;
tf->tf_lr = mcp->mc_gpregs.gp_lr;
tf->tf_elr = mcp->mc_gpregs.gp_elr;
tf->tf_spsr = mcp->mc_gpregs.gp_spsr;
return (0);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifdef VFP
struct pcb *curpcb;
critical_enter();
curpcb = curthread->td_pcb;
if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) {
/*
* If we have just been running VFP instructions we will
* need to save the state to memcpy it below.
*/
vfp_save_state(td, curpcb);
KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
("Called get_fpcontext while the kernel is using the VFP"));
KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
("Non-userspace FPU flags set in get_fpcontext"));
memcpy(mcp->mc_fpregs.fp_q, curpcb->pcb_fpustate.vfp_regs,
sizeof(mcp->mc_fpregs));
mcp->mc_fpregs.fp_cr = curpcb->pcb_fpustate.vfp_fpcr;
mcp->mc_fpregs.fp_sr = curpcb->pcb_fpustate.vfp_fpsr;
mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags;
mcp->mc_flags |= _MC_FP_VALID;
}
critical_exit();
#endif
}
static void
set_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifdef VFP
struct pcb *curpcb;
critical_enter();
if ((mcp->mc_flags & _MC_FP_VALID) != 0) {
curpcb = curthread->td_pcb;
/*
* Discard any vfp state for the current thread, we
* are about to override it.
*/
vfp_discard(td);
KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate,
("Called set_fpcontext while the kernel is using the VFP"));
memcpy(curpcb->pcb_fpustate.vfp_regs, mcp->mc_fpregs.fp_q,
sizeof(mcp->mc_fpregs));
curpcb->pcb_fpustate.vfp_fpcr = mcp->mc_fpregs.fp_cr;
curpcb->pcb_fpustate.vfp_fpsr = mcp->mc_fpregs.fp_sr;
curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK;
}
critical_exit();
#endif
}
void
cpu_idle(int busy)
{
spinlock_enter();
if (!busy)
cpu_idleclock();
if (!sched_runnable())
__asm __volatile(
"dsb sy \n"
"wfi \n");
if (!busy)
cpu_activeclock();
spinlock_exit();
}
void
cpu_halt(void)
{
/* We should have shutdown by now, if not enter a low power sleep */
intr_disable();
while (1) {
__asm __volatile("wfi");
}
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* ARM64TODO TBD */
}
/* Get current clock frequency for the given CPU ID. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
struct pcpu *pc;
pc = pcpu_find(cpu_id);
if (pc == NULL || rate == NULL)
return (EINVAL);
if (pc->pc_clock == 0)
return (EOPNOTSUPP);
*rate = pc->pc_clock;
return (0);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
register_t daif;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
daif = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_daif = daif;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t daif;
td = curthread;
critical_exit();
daif = td->td_md.md_saved_daif;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(daif);
}
#ifndef _SYS_SYSPROTO_H_
struct sigreturn_args {
ucontext_t *ucp;
};
#endif
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
ucontext_t uc;
int error;
if (uap == NULL)
return (EFAULT);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
return (EFAULT);
error = set_mcontext(td, &uc.uc_mcontext);
if (error != 0)
return (error);
set_fpcontext(td, &uc.uc_mcontext);
/* Restore signal mask. */
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
int i;
for (i = 0; i < PCB_LR; i++)
pcb->pcb_x[i] = tf->tf_x[i];
pcb->pcb_x[PCB_LR] = tf->tf_lr;
pcb->pcb_pc = tf->tf_elr;
pcb->pcb_sp = tf->tf_sp;
}
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct thread *td;
struct proc *p;
struct trapframe *tf;
struct sigframe *fp, frame;
struct sigacts *psp;
struct sysentvec *sysent;
- int code, onstack, sig;
+ int onstack, sig;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
- code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
onstack = sigonstack(tf->tf_sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else {
fp = (struct sigframe *)td->td_frame->tf_sp;
}
/* Make room, keeping the stack aligned */
fp--;
fp = (struct sigframe *)STACKALIGN(fp);
/* Fill in the frame to copy out */
get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
get_fpcontext(td, &frame.sf_uc.uc_mcontext);
frame.sf_si = ksi->ksi_info;
frame.sf_uc.uc_sigmask = *mask;
frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
frame.sf_uc.uc_stack = td->td_sigstk;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(td->td_proc);
/* Copy the sigframe out to the user's stack. */
if (copyout(&frame, fp, sizeof(*fp)) != 0) {
/* Process has trashed its stack. Kill it. */
CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
tf->tf_x[0] = sig;
tf->tf_x[1] = (register_t)&fp->sf_si;
tf->tf_x[2] = (register_t)&fp->sf_uc;
tf->tf_elr = (register_t)catcher;
tf->tf_sp = (register_t)fp;
sysent = p->p_sysent;
if (sysent->sv_sigcode_base != 0)
tf->tf_lr = (register_t)sysent->sv_sigcode_base;
else
tf->tf_lr = (register_t)(sysent->sv_psstrings -
*(sysent->sv_szsigcode));
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr,
tf->tf_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
static void
init_proc0(vm_offset_t kstack)
{
struct pcpu *pcpup = &__pcpu[0];
proc_linkup0(&proc0, &thread0);
thread0.td_kstack = kstack;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1;
thread0.td_pcb->pcb_fpflags = 0;
thread0.td_pcb->pcb_fpusaved = &thread0.td_pcb->pcb_fpustate;
thread0.td_pcb->pcb_vfpcpu = UINT_MAX;
thread0.td_frame = &proc0_tf;
pcpup->pc_curpcb = thread0.td_pcb;
}
typedef struct {
uint32_t type;
uint64_t phys_start;
uint64_t virt_start;
uint64_t num_pages;
uint64_t attr;
} EFI_MEMORY_DESCRIPTOR;
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
u_int i, insert_idx, _physmap_idx;
_physmap_idx = *physmap_idxp;
if (length == 0)
return (1);
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
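/*
* For example (illustrative addresses): with a single existing entry
* 0x40000000-0x80000000, adding 0x80000000-0xc0000000 is not flagged
* as overlapping and is merged by the append case below into one
* 0x40000000-0xc0000000 entry.
*/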
insert_idx = _physmap_idx;
for (i = 0; i <= _physmap_idx; i += 2) {
if (base < physmap[i + 1]) {
if (base + length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= _physmap_idx &&
base + length == physmap[insert_idx]) {
physmap[insert_idx] = base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += length;
return (1);
}
_physmap_idx += 2;
*physmap_idxp = _physmap_idx;
if (_physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = _physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = base;
physmap[insert_idx + 1] = base + length;
return (1);
}
#ifdef FDT
static void
add_fdt_mem_regions(struct mem_region *mr, int mrcnt, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
for (int i = 0; i < mrcnt; i++) {
if (!add_physmap_entry(mr[i].mr_start, mr[i].mr_size, physmap,
physmap_idxp))
break;
}
}
#endif
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
u_int *physmap_idxp)
{
struct efi_md *map, *p;
const char *type;
size_t efisz;
int ndesc, i;
static const char *types[] = {
"Reserved",
"LoaderCode",
"LoaderData",
"BootServicesCode",
"BootServicesData",
"RuntimeServicesCode",
"RuntimeServicesData",
"ConventionalMemory",
"UnusableMemory",
"ACPIReclaimMemory",
"ACPIMemoryNVS",
"MemoryMappedIO",
"MemoryMappedIOPortSpace",
"PalCode",
"PersistentMemory"
};
/*
* Memory map data provided by UEFI via the GetMemoryMap
* Boot Services API.
*/
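/*
* The expression below rounds the header size up to a 16-byte
* boundary; e.g. (illustrative) a 40-byte header gives efisz = 48.
*/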
efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
map = (struct efi_md *)((uint8_t *)efihdr + efisz);
if (efihdr->descriptor_size == 0)
return;
ndesc = efihdr->memory_size / efihdr->descriptor_size;
if (boothowto & RB_VERBOSE)
printf("%23s %12s %12s %8s %4s\n",
"Type", "Physical", "Virtual", "#Pages", "Attr");
for (i = 0, p = map; i < ndesc; i++,
p = efi_next_descriptor(p, efihdr->descriptor_size)) {
if (boothowto & RB_VERBOSE) {
if (p->md_type < nitems(types))
type = types[p->md_type];
else
type = "<INVALID>";
printf("%23s %012lx %12p %08lx ", type, p->md_phys,
p->md_virt, p->md_pages);
if (p->md_attr & EFI_MD_ATTR_UC)
printf("UC ");
if (p->md_attr & EFI_MD_ATTR_WC)
printf("WC ");
if (p->md_attr & EFI_MD_ATTR_WT)
printf("WT ");
if (p->md_attr & EFI_MD_ATTR_WB)
printf("WB ");
if (p->md_attr & EFI_MD_ATTR_UCE)
printf("UCE ");
if (p->md_attr & EFI_MD_ATTR_WP)
printf("WP ");
if (p->md_attr & EFI_MD_ATTR_RP)
printf("RP ");
if (p->md_attr & EFI_MD_ATTR_XP)
printf("XP ");
if (p->md_attr & EFI_MD_ATTR_NV)
printf("NV ");
if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
printf("MORE_RELIABLE ");
if (p->md_attr & EFI_MD_ATTR_RO)
printf("RO ");
if (p->md_attr & EFI_MD_ATTR_RT)
printf("RUNTIME");
printf("\n");
}
switch (p->md_type) {
case EFI_MD_TYPE_CODE:
case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_BS_CODE:
case EFI_MD_TYPE_BS_DATA:
case EFI_MD_TYPE_FREE:
/*
* We're allowed to use any entry with these types.
*/
break;
default:
continue;
}
if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
physmap, physmap_idxp))
break;
}
}
#ifdef FDT
static void
try_load_dtb(caddr_t kmdp)
{
vm_offset_t dtbp;
dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
if (dtbp == (vm_offset_t)NULL) {
printf("ERROR loading DTB\n");
return;
}
if (OF_install(OFW_FDT, 0) == FALSE)
panic("Cannot install FDT");
if (OF_init((void *)dtbp) != 0)
panic("OF_init failed with the found device tree");
}
#endif
static bool
bus_probe(void)
{
bool has_acpi, has_fdt;
char *order, *env;
has_acpi = has_fdt = false;
#ifdef FDT
has_fdt = (OF_peer(0) != 0);
#endif
#ifdef DEV_ACPI
has_acpi = (acpi_find_table(ACPI_SIG_SPCR) != 0);
#endif
env = kern_getenv("kern.cfg.order");
if (env != NULL) {
order = env;
while (order != NULL) {
if (has_acpi &&
strncmp(order, "acpi", 4) == 0 &&
(order[4] == ',' || order[4] == '\0')) {
arm64_bus_method = ARM64_BUS_ACPI;
break;
}
if (has_fdt &&
strncmp(order, "fdt", 3) == 0 &&
(order[3] == ',' || order[3] == '\0')) {
arm64_bus_method = ARM64_BUS_FDT;
break;
}
order = strchr(order, ',');
}
freeenv(env);
/* If we set the bus method it is valid */
if (arm64_bus_method != ARM64_BUS_NONE)
return (true);
}
/* If no order or an invalid order was set use the default */
if (arm64_bus_method == ARM64_BUS_NONE) {
if (has_fdt)
arm64_bus_method = ARM64_BUS_FDT;
else if (has_acpi)
arm64_bus_method = ARM64_BUS_ACPI;
}
/*
* If no option was set the default is valid. Otherwise we have
* still set a default so that cninit() works, and the caller will
* panic to tell the user about the invalid bus setup.
*/
return (env == NULL);
}
static void
cache_setup(void)
{
int dcache_line_shift, icache_line_shift, dczva_line_shift;
uint32_t ctr_el0;
uint32_t dczid_el0;
ctr_el0 = READ_SPECIALREG(ctr_el0);
/* Read the log2 words in each D cache line */
dcache_line_shift = CTR_DLINE_SIZE(ctr_el0);
/* Get the D cache line size */
dcache_line_size = sizeof(int) << dcache_line_shift;
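/*
* e.g. (illustrative) a CTR_EL0.DminLine value of 4 means
* 1 << 4 = 16 words, i.e. a 64-byte D cache line.
*/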
/* And the same for the I cache */
icache_line_shift = CTR_ILINE_SIZE(ctr_el0);
icache_line_size = sizeof(int) << icache_line_shift;
idcache_line_size = MIN(dcache_line_size, icache_line_size);
dczid_el0 = READ_SPECIALREG(dczid_el0);
/* Check if dc zva is not prohibited */
if (dczid_el0 & DCZID_DZP)
dczva_line_size = 0;
else {
/* Same as with above calculations */
dczva_line_shift = DCZID_BS_SIZE(dczid_el0);
dczva_line_size = sizeof(int) << dczva_line_shift;
/* Change pagezero function */
pagezero = pagezero_cache;
}
}
void
initarm(struct arm64_bootparams *abp)
{
struct efi_map_header *efihdr;
struct pcpu *pcpup;
char *env;
#ifdef FDT
struct mem_region mem_regions[FDT_MEM_REGIONS];
int mem_regions_sz;
#endif
vm_offset_t lastaddr;
caddr_t kmdp;
vm_paddr_t mem_len;
bool valid;
int i;
/* Set the module data location */
preload_metadata = (caddr_t)(uintptr_t)(abp->modulep);
/* Find the kernel address */
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0);
#ifdef FDT
try_load_dtb(kmdp);
#endif
efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
/* Find the address to start allocating from */
lastaddr = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
/* Load the physical memory ranges */
physmap_idx = 0;
efihdr = (struct efi_map_header *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_EFI_MAP);
if (efihdr != NULL)
add_efi_map_entries(efihdr, physmap, &physmap_idx);
#ifdef FDT
else {
/* Grab physical memory regions information from device tree. */
if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,
NULL) != 0)
panic("Cannot get physical memory regions");
add_fdt_mem_regions(mem_regions, mem_regions_sz, physmap,
&physmap_idx);
}
#endif
/* Print the memory map */
mem_len = 0;
for (i = 0; i < physmap_idx; i += 2) {
dump_avail[i] = physmap[i];
dump_avail[i + 1] = physmap[i + 1];
mem_len += physmap[i + 1] - physmap[i];
}
dump_avail[i] = 0;
dump_avail[i + 1] = 0;
/* Set the pcpu data, this is needed by pmap_bootstrap */
pcpup = &__pcpu[0];
pcpu_init(pcpup, 0, sizeof(struct pcpu));
/*
* Set the pcpu pointer with a backup in tpidr_el1 to be
* loaded when entering the kernel from userland.
*/
__asm __volatile(
"mov x18, %0 \n"
"msr tpidr_el1, %0" :: "r"(pcpup));
PCPU_SET(curthread, &thread0);
/* Do basic tuning, hz etc */
init_param1();
cache_setup();
pan_setup();
/* Bootstrap enough of pmap to enter the kernel proper */
pmap_bootstrap(abp->kern_l0pt, abp->kern_l1pt,
KERNBASE - abp->kern_delta, lastaddr - KERNBASE);
devmap_bootstrap(0, NULL);
valid = bus_probe();
cninit();
if (!valid)
panic("Invalid bus configuration: %s",
kern_getenv("kern.cfg.order"));
init_proc0(abp->kern_stack);
msgbufinit(msgbufp, msgbufsize);
mutex_init();
init_param2(physmem);
dbg_init();
kdb_init();
pan_enable();
env = kern_getenv("kernelname");
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
early_boot = 0;
}
void
dbg_init(void)
{
/* Clear OS lock */
WRITE_SPECIALREG(OSLAR_EL1, 0);
/* This permits DDB to use debug registers for watchpoints. */
dbg_monitor_init();
/* TODO: Eventually will need to initialize debug registers here. */
}
#ifdef DDB
#include <ddb/ddb.h>
DB_SHOW_COMMAND(specialregs, db_show_spregs)
{
#define PRINT_REG(reg) \
db_printf(__STRING(reg) " = %#016lx\n", READ_SPECIALREG(reg))
PRINT_REG(actlr_el1);
PRINT_REG(afsr0_el1);
PRINT_REG(afsr1_el1);
PRINT_REG(aidr_el1);
PRINT_REG(amair_el1);
PRINT_REG(ccsidr_el1);
PRINT_REG(clidr_el1);
PRINT_REG(contextidr_el1);
PRINT_REG(cpacr_el1);
PRINT_REG(csselr_el1);
PRINT_REG(ctr_el0);
PRINT_REG(currentel);
PRINT_REG(daif);
PRINT_REG(dczid_el0);
PRINT_REG(elr_el1);
PRINT_REG(esr_el1);
PRINT_REG(far_el1);
#if 0
/* ARM64TODO: Enable VFP before reading floating-point registers */
PRINT_REG(fpcr);
PRINT_REG(fpsr);
#endif
PRINT_REG(id_aa64afr0_el1);
PRINT_REG(id_aa64afr1_el1);
PRINT_REG(id_aa64dfr0_el1);
PRINT_REG(id_aa64dfr1_el1);
PRINT_REG(id_aa64isar0_el1);
PRINT_REG(id_aa64isar1_el1);
PRINT_REG(id_aa64pfr0_el1);
PRINT_REG(id_aa64pfr1_el1);
PRINT_REG(id_afr0_el1);
PRINT_REG(id_dfr0_el1);
PRINT_REG(id_isar0_el1);
PRINT_REG(id_isar1_el1);
PRINT_REG(id_isar2_el1);
PRINT_REG(id_isar3_el1);
PRINT_REG(id_isar4_el1);
PRINT_REG(id_isar5_el1);
PRINT_REG(id_mmfr0_el1);
PRINT_REG(id_mmfr1_el1);
PRINT_REG(id_mmfr2_el1);
PRINT_REG(id_mmfr3_el1);
#if 0
/* Missing from llvm */
PRINT_REG(id_mmfr4_el1);
#endif
PRINT_REG(id_pfr0_el1);
PRINT_REG(id_pfr1_el1);
PRINT_REG(isr_el1);
PRINT_REG(mair_el1);
PRINT_REG(midr_el1);
PRINT_REG(mpidr_el1);
PRINT_REG(mvfr0_el1);
PRINT_REG(mvfr1_el1);
PRINT_REG(mvfr2_el1);
PRINT_REG(revidr_el1);
PRINT_REG(sctlr_el1);
PRINT_REG(sp_el0);
PRINT_REG(spsel);
PRINT_REG(spsr_el1);
PRINT_REG(tcr_el1);
PRINT_REG(tpidr_el0);
PRINT_REG(tpidr_el1);
PRINT_REG(tpidrro_el0);
PRINT_REG(ttbr0_el1);
PRINT_REG(ttbr1_el1);
PRINT_REG(vbar_el1);
#undef PRINT_REG
}
DB_SHOW_COMMAND(vtop, db_show_vtop)
{
uint64_t phys;
if (have_addr) {
phys = arm64_address_translate_s1e1r(addr);
db_printf("EL1 physical address reg (read): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e1w(addr);
db_printf("EL1 physical address reg (write): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e0r(addr);
db_printf("EL0 physical address reg (read): 0x%016lx\n", phys);
phys = arm64_address_translate_s1e0w(addr);
db_printf("EL0 physical address reg (write): 0x%016lx\n", phys);
} else
db_printf("show vtop <virt_addr>\n");
}
#endif
Index: head/sys/arm64/arm64/pmap.c
===================================================================
--- head/sys/arm64/arm64/pmap.c (revision 327172)
+++ head/sys/arm64/arm64/pmap.c (revision 327173)
@@ -1,4838 +1,4837 @@
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
* Copyright (c) 2003 Peter Wemm
* All rights reserved.
* Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
* All rights reserved.
* Copyright (c) 2014 Andrew Turner
* All rights reserved.
* Copyright (c) 2014-2016 The FreeBSD Foundation
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department and William Jolitz of UUNET Technologies Inc.
*
* This software was developed by Andrew Turner under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
*/
/*-
* Copyright (c) 2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jake Burkholder,
* Safeport Network Services, and Network Associates Laboratories, the
* Security Research Division of Network Associates, Inc. under
* DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
* CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Manages physical address maps.
*
* Since the information managed by this module is
* also stored by the logical address mapping module,
* this module may throw away valid virtual-to-physical
* mappings at almost any time. However, invalidations
* of virtual-to-physical mappings must be done as
* requested.
*
* In order to cope with hardware architectures which
* make virtual-to-physical map invalidates expensive,
* this module may delay invalidate or reduced protection
* operations until such time as they are actually
* necessary. This module is given full information as
* to which processors are currently using which maps,
* and to when physical maps must be made correct.
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
#define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
#define NUL0E L0_ENTRIES
#define NUL1E (NUL0E * NL1PG)
#define NUL2E (NUL1E * NL2PG)
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE extern inline
#endif
#else
#define PMAP_INLINE
#endif
/*
* These are configured by the mair_el1 register. This is set up in locore.S
*/
#define DEVICE_MEMORY 0
#define UNCACHED_MEMORY 1
#define CACHED_MEMORY 2
#ifdef PV_STATS
#define PV_STAT(x) do { x ; } while (0)
#else
#define PV_STAT(x) do { } while (0)
#endif
#define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
#define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)])
#define NPV_LIST_LOCKS MAXCPU
#define PHYS_TO_PV_LIST_LOCK(pa) \
(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
struct rwlock **_lockp = (lockp); \
struct rwlock *_new_lock; \
\
_new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
if (_new_lock != *_lockp) { \
if (*_lockp != NULL) \
rw_wunlock(*_lockp); \
*_lockp = _new_lock; \
rw_wlock(*_lockp); \
} \
} while (0)
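/*
* For example, code that iterates over mappings with different physical
* addresses drops the previously held pv list lock (if any) and takes
* the one hashed from the new address, so at most one pv list lock is
* held through *lockp at a time.
*/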
#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
#define RELEASE_PV_LIST_LOCK(lockp) do { \
struct rwlock **_lockp = (lockp); \
\
if (*_lockp != NULL) { \
rw_wunlock(*_lockp); \
*_lockp = NULL; \
} \
} while (0)
#define VM_PAGE_TO_PV_LIST_LOCK(m) \
PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
struct pmap kernel_pmap_store;
vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
/*
* Data for the pv entry allocation mechanism.
* Updates to pv_invl_gen are protected by the pv_list_locks[]
* elements, but reads are not.
*/
static struct md_page *pv_table;
static struct md_page pv_dummy;
vm_paddr_t dmap_phys_base; /* The start of the dmap region */
vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
#define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
extern pt_entry_t pagetable_dmap[];
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
"Are large page mappings enabled?");
/*
* Data for the pv entry allocation mechanism
*/
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static void free_pv_chunk(struct pv_chunk *pc);
static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
vm_offset_t va);
static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
vm_page_t m, struct rwlock **lockp);
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
/*
* These load the old table data and store the new value.
* They need to be atomic as the System MMU may write to the table at
* the same time as the CPU.
*/
#define pmap_load_store(table, entry) atomic_swap_64(table, entry)
#define pmap_set(table, mask) atomic_set_64(table, mask)
#define pmap_load_clear(table) atomic_swap_64(table, 0)
#define pmap_load(table) (*table)
/********************/
/* Inline functions */
/********************/
static __inline void
pagecopy(void *s, void *d)
{
memcpy(d, s, PAGE_SIZE);
}
static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{
return (&pmap->pm_l0[pmap_l0_index(va)]);
}
static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
pd_entry_t *l1;
l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
return (&l1[pmap_l1_index(va)]);
}
static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *l0;
l0 = pmap_l0(pmap, va);
if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
return (NULL);
return (pmap_l0_to_l1(l0, va));
}
static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
pd_entry_t *l2;
l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
return (&l2[pmap_l2_index(va)]);
}
static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *l1;
l1 = pmap_l1(pmap, va);
if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
return (NULL);
return (pmap_l1_to_l2(l1, va));
}
static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
pt_entry_t *l3;
l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
return (&l3[pmap_l3_index(va)]);
}
/*
* Returns the lowest valid pde for a given virtual address.
* The next level may or may not point to a valid page or block.
*/
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
pd_entry_t *l0, *l1, *l2, desc;
l0 = pmap_l0(pmap, va);
desc = pmap_load(l0) & ATTR_DESCR_MASK;
if (desc != L0_TABLE) {
*level = -1;
return (NULL);
}
l1 = pmap_l0_to_l1(l0, va);
desc = pmap_load(l1) & ATTR_DESCR_MASK;
if (desc != L1_TABLE) {
*level = 0;
return (l0);
}
l2 = pmap_l1_to_l2(l1, va);
desc = pmap_load(l2) & ATTR_DESCR_MASK;
if (desc != L2_TABLE) {
*level = 1;
return (l1);
}
*level = 2;
return (l2);
}
/*
* Returns the lowest valid pte block or table entry for a given virtual
* address. If there are no valid entries return NULL and set the level to
* the first invalid level.
*/
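/*
* For example (illustrative, with a 4K granule): pmap_pte() on a VA
* covered by a 2MB L2 block returns the L2 entry with *level set to 2,
* while a VA backed by a 4KB page returns the L3 entry with *level
* set to 3.
*/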
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
pd_entry_t *l1, *l2, desc;
pt_entry_t *l3;
l1 = pmap_l1(pmap, va);
if (l1 == NULL) {
*level = 0;
return (NULL);
}
desc = pmap_load(l1) & ATTR_DESCR_MASK;
if (desc == L1_BLOCK) {
*level = 1;
return (l1);
}
if (desc != L1_TABLE) {
*level = 1;
return (NULL);
}
l2 = pmap_l1_to_l2(l1, va);
desc = pmap_load(l2) & ATTR_DESCR_MASK;
if (desc == L2_BLOCK) {
*level = 2;
return (l2);
}
if (desc != L2_TABLE) {
*level = 2;
return (NULL);
}
*level = 3;
l3 = pmap_l2_to_l3(l2, va);
if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
return (NULL);
return (l3);
}
static inline bool
pmap_superpages_enabled(void)
{
return (superpages_enabled != 0);
}
bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
pd_entry_t **l2, pt_entry_t **l3)
{
pd_entry_t *l0p, *l1p, *l2p;
if (pmap->pm_l0 == NULL)
return (false);
l0p = pmap_l0(pmap, va);
*l0 = l0p;
if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
return (false);
l1p = pmap_l0_to_l1(l0p, va);
*l1 = l1p;
if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
*l2 = NULL;
*l3 = NULL;
return (true);
}
if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
return (false);
l2p = pmap_l1_to_l2(l1p, va);
*l2 = l2p;
if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
*l3 = NULL;
return (true);
}
*l3 = pmap_l2_to_l3(l2p, va);
return (true);
}
static __inline int
pmap_l3_valid(pt_entry_t l3)
{
return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}
CTASSERT(L1_BLOCK == L2_BLOCK);
/*
* Checks if the page is dirty. We currently lack proper tracking of this on
* arm64, so for now assume a page mapped rw is dirty once it has been accessed.
*/
static inline int
pmap_page_dirty(pt_entry_t pte)
{
return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
(ATTR_AF | ATTR_AP(ATTR_AP_RW)));
}
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
pmap->pm_stats.resident_count += count;
}
static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(pmap->pm_stats.resident_count >= count,
("pmap %p resident count underflow %ld %d", pmap,
pmap->pm_stats.resident_count, count));
pmap->pm_stats.resident_count -= count;
}
static pt_entry_t *
pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
u_int *l2_slot)
{
pt_entry_t *l2;
pd_entry_t *l1;
l1 = (pd_entry_t *)l1pt;
*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
/* Check that locore has used an L1 table mapping */
KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
("Invalid bootstrap L1 table"));
/* Find the address of the L2 table */
l2 = (pt_entry_t *)init_pt_va;
*l2_slot = pmap_l2_index(va);
return (l2);
}
static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
{
u_int l1_slot, l2_slot;
pt_entry_t *l2;
l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
}
static void
pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
{
vm_offset_t va;
vm_paddr_t pa;
u_int l1_slot;
pa = dmap_phys_base = min_pa & ~L1_OFFSET;
va = DMAP_MIN_ADDRESS;
for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
pmap_load_store(&pagetable_dmap[l1_slot],
(pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
}
/* Set the upper limit of the DMAP region */
dmap_phys_max = pa;
dmap_max_addr = va;
cpu_tlb_flushID();
}
static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
{
vm_offset_t l2pt;
vm_paddr_t pa;
pd_entry_t *l1;
u_int l1_slot;
KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
l1 = (pd_entry_t *)l1pt;
l1_slot = pmap_l1_index(va);
l2pt = l2_start;
for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
pa = pmap_early_vtophys(l1pt, l2pt);
pmap_load_store(&l1[l1_slot],
(pa & ~Ln_TABLE_MASK) | L1_TABLE);
l2pt += PAGE_SIZE;
}
/* Clean the L2 page table */
memset((void *)l2_start, 0, l2pt - l2_start);
return (l2pt);
}
static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
{
- vm_offset_t l2pt, l3pt;
+ vm_offset_t l3pt;
vm_paddr_t pa;
pd_entry_t *l2;
u_int l2_slot;
KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
l2 = pmap_l2(kernel_pmap, va);
l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
- l2pt = (vm_offset_t)l2;
l2_slot = pmap_l2_index(va);
l3pt = l3_start;
for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
pa = pmap_early_vtophys(l1pt, l3pt);
pmap_load_store(&l2[l2_slot],
(pa & ~Ln_TABLE_MASK) | L2_TABLE);
l3pt += PAGE_SIZE;
}
/* Clean the L3 page table */
memset((void *)l3_start, 0, l3pt - l3_start);
return (l3pt);
}
/*
* Bootstrap the system enough to run with virtual memory.
*/
void
pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
vm_size_t kernlen)
{
u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot;
uint64_t kern_delta;
pt_entry_t *l2;
vm_offset_t va, freemempos;
vm_offset_t dpcpu, msgbufpv;
vm_paddr_t pa, max_pa, min_pa;
int i;
kern_delta = KERNBASE - kernstart;
physmem = 0;
printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
printf("%lx\n", l1pt);
printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
/* Set this early so we can use the pagetable walking functions */
kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
PMAP_LOCK_INIT(kernel_pmap);
/* Assume the address we were loaded to is a valid physical address */
min_pa = max_pa = KERNBASE - kern_delta;
/*
* Find the minimum physical address. physmap is sorted,
* but may contain empty ranges.
*/
for (i = 0; i < (physmap_idx * 2); i += 2) {
if (physmap[i] == physmap[i + 1])
continue;
if (physmap[i] <= min_pa)
min_pa = physmap[i];
if (physmap[i + 1] > max_pa)
max_pa = physmap[i + 1];
}
/* Create a direct map region early so we can use it for pa -> va */
pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
va = KERNBASE;
pa = KERNBASE - kern_delta;
/*
* Start to initialise phys_avail by copying from physmap
* up to the physical address KERNBASE points at.
*/
map_slot = avail_slot = 0;
for (; map_slot < (physmap_idx * 2) &&
avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) {
if (physmap[map_slot] == physmap[map_slot + 1])
continue;
if (physmap[map_slot] <= pa &&
physmap[map_slot + 1] > pa)
break;
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
/* Add the memory before the kernel */
if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) {
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = pa;
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
used_map_slot = map_slot;
/*
* Read the page table to find out what is already mapped.
* This assumes we have mapped a block of memory from KERNBASE
* using a single L1 entry.
*/
l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
/* Sanity check the index, KERNBASE should be the first VA */
KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
/* Find how many pages we have mapped */
for (; l2_slot < Ln_ENTRIES; l2_slot++) {
if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
break;
/* Check locore used L2 blocks */
KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
("Invalid bootstrap L2 table"));
KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
("Incorrect PA in L2 table"));
va += L2_SIZE;
pa += L2_SIZE;
}
va = roundup2(va, L1_SIZE);
freemempos = KERNBASE + kernlen;
freemempos = roundup2(freemempos, PAGE_SIZE);
/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
/* And the l3 tables for the early devmap */
freemempos = pmap_bootstrap_l3(l1pt,
VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
cpu_tlb_flushID();
#define alloc_pages(var, np) \
(var) = freemempos; \
freemempos += (np * PAGE_SIZE); \
memset((char *)(var), 0, ((np) * PAGE_SIZE));
/* Allocate dynamic per-cpu area. */
alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
dpcpu_init((void *)dpcpu, 0);
/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
msgbufp = (void *)msgbufpv;
virtual_avail = roundup2(freemempos, L1_SIZE);
virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
kernel_vm_end = virtual_avail;
pa = pmap_early_vtophys(l1pt, freemempos);
/* Finish initialising physmap */
map_slot = used_map_slot;
for (; avail_slot < (PHYS_AVAIL_SIZE - 2) &&
map_slot < (physmap_idx * 2); map_slot += 2) {
if (physmap[map_slot] == physmap[map_slot + 1])
continue;
/* Have we used the current range? */
if (physmap[map_slot + 1] <= pa)
continue;
/* Do we need to split the entry? */
if (physmap[map_slot] < pa) {
phys_avail[avail_slot] = pa;
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
} else {
phys_avail[avail_slot] = physmap[map_slot];
phys_avail[avail_slot + 1] = physmap[map_slot + 1];
}
physmem += (phys_avail[avail_slot + 1] -
phys_avail[avail_slot]) >> PAGE_SHIFT;
avail_slot += 2;
}
phys_avail[avail_slot] = 0;
phys_avail[avail_slot + 1] = 0;
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage".
*/
Maxmem = atop(phys_avail[avail_slot - 1]);
cpu_tlb_flushID();
}
/*
* Initialize a vm_page's machine-dependent fields.
*/
void
pmap_page_init(vm_page_t m)
{
TAILQ_INIT(&m->md.pv_list);
m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}
/*
* Initialize the pmap module.
* Called by vm_init, to initialize any structures that the pmap
* system needs to map virtual memory.
*/
void
pmap_init(void)
{
vm_size_t s;
int i, pv_npg;
/*
* Are large page mappings enabled?
*/
TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
/*
* Initialize the pv chunk list mutex.
*/
mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
/*
* Initialize the pool of pv list locks.
*/
for (i = 0; i < NPV_LIST_LOCKS; i++)
rw_init(&pv_list_locks[i], "pmap pv list");
/*
* Calculate the size of the pv head table for superpages.
*/
pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
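/*
* e.g. (illustrative): with the last physical segment ending at 4GB and
* an L2_SIZE of 2MB, this gives pv_npg = 2048 head entries.
*/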
/*
* Allocate memory for the pv head table for superpages.
*/
s = (vm_size_t)(pv_npg * sizeof(struct md_page));
s = round_page(s);
pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
M_WAITOK | M_ZERO);
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
TAILQ_INIT(&pv_dummy.pv_list);
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
"2MB page mapping counters");
static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
&pmap_l2_demotions, 0, "2MB page demotions");
static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
&pmap_l2_p_failures, 0, "2MB page promotion failures");
static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
&pmap_l2_promotions, 0, "2MB page promotions");
/*
* Invalidate a single TLB entry.
*/
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
sched_pin();
__asm __volatile(
"dsb ishst \n"
"tlbi vaae1is, %0 \n"
"dsb ish \n"
"isb \n"
: : "r"(va >> PAGE_SHIFT));
sched_unpin();
}
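/*
 * Note on the sequence above (added commentary, not in the original source):
 * "dsb ishst" makes the preceding page table store visible before the
 * invalidate, "tlbi vaae1is" broadcasts an invalidate for the VA to the
 * inner-shareable domain across all ASIDs, "dsb ish" waits for the
 * invalidate to complete, and "isb" resynchronizes the instruction stream.
 */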
static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
vm_offset_t addr;
sched_pin();
dsb(ishst);
for (addr = sva; addr < eva; addr += PAGE_SIZE) {
__asm __volatile(
"tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
}
__asm __volatile(
"dsb ish \n"
"isb \n");
sched_unpin();
}
static __inline void
pmap_invalidate_all(pmap_t pmap)
{
sched_pin();
__asm __volatile(
"dsb ishst \n"
"tlbi vmalle1is \n"
"dsb ish \n"
"isb \n");
sched_unpin();
}
/*
* Routine: pmap_extract
* Function:
* Extract the physical page address associated
* with the given map/virtual_address pair.
*/
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
pt_entry_t *pte, tpte;
vm_paddr_t pa;
int lvl;
pa = 0;
PMAP_LOCK(pmap);
/*
* Find the block or page map for this virtual address. pmap_pte
* will return either a valid block/page entry, or NULL.
*/
pte = pmap_pte(pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
pa = tpte & ~ATTR_MASK;
switch(lvl) {
case 1:
KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_extract: Invalid L1 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L1_OFFSET);
break;
case 2:
KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_extract: Invalid L2 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L2_OFFSET);
break;
case 3:
KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
("pmap_extract: Invalid L3 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L3_OFFSET);
break;
}
}
PMAP_UNLOCK(pmap);
return (pa);
}
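/*
 * Illustrative aside, not part of the original source: a stand-alone sketch
 * of how pmap_extract() above rebuilds a physical address from a block or
 * page descriptor plus the VA offset.  The mask values are assumptions for
 * a 4 KB translation granule, not taken from this file's headers.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_L1_OFFSET	((1ULL << 30) - 1)	/* offset within a 1 GB block */
#define SK_L2_OFFSET	((1ULL << 21) - 1)	/* offset within a 2 MB block */
#define SK_L3_OFFSET	((1ULL << 12) - 1)	/* offset within a 4 KB page */
#define SK_ATTR_MASK	0xfff0000000000fffULL	/* upper/lower attribute bits */

int
main(void)
{
	uint64_t tpte = 0x0040000089400fd1ULL;	/* made-up L2 block entry */
	uint64_t va = 0xffff000000123456ULL;	/* made-up virtual address */
	uint64_t pa;

	/* The level-2 case from the switch above: pa |= (va & L2_OFFSET). */
	pa = (tpte & ~SK_ATTR_MASK) | (va & SK_L2_OFFSET);
	printf("pa = 0x%016llx\n", (unsigned long long)pa);
	return (0);
}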
/*
* Routine: pmap_extract_and_hold
* Function:
* Atomically extract and hold the physical page
* with the given pmap and virtual address pair
* if that mapping permits the given protection.
*/
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
pt_entry_t *pte, tpte;
vm_offset_t off;
vm_paddr_t pa;
vm_page_t m;
int lvl;
pa = 0;
m = NULL;
PMAP_LOCK(pmap);
retry:
pte = pmap_pte(pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
KASSERT(lvl > 0 && lvl <= 3,
("pmap_extract_and_hold: Invalid level %d", lvl));
CTASSERT(L1_BLOCK == L2_BLOCK);
KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
(lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
tpte & ATTR_DESCR_MASK));
if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
((prot & VM_PROT_WRITE) == 0)) {
switch(lvl) {
case 1:
off = va & L1_OFFSET;
break;
case 2:
off = va & L2_OFFSET;
break;
case 3:
default:
off = 0;
}
if (vm_page_pa_tryrelock(pmap,
(tpte & ~ATTR_MASK) | off, &pa))
goto retry;
m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
vm_page_hold(m);
}
}
PA_UNLOCK_COND(pa);
PMAP_UNLOCK(pmap);
return (m);
}
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
pt_entry_t *pte, tpte;
vm_paddr_t pa;
int lvl;
if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
pa = DMAP_TO_PHYS(va);
} else {
pa = 0;
pte = pmap_pte(kernel_pmap, va, &lvl);
if (pte != NULL) {
tpte = pmap_load(pte);
pa = tpte & ~ATTR_MASK;
switch(lvl) {
case 1:
KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_kextract: Invalid L1 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L1_OFFSET);
break;
case 2:
KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_kextract: Invalid L2 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L2_OFFSET);
break;
case 3:
KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
("pmap_kextract: Invalid L3 pte found: %lx",
tpte & ATTR_DESCR_MASK));
pa |= (va & L3_OFFSET);
break;
}
}
}
return (pa);
}
/***************************************************
* Low level mapping routines.....
***************************************************/
static void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
pd_entry_t *pde;
pt_entry_t *pte, attr;
vm_offset_t va;
int lvl;
KASSERT((pa & L3_OFFSET) == 0,
("pmap_kenter: Invalid physical address"));
KASSERT((sva & L3_OFFSET) == 0,
("pmap_kenter: Invalid virtual address"));
KASSERT((size & PAGE_MASK) == 0,
("pmap_kenter: Mapping is not page-sized"));
attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
if (mode == DEVICE_MEMORY)
attr |= ATTR_XN;
va = sva;
while (size != 0) {
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_kenter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
pte = pmap_l2_to_l3(pde, va);
pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
va += PAGE_SIZE;
pa += PAGE_SIZE;
size -= PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
pmap_kenter(sva, size, pa, DEVICE_MEMORY);
}
/*
* Remove a page from the kernel pagetables.
*/
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
pt_entry_t *pte;
int lvl;
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
pmap_load_clear(pte);
pmap_invalidate_page(kernel_pmap, va);
}
void
pmap_kremove_device(vm_offset_t sva, vm_size_t size)
{
pt_entry_t *pte;
vm_offset_t va;
int lvl;
KASSERT((sva & L3_OFFSET) == 0,
("pmap_kremove_device: Invalid virtual address"));
KASSERT((size & PAGE_MASK) == 0,
("pmap_kremove_device: Mapping is not page-sized"));
va = sva;
while (size != 0) {
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
KASSERT(lvl == 3,
("Invalid device pagetable level: %d != 3", lvl));
pmap_load_clear(pte);
va += PAGE_SIZE;
size -= PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/*
* Used to map a range of physical addresses into kernel
* virtual address space.
*
* The value passed in '*virt' is a suggested virtual address for
* the mapping. Architectures which can support a direct-mapped
* physical to virtual region can return the appropriate address
* within that region, leaving '*virt' unchanged. Other
* architectures should map the pages starting at '*virt' and
* update '*virt' with the first usable address after the mapped
* region.
*/
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
return PHYS_TO_DMAP(start);
}
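/*
 * Added commentary (not in the original source): all physical memory is
 * covered by the direct map on this architecture, so the implementation
 * simply returns the DMAP address for 'start' and leaves '*virt' untouched,
 * following the contract described above.
 */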
/*
* Add a list of wired pages to the kva
* this routine is only used for temporary
* kernel mappings that do not need to have
* page modification or references recorded.
* Note that old mappings are simply written
* over. The page *must* be wired.
* Note: SMP coherent. Uses a ranged shootdown IPI.
*/
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
pd_entry_t *pde;
pt_entry_t *pte, pa;
vm_offset_t va;
vm_page_t m;
int i, lvl;
va = sva;
for (i = 0; i < count; i++) {
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_qenter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2,
("pmap_qenter: Invalid level %d", lvl));
m = ma[i];
pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
if (m->md.pv_memattr == DEVICE_MEMORY)
pa |= ATTR_XN;
pte = pmap_l2_to_l3(pde, va);
pmap_load_store(pte, pa);
va += L3_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/*
* This routine tears out page mappings from the
* kernel -- it is meant only for temporary mappings.
*/
void
pmap_qremove(vm_offset_t sva, int count)
{
pt_entry_t *pte;
vm_offset_t va;
int lvl;
KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
va = sva;
while (count-- > 0) {
pte = pmap_pte(kernel_pmap, va, &lvl);
KASSERT(lvl == 3,
("Invalid device pagetable level: %d != 3", lvl));
if (pte != NULL) {
pmap_load_clear(pte);
}
va += PAGE_SIZE;
}
pmap_invalidate_range(kernel_pmap, sva, va);
}
/***************************************************
* Page table page management routines.....
***************************************************/
static __inline void
pmap_free_zero_pages(struct spglist *free)
{
vm_page_t m;
while ((m = SLIST_FIRST(free)) != NULL) {
SLIST_REMOVE_HEAD(free, plinks.s.ss);
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
}
/*
* Schedule the specified unused page table page to be freed. Specifically,
* add the page to the specified list of pages that will be released to the
* physical memory manager after the TLB has been updated.
*/
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
boolean_t set_PG_ZERO)
{
if (set_PG_ZERO)
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}
/*
* Decrements a page table page's wire count, which is used to record the
* number of valid page table entries within the page. If the wire count
* drops to zero, then the page table page is unmapped. Returns TRUE if the
* page table page was unmapped and FALSE otherwise.
*/
static inline boolean_t
pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
--m->wire_count;
if (m->wire_count == 0) {
_pmap_unwire_l3(pmap, va, m, free);
return (TRUE);
} else
return (FALSE);
}
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* unmap the page table page
*/
if (m->pindex >= (NUL2E + NUL1E)) {
/* l1 page */
pd_entry_t *l0;
l0 = pmap_l0(pmap, va);
pmap_load_clear(l0);
} else if (m->pindex >= NUL2E) {
/* l2 page */
pd_entry_t *l1;
l1 = pmap_l1(pmap, va);
pmap_load_clear(l1);
} else {
/* l3 page */
pd_entry_t *l2;
l2 = pmap_l2(pmap, va);
pmap_load_clear(l2);
}
pmap_resident_count_dec(pmap, 1);
if (m->pindex < NUL2E) {
/* We just released an l3, unhold the matching l2 */
pd_entry_t *l1, tl1;
vm_page_t l2pg;
l1 = pmap_l1(pmap, va);
tl1 = pmap_load(l1);
l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
pmap_unwire_l3(pmap, va, l2pg, free);
} else if (m->pindex < (NUL2E + NUL1E)) {
/* We just released an l2, unhold the matching l1 */
pd_entry_t *l0, tl0;
vm_page_t l1pg;
l0 = pmap_l0(pmap, va);
tl0 = pmap_load(l0);
l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
pmap_unwire_l3(pmap, va, l1pg, free);
}
pmap_invalidate_page(pmap, va);
/*
* This is a release store so that the ordinary store unmapping
* the page table page is globally performed before TLB shoot-
* down is begun.
*/
atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
/*
* Put page on a list so that it is released after
* *ALL* TLB shootdown is done
*/
pmap_add_delayed_free_list(m, free, TRUE);
}
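/*
 * Added commentary (not in the original source): page table pages are
 * distinguished by their pindex.  Indices in [0, NUL2E) are L3 tables,
 * indices in [NUL2E, NUL2E + NUL1E) are L2 tables, and anything above that
 * is an L1 table.  That is why _pmap_unwire_l3() above and _pmap_alloc_l3()
 * below compare the pindex against NUL2E and NUL2E + NUL1E to decide which
 * level of the parent table to update.
 */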
/*
* After removing a page table entry, this routine is used to
* conditionally free the page, and manage the hold/wire counts.
*/
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
struct spglist *free)
{
vm_page_t mpte;
if (va >= VM_MAXUSER_ADDRESS)
return (0);
KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
return (pmap_unwire_l3(pmap, va, mpte, free));
}
void
pmap_pinit0(pmap_t pmap)
{
PMAP_LOCK_INIT(pmap);
bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
pmap->pm_l0 = kernel_pmap->pm_l0;
pmap->pm_root.rt_root = 0;
}
int
pmap_pinit(pmap_t pmap)
{
vm_paddr_t l0phys;
vm_page_t l0pt;
/*
* allocate the l0 page
*/
while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
VM_WAIT;
l0phys = VM_PAGE_TO_PHYS(l0pt);
pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
if ((l0pt->flags & PG_ZERO) == 0)
pagezero(pmap->pm_l0);
pmap->pm_root.rt_root = 0;
bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
return (1);
}
/*
* This routine is called if the desired page table page does not exist.
*
* If page table page allocation fails, this routine may sleep before
* returning NULL. It sleeps only if a lock pointer was given.
*
* Note: If a page allocation fails at page table level two or three,
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
*/
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
vm_page_t m, l1pg, l2pg;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* Allocate a page table page.
*/
if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
if (lockp != NULL) {
RELEASE_PV_LIST_LOCK(lockp);
PMAP_UNLOCK(pmap);
VM_WAIT;
PMAP_LOCK(pmap);
}
/*
* Indicate the need to retry. While waiting, the page table
* page may have been allocated.
*/
return (NULL);
}
if ((m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
/*
* Map the pagetable page into the process address space, if
* it isn't already there.
*/
if (ptepindex >= (NUL2E + NUL1E)) {
pd_entry_t *l0;
vm_pindex_t l0index;
l0index = ptepindex - (NUL2E + NUL1E);
l0 = &pmap->pm_l0[l0index];
pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
} else if (ptepindex >= NUL2E) {
vm_pindex_t l0index, l1index;
pd_entry_t *l0, *l1;
pd_entry_t tl0;
l1index = ptepindex - NUL2E;
l0index = l1index >> L0_ENTRIES_SHIFT;
l0 = &pmap->pm_l0[l0index];
tl0 = pmap_load(l0);
if (tl0 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
lockp) == NULL) {
--m->wire_count;
/* XXX: release mem barrier? */
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
} else {
l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
l1pg->wire_count++;
}
l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
l1 = &l1[ptepindex & Ln_ADDR_MASK];
pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
} else {
vm_pindex_t l0index, l1index;
pd_entry_t *l0, *l1, *l2;
pd_entry_t tl0, tl1;
l1index = ptepindex >> Ln_ENTRIES_SHIFT;
l0index = l1index >> L0_ENTRIES_SHIFT;
l0 = &pmap->pm_l0[l0index];
tl0 = pmap_load(l0);
if (tl0 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + l1index,
lockp) == NULL) {
--m->wire_count;
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
tl0 = pmap_load(l0);
l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
l1 = &l1[l1index & Ln_ADDR_MASK];
} else {
l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
l1 = &l1[l1index & Ln_ADDR_MASK];
tl1 = pmap_load(l1);
if (tl1 == 0) {
/* recurse for allocating page dir */
if (_pmap_alloc_l3(pmap, NUL2E + l1index,
lockp) == NULL) {
--m->wire_count;
/* XXX: release mem barrier? */
atomic_subtract_int(
&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
return (NULL);
}
} else {
l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
l2pg->wire_count++;
}
}
l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
l2 = &l2[ptepindex & Ln_ADDR_MASK];
pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
}
pmap_resident_count_inc(pmap, 1);
return (m);
}
static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t ptepindex;
pd_entry_t *pde, tpde;
#ifdef INVARIANTS
pt_entry_t *pte;
#endif
vm_page_t m;
int lvl;
/*
* Calculate pagetable page index
*/
ptepindex = pmap_l2_pindex(va);
retry:
/*
* Get the page directory entry
*/
pde = pmap_pde(pmap, va, &lvl);
/*
* If the page table page is mapped, we just increment the hold count,
* and activate it. If we get a level 2 pde it will point to a level 3
* table.
*/
switch (lvl) {
case -1:
break;
case 0:
#ifdef INVARIANTS
pte = pmap_l0_to_l1(pde, va);
KASSERT(pmap_load(pte) == 0,
("pmap_alloc_l3: TODO: l0 superpages"));
#endif
break;
case 1:
#ifdef INVARIANTS
pte = pmap_l1_to_l2(pde, va);
KASSERT(pmap_load(pte) == 0,
("pmap_alloc_l3: TODO: l1 superpages"));
#endif
break;
case 2:
tpde = pmap_load(pde);
if (tpde != 0) {
m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
m->wire_count++;
return (m);
}
break;
default:
panic("pmap_alloc_l3: Invalid level %d", lvl);
}
/*
* Here if the pte page isn't mapped, or if it has been deallocated.
*/
m = _pmap_alloc_l3(pmap, ptepindex, lockp);
if (m == NULL && lockp != NULL)
goto retry;
return (m);
}
/***************************************************
* Pmap allocation/deallocation routines.
***************************************************/
/*
* Release any resources held by the given physical map.
* Called when a pmap initialized by pmap_pinit is being released.
* Should only be called if the map contains no valid mappings.
*/
void
pmap_release(pmap_t pmap)
{
vm_page_t m;
KASSERT(pmap->pm_stats.resident_count == 0,
("pmap_release: pmap resident count %ld != 0",
pmap->pm_stats.resident_count));
KASSERT(vm_radix_is_empty(&pmap->pm_root),
("pmap_release: pmap has reserved page table page(s)"));
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
m->wire_count--;
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
}
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
0, 0, kvm_size, "LU", "Size of KVM");
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
0, 0, kvm_free, "LU", "Amount of KVM free");
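/*
 * Illustrative usage note (added commentary, not part of the original
 * source):
 *
 *   sysctl vm.kvm_size vm.kvm_free
 *
 * reports the size of the kernel virtual address range and how much of it
 * pmap_growkernel() has not yet grown into.
 */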
/*
* grow the number of kernel page table entries, if needed
*/
void
pmap_growkernel(vm_offset_t addr)
{
vm_paddr_t paddr;
vm_page_t nkpg;
pd_entry_t *l0, *l1, *l2;
mtx_assert(&kernel_map->system_mtx, MA_OWNED);
addr = roundup2(addr, L2_SIZE);
if (addr - 1 >= kernel_map->max_offset)
addr = kernel_map->max_offset;
while (kernel_vm_end < addr) {
l0 = pmap_l0(kernel_pmap, kernel_vm_end);
KASSERT(pmap_load(l0) != 0,
("pmap_growkernel: No level 0 kernel entry"));
l1 = pmap_l0_to_l1(l0, kernel_vm_end);
if (pmap_load(l1) == 0) {
/* We need a new PDP entry */
nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO);
if (nkpg == NULL)
panic("pmap_growkernel: no memory to grow kernel");
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
pmap_load_store(l1, paddr | L1_TABLE);
continue; /* try again */
}
l2 = pmap_l1_to_l2(l1, kernel_vm_end);
if ((pmap_load(l2) & ATTR_AF) != 0) {
kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
break;
}
continue;
}
nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (nkpg == NULL)
panic("pmap_growkernel: no memory to grow kernel");
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
pmap_load_store(l2, paddr | L2_TABLE);
pmap_invalidate_page(kernel_pmap, kernel_vm_end);
kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
break;
}
}
}
/***************************************************
* page management routines.
***************************************************/
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{
return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
#define PC_FREE0 0xfffffffffffffffful
#define PC_FREE1 0xfffffffffffffffful
#define PC_FREE2 0x000000fffffffffful
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
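/*
 * Illustrative aside, not part of the original source: a quick stand-alone
 * check that the three freemask words above describe exactly _NPCPV (168)
 * pv entries per chunk: two full 64-bit words plus 40 bits in the last.
 */
#include <stdint.h>
#include <stdio.h>

static int
sk_popcount64(uint64_t x)
{
	int n;

	for (n = 0; x != 0; x >>= 1)
		n += (int)(x & 1);
	return (n);
}

int
main(void)
{
	const uint64_t freemask[3] = {
		0xfffffffffffffffful,	/* PC_FREE0 */
		0xfffffffffffffffful,	/* PC_FREE1 */
		0x000000fffffffffful,	/* PC_FREE2 */
	};
	int i, total;

	for (i = 0, total = 0; i < 3; i++)
		total += sk_popcount64(freemask[i]);
	printf("pv entries per chunk: %d\n", total);	/* prints 168 */
	return (0);
}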
#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
"Number of times tried to get a chunk page but failed.");
static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
"Current number of spare pv entries");
#endif
#endif /* 0 */
/*
* We are in a serious low memory condition. Resort to
* drastic measures to free some pages so we can allocate
* another pv entry chunk.
*
* Returns NULL if PV entries were reclaimed from the specified pmap.
*
* We do not, however, unmap 2mpages because subsequent accesses will
* allocate per-page pv entries until repromotion occurs, thereby
* exacerbating the shortage of free pv entries.
*/
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
struct pch new_tail;
struct pv_chunk *pc;
struct md_page *pvh;
pd_entry_t *pde;
pmap_t pmap;
pt_entry_t *pte, tpte;
pv_entry_t pv;
vm_offset_t va;
vm_page_t m, m_pc;
struct spglist free;
uint64_t inuse;
int bit, field, freed, lvl;
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
pmap = NULL;
m_pc = NULL;
SLIST_INIT(&free);
TAILQ_INIT(&new_tail);
mtx_lock(&pv_chunks_mutex);
while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != pc->pc_pmap) {
if (pmap != NULL && pmap != locked_pmap)
PMAP_UNLOCK(pmap);
pmap = pc->pc_pmap;
/* Avoid deadlock and lock recursion. */
if (pmap > locked_pmap) {
RELEASE_PV_LIST_LOCK(lockp);
PMAP_LOCK(pmap);
} else if (pmap != locked_pmap &&
!PMAP_TRYLOCK(pmap)) {
pmap = NULL;
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
continue;
}
}
/*
* Destroy every non-wired, 4 KB page mapping in the chunk.
*/
freed = 0;
for (field = 0; field < _NPCM; field++) {
for (inuse = ~pc->pc_map[field] & pc_freemask[field];
inuse != 0; inuse &= ~(1UL << bit)) {
bit = ffsl(inuse) - 1;
pv = &pc->pc_pventry[field * 64 + bit];
va = pv->pv_va;
pde = pmap_pde(pmap, va, &lvl);
if (lvl != 2)
continue;
pte = pmap_l2_to_l3(pde, va);
tpte = pmap_load(pte);
if ((tpte & ATTR_SW_WIRED) != 0)
continue;
tpte = pmap_load_clear(pte);
pmap_invalidate_page(pmap, va);
m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
if ((tpte & ATTR_AF) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list)) {
vm_page_aflag_clear(m,
PGA_WRITEABLE);
}
}
pc->pc_map[field] |= 1UL << bit;
pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
freed++;
}
}
if (freed == 0) {
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
continue;
}
/* Every freed mapping is for a 4 KB page. */
pmap_resident_count_dec(pmap, freed);
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
pc->pc_map[2] == PC_FREE2) {
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
/* Entire chunk is free; return it. */
m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
dump_drop_page(m_pc->phys_addr);
mtx_lock(&pv_chunks_mutex);
break;
}
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
mtx_lock(&pv_chunks_mutex);
/* One freed pv entry in locked_pmap is sufficient. */
if (pmap == locked_pmap)
break;
}
TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != NULL && pmap != locked_pmap)
PMAP_UNLOCK(pmap);
if (m_pc == NULL && !SLIST_EMPTY(&free)) {
m_pc = SLIST_FIRST(&free);
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&vm_cnt.v_wire_count, 1);
}
pmap_free_zero_pages(&free);
return (m_pc);
}
/*
* free the pv_entry back to the free list
*/
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
struct pv_chunk *pc;
int idx, field, bit;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
PV_STAT(atomic_add_long(&pv_entry_frees, 1));
PV_STAT(atomic_add_int(&pv_entry_spare, 1));
PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
pc = pv_to_chunk(pv);
idx = pv - &pc->pc_pventry[0];
field = idx / 64;
bit = idx % 64;
pc->pc_map[field] |= 1ul << bit;
if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
pc->pc_map[2] != PC_FREE2) {
/* 98% of the time, pc is already at the head of the list. */
if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
}
return;
}
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
free_pv_chunk(pc);
}
static void
free_pv_chunk(struct pv_chunk *pc)
{
vm_page_t m;
mtx_lock(&pv_chunks_mutex);
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
/* entire chunk is free, return it */
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
dump_drop_page(m->phys_addr);
vm_page_unwire(m, PQ_NONE);
vm_page_free(m);
}
/*
* Returns a new PV entry, allocating a new PV chunk from the system when
* needed. If this PV chunk allocation fails and a PV list lock pointer was
* given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
* returned.
*
* The given PV list lock may be released.
*/
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
int bit, field;
pv_entry_t pv;
struct pv_chunk *pc;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
if (pc != NULL) {
for (field = 0; field < _NPCM; field++) {
if (pc->pc_map[field]) {
bit = ffsl(pc->pc_map[field]) - 1;
break;
}
}
if (field < _NPCM) {
pv = &pc->pc_pventry[field * 64 + bit];
pc->pc_map[field] &= ~(1ul << bit);
/* If this was the last item, move it to tail */
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
pc->pc_map[2] == 0) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
pc_list);
}
PV_STAT(atomic_add_long(&pv_entry_count, 1));
PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
return (pv);
}
}
/* No free items, allocate another chunk */
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED);
if (m == NULL) {
if (lockp == NULL) {
PV_STAT(pc_chunk_tryfail++);
return (NULL);
}
m = reclaim_pv_chunk(pmap, lockp);
if (m == NULL)
goto retry;
}
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
dump_add_page(m->phys_addr);
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
pc->pc_pmap = pmap;
pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
mtx_lock(&pv_chunks_mutex);
TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
pv = &pc->pc_pventry[0];
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
PV_STAT(atomic_add_long(&pv_entry_count, 1));
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
return (pv);
}
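/*
 * Illustrative aside, not part of the original source: the bitmap idiom
 * used by free_pv_entry() and get_pv_entry() above, reduced to a single
 * stand-alone 64-bit map in plain C.  A set bit means "entry is free".
 */
#include <stdint.h>

static int
sk_lowest_set_bit(uint64_t x)
{
	int bit;

	for (bit = 0; (x & 1) == 0; bit++)
		x >>= 1;
	return (bit);
}

/* Take the lowest free entry; returns its index, or -1 if the map is full. */
static int
sk_bitmap_alloc(uint64_t *map)
{
	int bit;

	if (*map == 0)
		return (-1);
	bit = sk_lowest_set_bit(*map);
	*map &= ~(1ULL << bit);
	return (bit);
}

/* Give an entry back by setting its bit again. */
static void
sk_bitmap_free(uint64_t *map, int bit)
{
	*map |= 1ULL << bit;
}

int
main(void)
{
	uint64_t map = 0xfffffffffffffffeULL;	/* bit 0 already in use */
	int idx;

	idx = sk_bitmap_alloc(&map);	/* takes bit 1, the lowest free one */
	sk_bitmap_free(&map, idx);
	return (idx == 1 ? 0 : 1);
}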
/*
* Ensure that the number of spare PV entries in the specified pmap meets or
* exceeds the given count, "needed".
*
* The given PV list lock may be released.
*/
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
struct pch new_tail;
struct pv_chunk *pc;
int avail, free;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
/*
* Newly allocated PV chunks must be stored in a private list until
* the required number of PV chunks have been allocated. Otherwise,
* reclaim_pv_chunk() could recycle one of these chunks. In
* contrast, these chunks must be added to the pmap upon allocation.
*/
TAILQ_INIT(&new_tail);
retry:
avail = 0;
TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
bit_count((bitstr_t *)pc->pc_map, 0,
sizeof(pc->pc_map) * NBBY, &free);
if (free == 0)
break;
avail += free;
if (avail >= needed)
break;
}
for (; avail < needed; avail += _NPCPV) {
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED);
if (m == NULL) {
m = reclaim_pv_chunk(pmap, lockp);
if (m == NULL)
goto retry;
}
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
dump_add_page(m->phys_addr);
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
pc->pc_pmap = pmap;
pc->pc_map[0] = PC_FREE0;
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
}
if (!TAILQ_EMPTY(&new_tail)) {
mtx_lock(&pv_chunks_mutex);
TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
mtx_unlock(&pv_chunks_mutex);
}
}
/*
* First find and then remove the pv entry for the specified pmap and virtual
* address from the specified pv list. Returns the pv entry if found and NULL
* otherwise. This operation can be performed on pv lists for either 4KB or
* 2MB page mappings.
*/
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
pv_entry_t pv;
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
break;
}
}
return (pv);
}
/*
* After demotion from a 2MB page mapping to 512 4KB page mappings,
* destroy the pv entry for the 2MB page mapping and reinstantiate the pv
* entries for each of the 4KB page mappings.
*/
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
struct rwlock **lockp)
{
struct md_page *pvh;
struct pv_chunk *pc;
pv_entry_t pv;
vm_offset_t va_last;
vm_page_t m;
int bit, field;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((pa & L2_OFFSET) == 0,
("pmap_pv_demote_l2: pa is not 2mpage aligned"));
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the 2mpage's pv entry for this mapping to the first
* page's pv list. Once this transfer begins, the pv list lock
* must not be released until the last pv entry is reinstantiated.
*/
pvh = pa_to_pvh(pa);
va = va & ~L2_OFFSET;
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
va_last = va + L2_SIZE - PAGE_SIZE;
for (;;) {
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
for (field = 0; field < _NPCM; field++) {
while (pc->pc_map[field]) {
bit = ffsl(pc->pc_map[field]) - 1;
pc->pc_map[field] &= ~(1ul << bit);
pv = &pc->pc_pventry[field * 64 + bit];
va += PAGE_SIZE;
pv->pv_va = va;
m++;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_pv_demote_l2: page %p is not managed", m));
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if (va == va_last)
goto out;
}
}
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
}
out:
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
}
PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}
/*
* First find and then destroy the pv entry for the specified pmap and virtual
* address. This operation can be performed on pv lists for either 4KB or 2MB
* page mappings.
*/
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
pv_entry_t pv;
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
free_pv_entry(pmap, pv);
}
/*
* Conditionally create the PV entry for a 4KB page mapping if the required
* memory can be allocated without resorting to reclamation.
*/
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct rwlock **lockp)
{
pv_entry_t pv;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/* Pass NULL instead of the lock pointer to disable reclamation. */
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
return (TRUE);
} else
return (FALSE);
}
/*
* pmap_remove_l2: do the things to unmap a level 2 superpage in a process
*/
static int
pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l2;
vm_offset_t eva, va;
vm_page_t m, ml3;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
old_l2 = pmap_load_clear(l2);
pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
if (old_l2 & ATTR_SW_WIRED)
pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
if (old_l2 & ATTR_SW_MANAGED) {
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
pmap_pvh_free(pvh, pmap, sva);
eva = sva + L2_SIZE;
for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
va < eva; va += PAGE_SIZE, m++) {
if (pmap_page_dirty(old_l2))
vm_page_dirty(m);
if (old_l2 & ATTR_AF)
vm_page_aflag_set(m, PGA_REFERENCED);
if (TAILQ_EMPTY(&m->md.pv_list) &&
TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
}
KASSERT(pmap != kernel_pmap,
("Attempting to remove an l2 kernel page"));
ml3 = pmap_remove_pt_page(pmap, sva);
if (ml3 != NULL) {
pmap_resident_count_dec(pmap, 1);
KASSERT(ml3->wire_count == NL3PG,
("pmap_remove_l2: l3 page wire count error"));
ml3->wire_count = 0;
pmap_add_delayed_free_list(ml3, free, FALSE);
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
}
return (pmap_unuse_pt(pmap, sva, l1e, free));
}
/*
* pmap_remove_l3: do the things to unmap a page in a process
*/
static int
pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pt_entry_t old_l3;
vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
old_l3 = pmap_load_clear(l3);
pmap_invalidate_page(pmap, va);
if (old_l3 & ATTR_SW_WIRED)
pmap->pm_stats.wired_count -= 1;
pmap_resident_count_dec(pmap, 1);
if (old_l3 & ATTR_SW_MANAGED) {
m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
if (pmap_page_dirty(old_l3))
vm_page_dirty(m);
if (old_l3 & ATTR_AF)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
pmap_pvh_free(&m->md, pmap, va);
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
}
return (pmap_unuse_pt(pmap, va, l2e, free));
}
/*
* Remove the given range of addresses from the specified map.
*
* It is assumed that the start and end are properly
* rounded to the page size.
*/
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
vm_offset_t va, va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t l3_paddr, *l3;
struct spglist free;
/*
* Perform an unsynchronized read. This is, however, safe.
*/
if (pmap->pm_stats.resident_count == 0)
return;
SLIST_INIT(&free);
PMAP_LOCK(pmap);
lock = NULL;
for (; sva < eva; sva = va_next) {
if (pmap->pm_stats.resident_count == 0)
break;
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
/*
* Calculate index for next page table.
*/
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (l2 == NULL)
continue;
l3_paddr = pmap_load(l2);
if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
if (sva + L2_SIZE == va_next && eva >= va_next) {
pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
&free, &lock);
continue;
} else if (pmap_demote_l2_locked(pmap, l2,
sva & ~L2_OFFSET, &lock) == NULL)
continue;
l3_paddr = pmap_load(l2);
}
/*
* Weed out invalid mappings.
*/
if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
continue;
/*
* Limit our scan to either the end of the va represented
* by the current page table page, or to the end of the
* range being removed.
*/
if (va_next > eva)
va_next = eva;
va = va_next;
for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
sva += L3_SIZE) {
if (l3 == NULL)
panic("l3 == NULL");
if (pmap_load(l3) == 0) {
if (va != va_next) {
pmap_invalidate_range(pmap, va, sva);
va = va_next;
}
continue;
}
if (va == va_next)
va = sva;
if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
&lock)) {
sva += L3_SIZE;
break;
}
}
if (va != va_next)
pmap_invalidate_range(pmap, va, sva);
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
pmap_free_zero_pages(&free);
}
/*
* Routine: pmap_remove_all
* Function:
* Removes this physical page from
* all physical maps in which it resides.
* Reflects back modify bits to the pager.
*
* Notes:
* Original versions of this routine were very
* inefficient because they iteratively called
* pmap_remove (slow...)
*/
void
pmap_remove_all(vm_page_t m)
{
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
struct rwlock *lock;
pd_entry_t *pde, tpde;
pt_entry_t *pte, tpte;
vm_offset_t va;
struct spglist free;
int lvl, pvh_gen, md_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_all: page %p is not managed", m));
SLIST_INIT(&free);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry:
rw_wlock(lock);
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
}
va = pv->pv_va;
pte = pmap_pte(pmap, va, &lvl);
KASSERT(pte != NULL,
("pmap_remove_all: no page table entry found"));
KASSERT(lvl == 2,
("pmap_remove_all: invalid pte level %d", lvl));
pmap_demote_l2_locked(pmap, pte, va, &lock);
PMAP_UNLOCK(pmap);
}
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
}
pmap_resident_count_dec(pmap, 1);
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL,
("pmap_remove_all: no page directory entry found"));
KASSERT(lvl == 2,
("pmap_remove_all: invalid pde level %d", lvl));
tpde = pmap_load(pde);
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
pmap_load_clear(pte);
pmap_invalidate_page(pmap, pv->pv_va);
if (tpte & ATTR_SW_WIRED)
pmap->pm_stats.wired_count--;
if ((tpte & ATTR_AF) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
/*
* Update the vm_page_t clean and reference bits.
*/
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
vm_page_aflag_clear(m, PGA_WRITEABLE);
rw_wunlock(lock);
pmap_free_zero_pages(&free);
}
/*
* Set the physical protection on the
* specified range of this map as requested.
*/
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
vm_offset_t va, va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t *l3p, l3, nbits;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
if (prot == VM_PROT_NONE) {
pmap_remove(pmap, sva, eva);
return;
}
if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
(VM_PROT_WRITE | VM_PROT_EXECUTE))
return;
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (pmap_load(l2) == 0)
continue;
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
l3p = pmap_demote_l2(pmap, l2, sva);
if (l3p == NULL)
continue;
}
KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_protect: Invalid L2 entry after demotion"));
if (va_next > eva)
va_next = eva;
va = va_next;
for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
sva += L3_SIZE) {
l3 = pmap_load(l3p);
if (!pmap_l3_valid(l3))
continue;
nbits = 0;
if ((prot & VM_PROT_WRITE) == 0) {
if ((l3 & ATTR_SW_MANAGED) &&
pmap_page_dirty(l3)) {
vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
~ATTR_MASK));
}
nbits |= ATTR_AP(ATTR_AP_RO);
}
if ((prot & VM_PROT_EXECUTE) == 0)
nbits |= ATTR_XN;
pmap_set(l3p, nbits);
/* XXX: Use pmap_invalidate_range */
pmap_invalidate_page(pmap, sva);
}
}
PMAP_UNLOCK(pmap);
}
/*
* Inserts the specified page table page into the specified pmap's collection
* of idle page table pages. Each of a pmap's page table pages is responsible
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
return (vm_radix_insert(&pmap->pm_root, mpte));
}
/*
* Removes the page table page mapping the specified virtual address from the
* specified pmap's collection of idle page table pages, and returns it.
* Otherwise, returns NULL if there is no page table page corresponding to the
* specified virtual address.
*/
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}
/*
* Performs a break-before-make update of a pmap entry. This is needed when
* either promoting or demoting pages to ensure the TLB doesn't get into an
* inconsistent state.
*/
static void
pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
vm_offset_t va, vm_size_t size)
{
register_t intr;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* Ensure we don't get switched out with the page table in an
* inconsistent state. We also need to ensure no interrupts fire
* as they may make use of an address we are about to invalidate.
*/
intr = intr_disable();
critical_enter();
/* Clear the old mapping */
pmap_load_clear(pte);
pmap_invalidate_range(pmap, va, va + size);
/* Create the new mapping */
pmap_load_store(pte, newpte);
critical_exit();
intr_restore(intr);
}
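/*
 * Added commentary (not in the original source): the clear-invalidate-store
 * order above is the ARMv8 "break-before-make" requirement.  Replacing a
 * live translation table entry with one of a different size or with
 * different attributes, without an intervening invalidation, can lead to
 * TLB conflict aborts or the use of stale attributes, so the old entry is
 * torn down and flushed before the new one is written.
 */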
#if VM_NRESERVLEVEL > 0
/*
* After promotion from 512 4KB page mappings to a single 2MB page mapping,
* replace the many pv entries for the 4KB page mappings by a single pv entry
* for the 2MB page mapping.
*/
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
struct rwlock **lockp)
{
struct md_page *pvh;
pv_entry_t pv;
vm_offset_t va_last;
vm_page_t m;
KASSERT((pa & L2_OFFSET) == 0,
("pmap_pv_promote_l2: pa is not 2mpage aligned"));
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the first page's pv entry for this mapping to the 2mpage's
* pv list. Aside from avoiding the cost of a call to get_pv_entry(),
* a transfer avoids the possibility that get_pv_entry() calls
* reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
* mappings that is being promoted.
*/
m = PHYS_TO_VM_PAGE(pa);
va = va & ~L2_OFFSET;
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
pvh = pa_to_pvh(pa);
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + L2_SIZE - PAGE_SIZE;
do {
m++;
va += PAGE_SIZE;
pmap_pvh_free(&m->md, pmap, va);
} while (va < va_last);
}
/*
* Tries to promote the 512, contiguous 4KB page mappings that are within a
* single level 2 table entry to a single 2MB page mapping. For promotion
* to occur, two conditions must be met: (1) the 4KB page mappings must map
* aligned, contiguous physical memory and (2) the 4KB page mappings must have
* identical characteristics.
*/
static void
pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
struct rwlock **lockp)
{
pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
vm_page_t mpte;
vm_offset_t sva;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
sva = va & ~L2_OFFSET;
firstl3 = pmap_l2_to_l3(l2, sva);
newl2 = pmap_load(firstl3);
/* Check that the alignment is valid */
if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
return;
}
pa = newl2 + L2_SIZE - PAGE_SIZE;
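/*
 * Added commentary (not in the original source): walk the remaining
 * NL3PG - 1 entries backwards, requiring each to equal the expected
 * descriptor exactly: the same attributes as the first entry and physical
 * addresses decreasing by PAGE_SIZE, i.e. the 4KB mappings are contiguous
 * and share identical characteristics.
 */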
for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
oldl3 = pmap_load(l3);
if (oldl3 != pa) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
return;
}
pa -= PAGE_SIZE;
}
/*
* Save the page table page in its current state until the L2
* mapping the superpage is demoted by pmap_demote_l2() or
* destroyed by pmap_remove_l3().
*/
mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
KASSERT(mpte >= vm_page_array &&
mpte < &vm_page_array[vm_page_array_size],
("pmap_promote_l2: page table page is out of range"));
KASSERT(mpte->pindex == pmap_l2_pindex(va),
("pmap_promote_l2: page table page's pindex is wrong"));
if (pmap_insert_pt_page(pmap, mpte)) {
atomic_add_long(&pmap_l2_p_failures, 1);
CTR2(KTR_PMAP,
"pmap_promote_l2: failure for va %#lx in pmap %p", va,
pmap);
return;
}
if ((newl2 & ATTR_SW_MANAGED) != 0)
pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
newl2 &= ~ATTR_DESCR_MASK;
newl2 |= L2_BLOCK;
pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
atomic_add_long(&pmap_l2_promotions, 1);
CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
pmap);
}
#endif /* VM_NRESERVLEVEL > 0 */
/*
* Insert the given physical page (p) at
* the specified virtual address (v) in the
* target physical map with the protection requested.
*
* If specified, the page will be wired down, meaning
* that the related pte can not be reclaimed.
*
* NB: This is the only routine which MAY NOT lazy-evaluate
* or lose information. That is, this routine must actually
* insert this page into the given map NOW.
*/
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
u_int flags, int8_t psind __unused)
{
struct rwlock *lock;
pd_entry_t *pde;
pt_entry_t new_l3, orig_l3;
pt_entry_t *l2, *l3;
pv_entry_t pv;
vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
vm_page_t mpte, om, l1_m, l2_m, l3_m;
boolean_t nosleep;
int lvl;
va = trunc_page(va);
if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
VM_OBJECT_ASSERT_LOCKED(m->object);
pa = VM_PAGE_TO_PHYS(m);
new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
L3_PAGE);
if ((prot & VM_PROT_WRITE) == 0)
new_l3 |= ATTR_AP(ATTR_AP_RO);
if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
new_l3 |= ATTR_XN;
if ((flags & PMAP_ENTER_WIRED) != 0)
new_l3 |= ATTR_SW_WIRED;
if (va < VM_MAXUSER_ADDRESS)
new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
mpte = NULL;
lock = NULL;
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, va, &lvl);
if (pde != NULL && lvl == 1) {
l2 = pmap_l1_to_l2(pde, va);
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
(l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
&lock)) != NULL) {
l3 = &l3[pmap_l3_index(va)];
if (va < VM_MAXUSER_ADDRESS) {
mpte = PHYS_TO_VM_PAGE(
pmap_load(l2) & ~ATTR_MASK);
mpte->wire_count++;
}
goto havel3;
}
}
if (va < VM_MAXUSER_ADDRESS) {
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
if (mpte == NULL && nosleep) {
CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
return (KERN_RESOURCE_SHORTAGE);
}
pde = pmap_pde(pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_enter: Invalid page entry, va: 0x%lx", va));
KASSERT(lvl == 2,
("pmap_enter: Invalid level %d", lvl));
l3 = pmap_l2_to_l3(pde, va);
} else {
/*
* If we get a level 2 pde it must point to a level 3 entry
* otherwise we will need to create the intermediate tables
*/
if (lvl < 2) {
switch(lvl) {
default:
case -1:
/* Get the l0 pde to update */
pde = pmap_l0(pmap, va);
KASSERT(pde != NULL, ("..."));
l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l1_m == NULL)
panic("pmap_enter: l1 pte_m == NULL");
if ((l1_m->flags & PG_ZERO) == 0)
pmap_zero_page(l1_m);
l1_pa = VM_PAGE_TO_PHYS(l1_m);
pmap_load_store(pde, l1_pa | L0_TABLE);
/* FALLTHROUGH */
case 0:
/* Get the l1 pde to update */
pde = pmap_l1_to_l2(pde, va);
KASSERT(pde != NULL, ("..."));
l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l2_m == NULL)
panic("pmap_enter: l2 pte_m == NULL");
if ((l2_m->flags & PG_ZERO) == 0)
pmap_zero_page(l2_m);
l2_pa = VM_PAGE_TO_PHYS(l2_m);
pmap_load_store(pde, l2_pa | L1_TABLE);
/* FALLTHROUGH */
case 1:
/* Get the l2 pde to update */
pde = pmap_l1_to_l2(pde, va);
l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_ZERO);
if (l3_m == NULL)
panic("pmap_enter: l3 pte_m == NULL");
if ((l3_m->flags & PG_ZERO) == 0)
pmap_zero_page(l3_m);
l3_pa = VM_PAGE_TO_PHYS(l3_m);
pmap_load_store(pde, l3_pa | L2_TABLE);
break;
}
}
l3 = pmap_l2_to_l3(pde, va);
pmap_invalidate_page(pmap, va);
}
havel3:
om = NULL;
orig_l3 = pmap_load(l3);
opa = orig_l3 & ~ATTR_MASK;
/*
* Is the specified virtual address already mapped?
*/
if (pmap_l3_valid(orig_l3)) {
/*
* Wiring change, just update stats. We don't worry about
* wiring PT pages as they remain resident as long as there
* are valid mappings in them. Hence, if a user page is wired,
* the PT page will be also.
*/
if ((flags & PMAP_ENTER_WIRED) != 0 &&
(orig_l3 & ATTR_SW_WIRED) == 0)
pmap->pm_stats.wired_count++;
else if ((flags & PMAP_ENTER_WIRED) == 0 &&
(orig_l3 & ATTR_SW_WIRED) != 0)
pmap->pm_stats.wired_count--;
/*
* Remove the extra PT page reference.
*/
if (mpte != NULL) {
mpte->wire_count--;
KASSERT(mpte->wire_count > 0,
("pmap_enter: missing reference to page table page,"
" va: 0x%lx", va));
}
/*
* Has the physical page changed?
*/
if (opa == pa) {
/*
* No, might be a protection or wiring change.
*/
if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
new_l3 |= ATTR_SW_MANAGED;
if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
ATTR_AP(ATTR_AP_RW)) {
vm_page_aflag_set(m, PGA_WRITEABLE);
}
}
goto validate;
}
} else {
/*
* Increment the counters.
*/
if ((new_l3 & ATTR_SW_WIRED) != 0)
pmap->pm_stats.wired_count++;
pmap_resident_count_inc(pmap, 1);
}
/*
* Enter on the PV list if part of our managed memory.
*/
if ((m->oflags & VPO_UNMANAGED) == 0) {
new_l3 |= ATTR_SW_MANAGED;
pv = get_pv_entry(pmap, &lock);
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
vm_page_aflag_set(m, PGA_WRITEABLE);
}
/*
* Update the L3 entry.
*/
if (orig_l3 != 0) {
validate:
orig_l3 = pmap_load(l3);
opa = orig_l3 & ~ATTR_MASK;
if (opa != pa) {
pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE);
if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
om = PHYS_TO_VM_PAGE(opa);
if (pmap_page_dirty(orig_l3))
vm_page_dirty(om);
if ((orig_l3 & ATTR_AF) != 0)
vm_page_aflag_set(om, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
pmap_pvh_free(&om->md, pmap, va);
if ((om->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&om->md.pv_list) &&
((om->flags & PG_FICTITIOUS) != 0 ||
TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
vm_page_aflag_clear(om, PGA_WRITEABLE);
}
} else {
pmap_load_store(l3, new_l3);
pmap_invalidate_page(pmap, va);
if (pmap_page_dirty(orig_l3) &&
(orig_l3 & ATTR_SW_MANAGED) != 0)
vm_page_dirty(m);
}
} else {
pmap_load_store(l3, new_l3);
}
pmap_invalidate_page(pmap, va);
if (pmap != pmap_kernel()) {
if (pmap == &curproc->p_vmspace->vm_pmap &&
(prot & VM_PROT_EXECUTE) != 0)
cpu_icache_sync_range(va, PAGE_SIZE);
#if VM_NRESERVLEVEL > 0
if ((mpte == NULL || mpte->wire_count == NL3PG) &&
pmap_superpages_enabled() &&
(m->flags & PG_FICTITIOUS) == 0 &&
vm_reserv_level_iffullpop(m) == 0) {
pmap_promote_l2(pmap, pde, va, &lock);
}
#endif
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
return (KERN_SUCCESS);
}
/*
* Maps a sequence of resident pages belonging to the same object.
* The sequence begins with the given page m_start. This page is
* mapped at the given virtual address start. Each subsequent page is
* mapped at a virtual address that is offset from start by the same
* amount as the page is offset from m_start within the object. The
* last page in the sequence is the page with the largest offset from
* m_start that can be mapped at a virtual address less than the given
* virtual address end. Not every virtual page between start and end
* is mapped; only those for which a resident page exists with the
* corresponding offset from m_start are mapped.
*/
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
vm_page_t m_start, vm_prot_t prot)
{
struct rwlock *lock;
vm_offset_t va;
vm_page_t m, mpte;
vm_pindex_t diff, psize;
VM_OBJECT_ASSERT_LOCKED(m_start->object);
psize = atop(end - start);
mpte = NULL;
m = m_start;
lock = NULL;
PMAP_LOCK(pmap);
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
va = start + ptoa(diff);
mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
m = TAILQ_NEXT(m, listq);
}
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
}
/*
* this code makes some *MAJOR* assumptions:
* 1. Current pmap & pmap exists.
* 2. Not wired.
* 3. Read access.
* 4. No page table pages.
* but is *MUCH* faster than pmap_enter...
*/
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
struct rwlock *lock;
lock = NULL;
PMAP_LOCK(pmap);
(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
}
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
struct spglist free;
pd_entry_t *pde;
pt_entry_t *l2, *l3;
vm_paddr_t pa;
int lvl;
KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
(m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
/*
* In the case that a page table page is not
* resident, we are creating it here.
*/
if (va < VM_MAXUSER_ADDRESS) {
vm_pindex_t l2pindex;
/*
* Calculate pagetable page index
*/
l2pindex = pmap_l2_pindex(va);
if (mpte && (mpte->pindex == l2pindex)) {
mpte->wire_count++;
} else {
/*
* Get the l2 entry
*/
pde = pmap_pde(pmap, va, &lvl);
/*
* If the page table page is mapped, we just increment
* the hold count, and activate it. Otherwise, we
* attempt to allocate a page table page. If this
* attempt fails, we don't retry. Instead, we give up.
*/
if (lvl == 1) {
l2 = pmap_l1_to_l2(pde, va);
if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
L2_BLOCK)
return (NULL);
}
if (lvl == 2 && pmap_load(pde) != 0) {
mpte =
PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
mpte->wire_count++;
} else {
/*
* Pass NULL instead of the PV list lock
* pointer, because we don't intend to sleep.
*/
mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
if (mpte == NULL)
return (mpte);
}
}
l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
l3 = &l3[pmap_l3_index(va)];
} else {
mpte = NULL;
pde = pmap_pde(kernel_pmap, va, &lvl);
KASSERT(pde != NULL,
("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
va));
KASSERT(lvl == 2,
("pmap_enter_quick_locked: Invalid level %d", lvl));
l3 = pmap_l2_to_l3(pde, va);
}
if (pmap_load(l3) != 0) {
if (mpte != NULL) {
mpte->wire_count--;
mpte = NULL;
}
return (mpte);
}
/*
* Enter on the PV list if part of our managed memory.
*/
if ((m->oflags & VPO_UNMANAGED) == 0 &&
!pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
if (mpte != NULL) {
SLIST_INIT(&free);
if (pmap_unwire_l3(pmap, va, mpte, &free)) {
pmap_invalidate_page(pmap, va);
pmap_free_zero_pages(&free);
}
mpte = NULL;
}
return (mpte);
}
/*
* Increment counters
*/
pmap_resident_count_inc(pmap, 1);
pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
ATTR_AP(ATTR_AP_RO) | L3_PAGE;
if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
pa |= ATTR_XN;
else if (va < VM_MAXUSER_ADDRESS)
pa |= ATTR_PXN;
/*
* Now validate mapping with RO protection
*/
if ((m->oflags & VPO_UNMANAGED) == 0)
pa |= ATTR_SW_MANAGED;
pmap_load_store(l3, pa);
pmap_invalidate_page(pmap, va);
return (mpte);
}
/*
* This code maps large physical mmap regions into the
* processor address space. Note that some shortcuts
* are taken, but the code works.
*/
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
vm_pindex_t pindex, vm_size_t size)
{
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
("pmap_object_init_pt: non-device object"));
}
/*
* Clear the wired attribute from the mappings for the specified range of
* addresses in the given pmap. Every valid mapping within that range
* must have the wired attribute set. In contrast, invalid mappings
* cannot have the wired attribute set, so they are ignored.
*
* The wired attribute of the page table entry is not a hardware feature,
* so there is no need to invalidate any TLB entries.
*/
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
vm_offset_t va_next;
pd_entry_t *l0, *l1, *l2;
pt_entry_t *l3;
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
l0 = pmap_l0(pmap, sva);
if (pmap_load(l0) == 0) {
va_next = (sva + L0_SIZE) & ~L0_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
l1 = pmap_l0_to_l1(l0, sva);
if (pmap_load(l1) == 0) {
va_next = (sva + L1_SIZE) & ~L1_OFFSET;
if (va_next < sva)
va_next = eva;
continue;
}
va_next = (sva + L2_SIZE) & ~L2_OFFSET;
if (va_next < sva)
va_next = eva;
l2 = pmap_l1_to_l2(l1, sva);
if (pmap_load(l2) == 0)
continue;
if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
l3 = pmap_demote_l2(pmap, l2, sva);
if (l3 == NULL)
continue;
}
KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_unwire: Invalid l2 entry after demotion"));
if (va_next > eva)
va_next = eva;
for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
sva += L3_SIZE) {
if (pmap_load(l3) == 0)
continue;
if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
panic("pmap_unwire: l3 %#jx is missing "
"ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
/*
* PG_W must be cleared atomically. Although the pmap
* lock synchronizes access to PG_W, another processor
* could be setting PG_M and/or PG_A concurrently.
*/
atomic_clear_long(l3, ATTR_SW_WIRED);
pmap->pm_stats.wired_count--;
}
}
PMAP_UNLOCK(pmap);
}
/*
* Copy the range specified by src_addr/len
* from the source map to the range dst_addr/len
* in the destination map.
*
* This routine is only advisory and need not do anything.
*/
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t src_addr)
{
}
/*
* pmap_zero_page zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*/
void
pmap_zero_page(vm_page_t m)
{
vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
pagezero((void *)va);
}
/*
* pmap_zero_page_area zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*
* off and size may not cover an area beyond a single hardware page.
*/
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
if (off == 0 && size == PAGE_SIZE)
pagezero((void *)va);
else
bzero((char *)va + off, size);
}
/*
* pmap_copy_page copies the specified (machine independent)
* page by mapping the page into virtual memory and using
* bcopy to copy the page, one machine dependent page at a
* time.
*/
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
pagecopy((void *)src, (void *)dst);
}
int unmapped_buf_allowed = 1;
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
{
void *a_cp, *b_cp;
vm_page_t m_a, m_b;
vm_paddr_t p_a, p_b;
vm_offset_t a_pg_offset, b_pg_offset;
int cnt;
while (xfersize > 0) {
a_pg_offset = a_offset & PAGE_MASK;
m_a = ma[a_offset >> PAGE_SHIFT];
p_a = m_a->phys_addr;
b_pg_offset = b_offset & PAGE_MASK;
m_b = mb[b_offset >> PAGE_SHIFT];
p_b = m_b->phys_addr;
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
if (__predict_false(!PHYS_IN_DMAP(p_a))) {
panic("!DMAP a %lx", p_a);
} else {
a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
}
if (__predict_false(!PHYS_IN_DMAP(p_b))) {
panic("!DMAP b %lx", p_b);
} else {
b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
}
bcopy(a_cp, b_cp, cnt);
a_offset += cnt;
b_offset += cnt;
xfersize -= cnt;
}
}
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
}
void
pmap_quick_remove_page(vm_offset_t addr)
{
}
/*
* Returns true if the pmap's pv is one of the first
* 16 pvs linked to from this page. This count may
* be changed upwards or downwards in the future; it
* is only necessary that true be returned for a small
* subset of pmaps for proper page aging.
*/
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
struct md_page *pvh;
struct rwlock *lock;
pv_entry_t pv;
int loops = 0;
boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_page_exists_quick: page %p is not managed", m));
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
}
loops++;
if (loops >= 16)
break;
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
}
loops++;
if (loops >= 16)
break;
}
}
rw_runlock(lock);
return (rv);
}
/*
* pmap_page_wired_mappings:
*
* Return the number of managed mappings to the given physical page
* that are wired.
*/
int
pmap_page_wired_mappings(vm_page_t m)
{
struct rwlock *lock;
struct md_page *pvh;
pmap_t pmap;
pt_entry_t *pte;
pv_entry_t pv;
int count, lvl, md_gen, pvh_gen;
if ((m->oflags & VPO_UNMANAGED) != 0)
return (0);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
restart:
count = 0;
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
count++;
PMAP_UNLOCK(pmap);
}
if ((m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if (pte != NULL &&
(pmap_load(pte) & ATTR_SW_WIRED) != 0)
count++;
PMAP_UNLOCK(pmap);
}
}
rw_runlock(lock);
return (count);
}
/*
* Destroy all managed, non-wired mappings in the given user-space
* pmap. This pmap cannot be active on any processor besides the
* caller.
*
* This function cannot be applied to the kernel pmap. Moreover, it
* is not intended for general use. It is only to be used during
* process termination. Consequently, it can be implemented in ways
* that make it faster than pmap_remove(). First, it can more quickly
* destroy mappings by iterating over the pmap's collection of PV
* entries, rather than searching the page table. Second, it doesn't
* have to test and clear the page table entries atomically, because
* no processor is currently accessing the user address space. In
* particular, a page table entry's dirty bit won't change state once
* this function starts.
*/
void
pmap_remove_pages(pmap_t pmap)
{
pd_entry_t *pde;
pt_entry_t *pte, tpte;
struct spglist free;
vm_page_t m, ml3, mt;
pv_entry_t pv;
struct md_page *pvh;
struct pv_chunk *pc, *npc;
struct rwlock *lock;
int64_t bit;
uint64_t inuse, bitmask;
int allfree, field, freed, idx, lvl;
vm_paddr_t pa;
lock = NULL;
SLIST_INIT(&free);
PMAP_LOCK(pmap);
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
allfree = 1;
freed = 0;
for (field = 0; field < _NPCM; field++) {
inuse = ~pc->pc_map[field] & pc_freemask[field];
while (inuse != 0) {
bit = ffsl(inuse) - 1;
bitmask = 1UL << bit;
idx = field * 64 + bit;
pv = &pc->pc_pventry[idx];
inuse &= ~bitmask;
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL,
("Attempting to remove an unmapped page"));
switch (lvl) {
case 1:
pte = pmap_l1_to_l2(pde, pv->pv_va);
tpte = pmap_load(pte);
KASSERT((tpte & ATTR_DESCR_MASK) ==
L2_BLOCK,
("Attempting to remove an invalid "
"block: %lx", tpte));
break;
case 2:
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
KASSERT((tpte & ATTR_DESCR_MASK) ==
L3_PAGE,
("Attempting to remove an invalid "
"page: %lx", tpte));
break;
default:
panic(
"Invalid page directory level: %d",
lvl);
}
/*
* We cannot remove wired pages from a process' mapping at this time
*/
if (tpte & ATTR_SW_WIRED) {
allfree = 0;
continue;
}
pa = tpte & ~ATTR_MASK;
m = PHYS_TO_VM_PAGE(pa);
KASSERT(m->phys_addr == pa,
("vm_page_t %p phys_addr mismatch %016jx %016jx",
m, (uintmax_t)m->phys_addr,
(uintmax_t)tpte));
KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
m < &vm_page_array[vm_page_array_size],
("pmap_remove_pages: bad pte %#jx",
(uintmax_t)tpte));
pmap_load_clear(pte);
/*
* Update the vm_page_t clean/reference bits.
*/
if ((tpte & ATTR_AP_RW_BIT) ==
ATTR_AP(ATTR_AP_RW)) {
switch (lvl) {
case 1:
for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
vm_page_dirty(mt);
break;
case 2:
vm_page_dirty(m);
break;
}
}
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
/* Mark free */
pc->pc_map[field] |= bitmask;
switch (lvl) {
case 1:
pmap_resident_count_dec(pmap,
L2_SIZE / PAGE_SIZE);
pvh = pa_to_pvh(tpte & ~ATTR_MASK);
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
if ((mt->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&mt->md.pv_list))
vm_page_aflag_clear(mt, PGA_WRITEABLE);
}
ml3 = pmap_remove_pt_page(pmap,
pv->pv_va);
if (ml3 != NULL) {
pmap_resident_count_dec(pmap, 1);
KASSERT(ml3->wire_count == NL3PG,
("pmap_remove_pages: l3 page wire count error"));
ml3->wire_count = 0;
pmap_add_delayed_free_list(ml3,
&free, FALSE);
atomic_subtract_int(
&vm_cnt.v_wire_count, 1);
}
break;
case 2:
pmap_resident_count_dec(pmap, 1);
TAILQ_REMOVE(&m->md.pv_list, pv,
pv_next);
m->md.pv_gen++;
if ((m->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(
VM_PAGE_TO_PHYS(m));
if (TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m,
PGA_WRITEABLE);
}
break;
}
pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
&free);
freed++;
}
}
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
if (allfree) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
free_pv_chunk(pc);
}
}
pmap_invalidate_all(pmap);
if (lock != NULL)
rw_wunlock(lock);
PMAP_UNLOCK(pmap);
pmap_free_zero_pages(&free);
}
/*
* This is used to check if a page has been accessed or modified. As we
* don't have a bit to see if it has been modified we have to assume it
* has been if the page is read/write.
*/
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
struct rwlock *lock;
pv_entry_t pv;
struct md_page *pvh;
pt_entry_t *pte, mask, value;
pmap_t pmap;
int lvl, md_gen, pvh_gen;
boolean_t rv;
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
restart:
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
KASSERT(lvl == 3,
("pmap_page_test_mappings: Invalid level %d", lvl));
mask = 0;
value = 0;
if (modified) {
mask |= ATTR_AP_RW_BIT;
value |= ATTR_AP(ATTR_AP_RW);
}
if (accessed) {
mask |= ATTR_AF | ATTR_DESCR_MASK;
value |= ATTR_AF | L3_PAGE;
}
rv = (pmap_load(pte) & mask) == value;
PMAP_UNLOCK(pmap);
if (rv)
goto out;
}
if ((m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
rw_runlock(lock);
PMAP_LOCK(pmap);
rw_rlock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
KASSERT(lvl == 2,
("pmap_page_test_mappings: Invalid level %d", lvl));
mask = 0;
value = 0;
if (modified) {
mask |= ATTR_AP_RW_BIT;
value |= ATTR_AP(ATTR_AP_RW);
}
if (accessed) {
mask |= ATTR_AF | ATTR_DESCR_MASK;
value |= ATTR_AF | L2_BLOCK;
}
rv = (pmap_load(pte) & mask) == value;
PMAP_UNLOCK(pmap);
if (rv)
goto out;
}
}
out:
rw_runlock(lock);
return (rv);
}
/*
* pmap_is_modified:
*
* Return whether or not the specified physical page was modified
* in any physical maps.
*/
boolean_t
pmap_is_modified(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_modified: page %p is not managed", m));
/*
* If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* concurrently set while the object is locked. Thus, if PGA_WRITEABLE
* is clear, no PTEs can have PG_M set.
*/
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return (FALSE);
return (pmap_page_test_mappings(m, FALSE, TRUE));
}
/*
* pmap_is_prefaultable:
*
* Return whether or not the specified virtual address is eligible
* for prefault.
*/
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
pt_entry_t *pte;
boolean_t rv;
int lvl;
rv = FALSE;
PMAP_LOCK(pmap);
pte = pmap_pte(pmap, addr, &lvl);
if (pte != NULL && pmap_load(pte) != 0) {
rv = TRUE;
}
PMAP_UNLOCK(pmap);
return (rv);
}
/*
* pmap_is_referenced:
*
* Return whether or not the specified physical page was referenced
* in any physical maps.
*/
boolean_t
pmap_is_referenced(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_referenced: page %p is not managed", m));
return (pmap_page_test_mappings(m, TRUE, FALSE));
}
/*
* Clear the write and modified bits in each of the given page's mappings.
*/
void
pmap_remove_write(vm_page_t m)
{
struct md_page *pvh;
pmap_t pmap;
struct rwlock *lock;
pv_entry_t next_pv, pv;
pt_entry_t oldpte, *pte;
vm_offset_t va;
int lvl, md_gen, pvh_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_write: page %p is not managed", m));
/*
* If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* set by another thread while the object is locked. Thus,
* if PGA_WRITEABLE is clear, no page table entries need updating.
*/
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry_pv_loop:
rw_wlock(lock);
TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
rw_wunlock(lock);
goto retry_pv_loop;
}
}
va = pv->pv_va;
pte = pmap_pte(pmap, pv->pv_va, &lvl);
if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
&lock);
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
PMAP_UNLOCK(pmap);
}
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen ||
md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
rw_wunlock(lock);
goto retry_pv_loop;
}
}
pte = pmap_pte(pmap, pv->pv_va, &lvl);
retry:
oldpte = pmap_load(pte);
if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
if (!atomic_cmpset_long(pte, oldpte,
oldpte | ATTR_AP(ATTR_AP_RO)))
goto retry;
if ((oldpte & ATTR_AF) != 0)
vm_page_dirty(m);
pmap_invalidate_page(pmap, pv->pv_va);
}
PMAP_UNLOCK(pmap);
}
rw_wunlock(lock);
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
static __inline boolean_t
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{
return (FALSE);
}
/*
* pmap_ts_referenced:
*
* Return a count of reference bits for a page, clearing those bits.
* It is not necessary for every reference bit to be cleared, but it
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
* to pmap_is_modified(). However, since this function stops after
* finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
* dirty pages. Those dirty pages will only be detected by a future call
* to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
{
struct md_page *pvh;
pv_entry_t pv, pvf;
pmap_t pmap;
struct rwlock *lock;
pd_entry_t *pde, tpde;
pt_entry_t *pte, tpte;
pt_entry_t *l3;
vm_offset_t va;
vm_paddr_t pa;
int cleared, md_gen, not_cleared, lvl, pvh_gen;
struct spglist free;
bool demoted;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_ts_referenced: page %p is not managed", m));
SLIST_INIT(&free);
cleared = 0;
pa = VM_PAGE_TO_PHYS(m);
lock = PHYS_TO_PV_LIST_LOCK(pa);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
rw_wlock(lock);
retry:
not_cleared = 0;
if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
goto small_mappings;
pv = pvf;
do {
if (pvf == NULL)
pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
}
}
va = pv->pv_va;
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
KASSERT(lvl == 1,
("pmap_ts_referenced: invalid pde level %d", lvl));
tpde = pmap_load(pde);
KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
("pmap_ts_referenced: found an invalid l1 table"));
pte = pmap_l1_to_l2(pde, pv->pv_va);
tpte = pmap_load(pte);
if (pmap_page_dirty(tpte)) {
/*
* Although "tpte" is mapping a 2MB page, because
* this function is called at a 4KB page granularity,
* we only update the 4KB page under test.
*/
vm_page_dirty(m);
}
if ((tpte & ATTR_AF) != 0) {
/*
* Since this reference bit is shared by 512 4KB
* pages, it should not be cleared every time it is
* tested. Apply a simple "hash" function on the
* physical page number, the virtual superpage number,
* and the pmap address to select one 4KB page out of
* the 512 on which testing the reference bit will
* result in clearing that reference bit. This
* function is designed to avoid the selection of the
* same 4KB page for every 2MB page mapping.
*
* On demotion, a mapping that hasn't been referenced
* is simply destroyed. To avoid the possibility of a
* subsequent page fault on a demoted wired mapping,
* always leave its reference bit set. Moreover,
* since the superpage is wired, the current state of
* its reference bit won't affect page replacement.
*/
if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
(uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
(tpte & ATTR_SW_WIRED) == 0) {
if (safe_to_clear_referenced(pmap, tpte)) {
/*
* TODO: We don't handle the access
* flag at all. We need to be able
* to set it in the exception handler.
*/
panic("ARM64TODO: "
"safe_to_clear_referenced\n");
} else if (pmap_demote_l2_locked(pmap, pte,
pv->pv_va, &lock) != NULL) {
demoted = true;
va += VM_PAGE_TO_PHYS(m) -
(tpte & ~ATTR_MASK);
l3 = pmap_l2_to_l3(pte, va);
pmap_remove_l3(pmap, l3, va,
pmap_load(pte), NULL, &lock);
} else
demoted = true;
if (demoted) {
/*
* The superpage mapping was removed
* entirely and therefore 'pv' is no
* longer valid.
*/
if (pvf == pv)
pvf = NULL;
pv = NULL;
}
cleared++;
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
} else
not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
}
if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
goto out;
} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
goto out;
pv = pvf;
do {
if (pvf == NULL)
pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
rw_wunlock(lock);
PMAP_LOCK(pmap);
rw_wlock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
}
}
pde = pmap_pde(pmap, pv->pv_va, &lvl);
KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
KASSERT(lvl == 2,
("pmap_ts_referenced: invalid pde level %d", lvl));
tpde = pmap_load(pde);
KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
("pmap_ts_referenced: found an invalid l2 table"));
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
if (pmap_page_dirty(tpte))
vm_page_dirty(m);
if ((tpte & ATTR_AF) != 0) {
if (safe_to_clear_referenced(pmap, tpte)) {
/*
* TODO: We don't handle the access flag
* at all. We need to be able to set it in
* the exception handler.
*/
panic("ARM64TODO: safe_to_clear_referenced\n");
} else if ((tpte & ATTR_SW_WIRED) == 0) {
/*
* Wired pages cannot be paged out so
* doing accessed bit emulation for
* them is wasted effort. We do the
* hard work for unwired pages only.
*/
pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
&free, &lock);
pmap_invalidate_page(pmap, pv->pv_va);
cleared++;
if (pvf == pv)
pvf = NULL;
pv = NULL;
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
("inconsistent pv lock %p %p for page %p",
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
} else
not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
}
} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
not_cleared < PMAP_TS_REFERENCED_MAX);
out:
rw_wunlock(lock);
pmap_free_zero_pages(&free);
return (cleared + not_cleared);
}
/*
* Apply the given advice to the specified range of addresses within the
* given pmap. Depending on the advice, clear the referenced and/or
* modified flags in each mapping and set the mapped page's dirty field.
*/
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
}
/*
* Clear the modify bits on the specified physical page.
*/
void
pmap_clear_modify(vm_page_t m)
{
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_clear_modify: page %p is not managed", m));
VM_OBJECT_ASSERT_WLOCKED(m->object);
KASSERT(!vm_page_xbusied(m),
("pmap_clear_modify: page %p is exclusive busied", m));
/*
* If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
* If the object containing the page is locked and the page is not
* exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
*/
if ((m->aflags & PGA_WRITEABLE) == 0)
return;
/* ARM64TODO: We lack support for tracking if a page is modified */
}
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
return ((void *)PHYS_TO_DMAP(pa));
}
void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}
/*
* Sets the memory attribute for the specified page.
*/
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{
m->md.pv_memattr = ma;
/*
* If "m" is a normal page, update its direct mapping. This update
* can be relied upon to perform any cache operations that are
* required for data coherence.
*/
if ((m->flags & PG_FICTITIOUS) == 0 &&
pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
m->md.pv_memattr) != 0)
panic("memory attribute change on the direct map failed");
}
/*
* Changes the specified virtual address range's memory type to that given by
* the parameter "mode". The specified virtual address range must be
* completely contained within either the direct map or the kernel map. If
* the virtual address range is contained within the kernel map, then the
* memory type for each of the corresponding ranges of the direct map is also
* changed. (The corresponding ranges of the direct map are those ranges that
* map the same physical pages as the specified virtual address range.) These
* changes to the direct map are necessary because Intel describes the
* behavior of their processors as "undefined" if two or more mappings to the
* same physical page have different memory types.
*
* Returns zero if the change completed successfully, and either EINVAL or
* ENOMEM if the change failed. Specifically, EINVAL is returned if some part
* of the virtual address range was not mapped, and ENOMEM is returned if
* there was insufficient memory available to complete the change. In the
* latter case, the memory type may have been changed on some part of the
* virtual address range or the direct map.
*/
static int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
int error;
PMAP_LOCK(kernel_pmap);
error = pmap_change_attr_locked(va, size, mode);
PMAP_UNLOCK(kernel_pmap);
return (error);
}
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
vm_offset_t base, offset, tmpva;
pt_entry_t l3, *pte, *newpte;
int lvl;
PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
base = trunc_page(va);
offset = va & PAGE_MASK;
size = round_page(offset + size);
if (!VIRT_IN_DMAP(base))
return (EINVAL);
for (tmpva = base; tmpva < base + size; ) {
pte = pmap_pte(kernel_pmap, tmpva, &lvl);
if (pte == NULL)
return (EINVAL);
if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
/*
* We already have the correct attribute,
* ignore this entry.
*/
switch (lvl) {
default:
panic("Invalid DMAP table level: %d\n", lvl);
case 1:
tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
break;
case 2:
tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
break;
case 3:
tmpva += PAGE_SIZE;
break;
}
} else {
/*
* Split the entry into a level 3 table, then
* set the new attribute.
*/
switch (lvl) {
default:
panic("Invalid DMAP table level: %d\n", lvl);
case 1:
newpte = pmap_demote_l1(kernel_pmap, pte,
tmpva & ~L1_OFFSET);
if (newpte == NULL)
return (EINVAL);
pte = pmap_l1_to_l2(pte, tmpva);
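/* FALLTHROUGH */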
case 2:
newpte = pmap_demote_l2(kernel_pmap, pte,
tmpva & ~L2_OFFSET);
if (newpte == NULL)
return (EINVAL);
pte = pmap_l2_to_l3(pte, tmpva);
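/* FALLTHROUGH */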
case 3:
/* Update the entry */
l3 = pmap_load(pte);
l3 &= ~ATTR_IDX_MASK;
l3 |= ATTR_IDX(mode);
if (mode == DEVICE_MEMORY)
l3 |= ATTR_XN;
pmap_update_entry(kernel_pmap, pte, l3, tmpva,
PAGE_SIZE);
/*
* If moving to a non-cacheable entry flush
* the cache.
*/
if (mode == VM_MEMATTR_UNCACHEABLE)
cpu_dcache_wbinv_range(tmpva, L3_SIZE);
break;
}
tmpva += PAGE_SIZE;
}
}
return (0);
}
/*
* Create an L2 table to map all addresses within an L1 mapping.
*/
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
pt_entry_t *l2, newl2, oldl1;
vm_offset_t tmpl1;
vm_paddr_t l2phys, phys;
vm_page_t ml2;
int i;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldl1 = pmap_load(l1);
KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
("pmap_demote_l1: Demoting a non-block entry"));
KASSERT((va & L1_OFFSET) == 0,
("pmap_demote_l1: Invalid virtual address %#lx", va));
KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
("pmap_demote_l1: Level 1 table shouldn't be managed"));
tmpl1 = 0;
if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
tmpl1 = kva_alloc(PAGE_SIZE);
if (tmpl1 == 0)
return (NULL);
}
if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
" in pmap %p", va, pmap);
return (NULL);
}
l2phys = VM_PAGE_TO_PHYS(ml2);
l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
/* The physical address the range points at */
phys = oldl1 & ~ATTR_MASK;
/* The attributes from the old l1 table to be copied */
newl2 = oldl1 & ATTR_MASK;
/* Create the new entries */
for (i = 0; i < Ln_ENTRIES; i++) {
l2[i] = newl2 | phys;
phys += L2_SIZE;
}
KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
("Invalid l2 page (%lx != %lx)", l2[0],
(oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
if (tmpl1 != 0) {
pmap_kenter(tmpl1, PAGE_SIZE,
DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
}
pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
if (tmpl1 != 0) {
pmap_kremove(tmpl1);
kva_free(tmpl1, PAGE_SIZE);
}
return (l2);
}
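/*
 * Worked example (sizes assume the usual 4KB translation granule):
 * demoting a 1GB L1 block that maps physical address 0x40000000
 * fills the new table with Ln_ENTRIES (512) L2 block entries, the
 * i-th covering 0x40000000 + i * L2_SIZE (2MB), each carrying the
 * attribute bits copied from the old L1 entry; l2[0] is therefore
 * the old entry with only its descriptor type changed, which is
 * exactly what the KASSERT above verifies.
 */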
/*
* Create an L3 table to map all addresses within an L2 mapping.
*/
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
struct rwlock **lockp)
{
pt_entry_t *l3, newl3, oldl2;
vm_offset_t tmpl2;
vm_paddr_t l3phys, phys;
vm_page_t ml3;
int i;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
l3 = NULL;
oldl2 = pmap_load(l2);
KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
("pmap_demote_l2: Demoting a non-block entry"));
KASSERT((va & L2_OFFSET) == 0,
("pmap_demote_l2: Invalid virtual address %#lx", va));
tmpl2 = 0;
if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
tmpl2 = kva_alloc(PAGE_SIZE);
if (tmpl2 == 0)
return (NULL);
}
if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
(VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (ml3 == NULL) {
CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
" in pmap %p", va, pmap);
goto fail;
}
if (va < VM_MAXUSER_ADDRESS)
pmap_resident_count_inc(pmap, 1);
}
l3phys = VM_PAGE_TO_PHYS(ml3);
l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
/* The physical address the range points at */
phys = oldl2 & ~ATTR_MASK;
/* The attributes from the old l2 table to be copied */
newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE;
/*
* If the page table page is new, initialize it.
*/
if (ml3->wire_count == 1) {
for (i = 0; i < Ln_ENTRIES; i++) {
l3[i] = newl3 | phys;
phys += L3_SIZE;
}
}
KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE),
("Invalid l3 page (%lx != %lx)", l3[0],
(oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE));
/*
* Map the temporary page so we don't lose access to the l2 table.
*/
if (tmpl2 != 0) {
pmap_kenter(tmpl2, PAGE_SIZE,
DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
}
/*
* The spare PV entries must be reserved prior to demoting the
* mapping, that is, prior to changing the PDE. Otherwise, the state
* of the L2 and the PV lists will be inconsistent, which can result
* in reclaim_pv_chunk() attempting to remove a PV entry from the
* wrong PV list and pmap_pv_demote_l2() failing to find the expected
* PV entry for the 2MB page mapping that is being demoted.
*/
if ((oldl2 & ATTR_SW_MANAGED) != 0)
reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
/*
* Demote the PV entry.
*/
if ((oldl2 & ATTR_SW_MANAGED) != 0)
pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
atomic_add_long(&pmap_l2_demotions, 1);
CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
" in pmap %p %lx", va, pmap, l3[0]);
fail:
if (tmpl2 != 0) {
pmap_kremove(tmpl2);
kva_free(tmpl2, PAGE_SIZE);
}
return (l3);
}
static pt_entry_t *
pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
{
struct rwlock *lock;
pt_entry_t *l3;
lock = NULL;
l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
if (lock != NULL)
rw_wunlock(lock);
return (l3);
}
/*
* perform the pmap work for mincore
*/
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
pd_entry_t *l1p, l1;
pd_entry_t *l2p, l2;
pt_entry_t *l3p, l3;
vm_paddr_t pa;
bool managed;
int val;
PMAP_LOCK(pmap);
retry:
pa = 0;
val = 0;
managed = false;
l1p = pmap_l1(pmap, addr);
if (l1p == NULL) /* No l1 */
goto done;
l1 = pmap_load(l1p);
if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
goto done;
if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_SUPER | MINCORE_INCORE;
if (pmap_page_dirty(l1))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l1 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
goto done;
}
l2p = pmap_l1_to_l2(l1p, addr);
if (l2p == NULL) /* No l2 */
goto done;
l2 = pmap_load(l2p);
if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
goto done;
if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_SUPER | MINCORE_INCORE;
if (pmap_page_dirty(l2))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l2 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
goto done;
}
l3p = pmap_l2_to_l3(l2p, addr);
if (l3p == NULL) /* No l3 */
goto done;
l3 = pmap_load(l3p);
if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
goto done;
if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
val = MINCORE_INCORE;
if (pmap_page_dirty(l3))
val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
if ((l3 & ATTR_AF) == ATTR_AF)
val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
}
done:
if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
(MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
goto retry;
} else
PA_UNLOCK_COND(*locked_pa);
PMAP_UNLOCK(pmap);
return (val);
}
void
pmap_activate(struct thread *td)
{
pmap_t pmap;
critical_enter();
pmap = vmspace_pmap(td->td_proc->p_vmspace);
td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
__asm __volatile("msr ttbr0_el1, %0" : :
"r"(td->td_proc->p_md.md_l0addr));
pmap_invalidate_all(pmap);
critical_exit();
}
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
if (va >= VM_MIN_KERNEL_ADDRESS) {
cpu_icache_sync_range(va, sz);
} else {
u_int len, offset;
vm_paddr_t pa;
/* Find the length of data in this page to flush */
offset = va & PAGE_MASK;
len = imin(PAGE_SIZE - offset, sz);
while (sz != 0) {
/* Extract the physical address & find it in the DMAP */
pa = pmap_extract(pmap, va);
if (pa != 0)
cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
/* Move to the next page */
sz -= len;
va += len;
/* Set the length for the next iteration */
len = imin(PAGE_SIZE, sz);
}
}
}
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
#ifdef SMP
uint64_t par;
#endif
switch (ESR_ELx_EXCEPTION(esr)) {
case EXCP_DATA_ABORT_L:
case EXCP_DATA_ABORT:
break;
default:
return (KERN_FAILURE);
}
#ifdef SMP
PMAP_LOCK(pmap);
switch (esr & ISS_DATA_DFSC_MASK) {
case ISS_DATA_DFSC_TF_L0:
case ISS_DATA_DFSC_TF_L1:
case ISS_DATA_DFSC_TF_L2:
case ISS_DATA_DFSC_TF_L3:
/* Ask the MMU to check the address */
if (pmap == kernel_pmap)
par = arm64_address_translate_s1e1r(far);
else
par = arm64_address_translate_s1e0r(far);
/*
* If the translation succeeds, the fault was transient: the entry
* was only momentarily invalid while another CPU performed a
* break-before-make sequence. We can unlock and return success
* to the trap handler.
*/
if (PAR_SUCCESS(par)) {
PMAP_UNLOCK(pmap);
return (KERN_SUCCESS);
}
break;
default:
break;
}
PMAP_UNLOCK(pmap);
#endif
return (KERN_FAILURE);
}
/*
* Increase the starting virtual address of the given mapping if a
* different alignment might result in more superpage mappings.
*/
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, vm_size_t size)
{
vm_offset_t superpage_offset;
if (size < L2_SIZE)
return;
if (object != NULL && (object->flags & OBJ_COLORED) != 0)
offset += ptoa(object->pg_color);
superpage_offset = offset & L2_OFFSET;
if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
(*addr & L2_OFFSET) == superpage_offset)
return;
if ((*addr & L2_OFFSET) < superpage_offset)
*addr = (*addr & ~L2_OFFSET) + superpage_offset;
else
*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
}
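/*
 * Worked example (assuming the usual 2MB L2_SIZE): for a 4MB mapping
 * of an object whose relevant offset is 0x123000, superpage_offset is
 * 0x123000, so a requested *addr of 0x20000000 is advanced to
 * 0x20123000. Virtual and physical addresses then agree modulo 2MB,
 * letting any fully populated, 2MB-aligned run be promoted to an L2
 * block mapping later.
 */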
/**
* Get the kernel virtual address of a set of physical pages. If there are
* physical addresses not covered by the DMAP perform a transient mapping
* that will be removed when calling pmap_unmap_io_transient.
*
* \param page The pages for which the caller wishes to obtain
* kernel virtual addresses.
* \param vaddr On return contains the kernel virtual memory address
* of the pages passed in the page parameter.
* \param count Number of pages passed in.
* \param can_fault TRUE if the thread using the mapped pages can take
* page faults, FALSE otherwise.
*
* \returns TRUE if the caller must call pmap_unmap_io_transient when
* finished or FALSE otherwise.
*
*/
boolean_t
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
boolean_t needs_mapping;
int error, i;
/*
* Allocate any KVA space that we need, this is done in a separate
* loop to prevent calling vmem_alloc while pinned.
*/
needs_mapping = FALSE;
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (__predict_false(!PHYS_IN_DMAP(paddr))) {
error = vmem_alloc(kernel_arena, PAGE_SIZE,
M_BESTFIT | M_WAITOK, &vaddr[i]);
KASSERT(error == 0, ("vmem_alloc failed: %d", error));
needs_mapping = TRUE;
} else {
vaddr[i] = PHYS_TO_DMAP(paddr);
}
}
/* Exit early if everything is covered by the DMAP */
if (!needs_mapping)
return (FALSE);
if (!can_fault)
sched_pin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (!PHYS_IN_DMAP(paddr)) {
panic(
"pmap_map_io_transient: TODO: Map out of DMAP data");
}
}
return (needs_mapping);
}
void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
boolean_t can_fault)
{
vm_paddr_t paddr;
int i;
if (!can_fault)
sched_unpin();
for (i = 0; i < count; i++) {
paddr = VM_PAGE_TO_PHYS(page[i]);
if (!PHYS_IN_DMAP(paddr)) {
panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
}
}
}
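/*
 * Hedged usage sketch (disabled; the helper name and destination
 * buffer are invented for illustration only): a caller that may be
 * handed pages outside the DMAP brackets its access with the pair
 * of functions above.
 */
#if 0
static void
example_copy_one_page(vm_page_t m, void *buf)
{
vm_offset_t va;
boolean_t mapped;

mapped = pmap_map_io_transient(&m, &va, 1, FALSE);
bcopy((void *)va, buf, PAGE_SIZE);
if (mapped)
pmap_unmap_io_transient(&m, &va, 1, FALSE);
}
#endif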
Index: head/sys/cam/ata/ata_da.c
===================================================================
--- head/sys/cam/ata/ata_da.c (revision 327172)
+++ head/sys/cam/ata/ata_da.c (revision 327173)
@@ -1,3587 +1,3584 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ada.h"
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/endian.h>
#include <sys/cons.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/ata/ata_all.h>
#include <machine/md_var.h> /* geometry translation */
#ifdef _KERNEL
#define ATA_MAX_28BIT_LBA 268435455UL
extern int iosched_debug;
typedef enum {
ADA_STATE_RAHEAD,
ADA_STATE_WCACHE,
ADA_STATE_LOGDIR,
ADA_STATE_IDDIR,
ADA_STATE_SUP_CAP,
ADA_STATE_ZONE,
ADA_STATE_NORMAL
} ada_state;
typedef enum {
ADA_FLAG_CAN_48BIT = 0x00000002,
ADA_FLAG_CAN_FLUSHCACHE = 0x00000004,
ADA_FLAG_CAN_NCQ = 0x00000008,
ADA_FLAG_CAN_DMA = 0x00000010,
ADA_FLAG_NEED_OTAG = 0x00000020,
ADA_FLAG_WAS_OTAG = 0x00000040,
ADA_FLAG_CAN_TRIM = 0x00000080,
ADA_FLAG_OPEN = 0x00000100,
ADA_FLAG_SCTX_INIT = 0x00000200,
ADA_FLAG_CAN_CFA = 0x00000400,
ADA_FLAG_CAN_POWERMGT = 0x00000800,
ADA_FLAG_CAN_DMA48 = 0x00001000,
ADA_FLAG_CAN_LOG = 0x00002000,
ADA_FLAG_CAN_IDLOG = 0x00004000,
ADA_FLAG_CAN_SUPCAP = 0x00008000,
ADA_FLAG_CAN_ZONE = 0x00010000,
ADA_FLAG_CAN_WCACHE = 0x00020000,
ADA_FLAG_CAN_RAHEAD = 0x00040000,
ADA_FLAG_PROBED = 0x00080000,
ADA_FLAG_ANNOUNCED = 0x00100000,
ADA_FLAG_DIRTY = 0x00200000,
ADA_FLAG_CAN_NCQ_TRIM = 0x00400000, /* CAN_TRIM also set */
ADA_FLAG_PIM_ATA_EXT = 0x00800000
} ada_flags;
typedef enum {
ADA_Q_NONE = 0x00,
ADA_Q_4K = 0x01,
ADA_Q_NCQ_TRIM_BROKEN = 0x02,
ADA_Q_LOG_BROKEN = 0x04,
ADA_Q_SMR_DM = 0x08
} ada_quirks;
#define ADA_Q_BIT_STRING \
"\020" \
"\0014K" \
"\002NCQ_TRIM_BROKEN" \
"\003LOG_BROKEN" \
"\004SMR_DM"
typedef enum {
ADA_CCB_RAHEAD = 0x01,
ADA_CCB_WCACHE = 0x02,
ADA_CCB_BUFFER_IO = 0x03,
ADA_CCB_DUMP = 0x05,
ADA_CCB_TRIM = 0x06,
ADA_CCB_LOGDIR = 0x07,
ADA_CCB_IDDIR = 0x08,
ADA_CCB_SUP_CAP = 0x09,
ADA_CCB_ZONE = 0x0a,
ADA_CCB_TYPE_MASK = 0x0F,
} ada_ccb_state;
typedef enum {
ADA_ZONE_NONE = 0x00,
ADA_ZONE_DRIVE_MANAGED = 0x01,
ADA_ZONE_HOST_AWARE = 0x02,
ADA_ZONE_HOST_MANAGED = 0x03
} ada_zone_mode;
typedef enum {
ADA_ZONE_FLAG_RZ_SUP = 0x0001,
ADA_ZONE_FLAG_OPEN_SUP = 0x0002,
ADA_ZONE_FLAG_CLOSE_SUP = 0x0004,
ADA_ZONE_FLAG_FINISH_SUP = 0x0008,
ADA_ZONE_FLAG_RWP_SUP = 0x0010,
ADA_ZONE_FLAG_SUP_MASK = (ADA_ZONE_FLAG_RZ_SUP |
ADA_ZONE_FLAG_OPEN_SUP |
ADA_ZONE_FLAG_CLOSE_SUP |
ADA_ZONE_FLAG_FINISH_SUP |
ADA_ZONE_FLAG_RWP_SUP),
ADA_ZONE_FLAG_URSWRZ = 0x0020,
ADA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
ADA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
ADA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
ADA_ZONE_FLAG_SET_MASK = (ADA_ZONE_FLAG_OPT_SEQ_SET |
ADA_ZONE_FLAG_OPT_NONSEQ_SET |
ADA_ZONE_FLAG_MAX_SEQ_SET)
} ada_zone_flags;
static struct ada_zone_desc {
ada_zone_flags value;
const char *desc;
} ada_zone_desc_table[] = {
{ADA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{ADA_ZONE_FLAG_OPEN_SUP, "Open" },
{ADA_ZONE_FLAG_CLOSE_SUP, "Close" },
{ADA_ZONE_FLAG_FINISH_SUP, "Finish" },
{ADA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
typedef enum {
ADA_DELETE_NONE,
ADA_DELETE_DISABLE,
ADA_DELETE_CFA_ERASE,
ADA_DELETE_DSM_TRIM,
ADA_DELETE_NCQ_DSM_TRIM,
ADA_DELETE_MIN = ADA_DELETE_CFA_ERASE,
ADA_DELETE_MAX = ADA_DELETE_NCQ_DSM_TRIM,
} ada_delete_methods;
static const char *ada_delete_method_names[] =
{ "NONE", "DISABLE", "CFA_ERASE", "DSM_TRIM", "NCQ_DSM_TRIM" };
#if 0
static const char *ada_delete_method_desc[] =
{ "NONE", "DISABLED", "CFA Erase", "DSM Trim", "DSM Trim via NCQ" };
#endif
struct disk_params {
u_int8_t heads;
u_int8_t secs_per_track;
u_int32_t cylinders;
u_int32_t secsize; /* Number of bytes/logical sector */
u_int64_t sectors; /* Total number of sectors */
};
#define TRIM_MAX_BLOCKS 8
#define TRIM_MAX_RANGES (TRIM_MAX_BLOCKS * ATA_DSM_BLK_RANGES)
struct trim_request {
uint8_t data[TRIM_MAX_RANGES * ATA_DSM_RANGE_SIZE];
TAILQ_HEAD(, bio) bps;
};
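/*
 * Sizing note (the ATA_DSM_* values are assumed from the ATA DSM
 * definitions rather than restated here): with ATA_DSM_BLK_RANGES == 64
 * and ATA_DSM_RANGE_SIZE == 8, a trim_request carries up to
 * TRIM_MAX_BLOCKS * 64 == 512 LBA ranges in a 4096-byte payload, and
 * each range can describe at most 65535 sectors.
 */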
struct ada_softc {
struct cam_iosched_softc *cam_iosched;
int outstanding_cmds; /* Number of active commands */
int refcount; /* Active xpt_action() calls */
ada_state state;
ada_flags flags;
ada_zone_mode zone_mode;
ada_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
ada_quirks quirks;
ada_delete_methods delete_method;
int trim_max_ranges;
int read_ahead;
int write_cache;
int unmappedio;
int rotating;
#ifdef ADA_TEST_FAILURE
int force_read_error;
int force_write_error;
int periodic_read_error;
int periodic_read_count;
#endif
struct disk_params params;
struct disk *disk;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
struct trim_request trim_req;
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int timeouts;
u_int errors;
u_int invalidations;
#endif
#define ADA_ANNOUNCETMP_SZ 80
char announce_temp[ADA_ANNOUNCETMP_SZ];
#define ADA_ANNOUNCE_SZ 400
char announce_buffer[ADA_ANNOUNCE_SZ];
};
struct ada_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
ada_quirks quirks;
};
static struct ada_quirk_entry ada_quirk_table[] =
{
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Hitachi H??????????E3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD155UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD204UI*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DL*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Barracuda Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640424AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750420AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750422AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750423AS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???LT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green/Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Red Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????CX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????EX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RS*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PKT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PVT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PVT*", "*" },
/*quirks*/ADA_Q_4K
},
/* SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair CSSD-F*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force 3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force G*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "M4-CT???M4SSD2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Crucial M500 SSDs MU07 firmware
* NCQ Trim works
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "MU07" },
/*quirks*/0
},
{
/*
* Crucial M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial M550 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial MX100 SSDs
* NCQ Trim doesn't work, but only on MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/ADA_Q_4K
},
{
/*
* FCCT M500 SSDs
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FCCT*M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2CW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2CT*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2MH*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BW*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BX*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2M*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SE100S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SH103S3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MARVELL SD88SA02*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Micron M500 SSDs firmware MU07
* NCQ Trim works?
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "MU07" },
/*quirks*/0
},
{
/*
* Micron M500 SSDs all other firmware
* NCQ Trim doesn't work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron M5[15]0 SSDs
* NCQ Trim doesn't work, but only MU01 firmware
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" },
/*quirks*/ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Micron 5100 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "DENRSTE251M45*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ?VERTEX2*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX3*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX4*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 750 SSDs
* 4k optimised, NCQ TRIM seems to work
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 750*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 840 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 845 SSDs
* 4k optimised, NCQ TRIM Broken (normal TRIM is fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 845*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 850 SSDs
* 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised, NCQ believed to be working
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7KM*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* 4k optimised, NCQ believed to be broken since these appear
* to be built with the same controllers as the 840/850.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Same as for SAMSUNG MZ7*, but also enable the quirks for
* SSDs whose model names start with MZ7*.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "MZ7*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung PM851 Series SSDs Dell OEM
* device model "SAMSUNG SSD PM851 mSATA 256GB"
* 4k optimised, NCQ broken
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD PM851*", "*" },
/*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "FTM??CT25H*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SG9XCS2D*", "*" },
/*quirks*/ADA_Q_4K
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD200HJ KF100-06
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD200*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Samsung drive that doesn't support READ LOG EXT or
* READ LOG DMA EXT, despite reporting that it does in
* ATA identify data:
* SAMSUNG HD501LJ CR100-10
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD501*", "*" },
/*quirks*/ADA_Q_LOG_BROKEN
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "ST8000AS000[23]*", "*" },
/*quirks*/ADA_Q_SMR_DM
},
{
/* Default */
{
T_ANY, SIP_MEDIA_REMOVABLE|SIP_MEDIA_FIXED,
/*vendor*/"*", /*product*/"*", /*revision*/"*"
},
/*quirks*/0
},
};
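/*
 * Quirk entries are matched against the ATA identify data in adaregister()
 * via cam_quirkmatch()/ata_identify_match(); the first matching entry is
 * used, which is why the catch-all default entry must stay last.  The
 * selected quirk mask can also be overridden per unit with the
 * kern.cam.ada.%d.quirks tunable fetched in adaregister().
 */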
static disk_strategy_t adastrategy;
static dumper_t adadump;
static periph_init_t adainit;
static void adadiskgonecb(struct disk *dp);
static periph_oninv_t adaoninvalidate;
static periph_dtor_t adacleanup;
static void adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static int adazonemodesysctl(SYSCTL_HANDLER_ARGS);
static int adazonesupsysctl(SYSCTL_HANDLER_ARGS);
static void adasysctlinit(void *context, int pending);
static int adagetattr(struct bio *bp);
static void adasetflags(struct ada_softc *softc,
struct ccb_getdev *cgd);
static periph_ctor_t adaregister;
static void ada_dsmtrim(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static void ada_cfaerase(struct ada_softc *softc, struct bio *bp,
struct ccb_ataio *ataio);
static int ada_zone_bio_to_ata(int disk_zone_cmd);
static int ada_zone_cmd(struct cam_periph *periph, union ccb *ccb,
struct bio *bp, int *queue_ccb);
static periph_start_t adastart;
static void adaprobedone(struct cam_periph *periph, union ccb *ccb);
static void adazonedone(struct cam_periph *periph, union ccb *ccb);
static void adadone(struct cam_periph *periph,
union ccb *done_ccb);
static int adaerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void adagetparams(struct cam_periph *periph,
struct ccb_getdev *cgd);
static timeout_t adasendorderedtag;
static void adashutdown(void *arg, int howto);
static void adasuspend(void *arg);
static void adaresume(void *arg);
#ifndef ADA_DEFAULT_TIMEOUT
#define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */
#endif
#ifndef ADA_DEFAULT_RETRY
#define ADA_DEFAULT_RETRY 4
#endif
#ifndef ADA_DEFAULT_SEND_ORDERED
#define ADA_DEFAULT_SEND_ORDERED 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SHUTDOWN
#define ADA_DEFAULT_SPINDOWN_SHUTDOWN 1
#endif
#ifndef ADA_DEFAULT_SPINDOWN_SUSPEND
#define ADA_DEFAULT_SPINDOWN_SUSPEND 1
#endif
#ifndef ADA_DEFAULT_READ_AHEAD
#define ADA_DEFAULT_READ_AHEAD 1
#endif
#ifndef ADA_DEFAULT_WRITE_CACHE
#define ADA_DEFAULT_WRITE_CACHE 1
#endif
#define ADA_RA (softc->read_ahead >= 0 ? \
softc->read_ahead : ada_read_ahead)
#define ADA_WC (softc->write_cache >= 0 ? \
softc->write_cache : ada_write_cache)
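/*
 * softc->read_ahead and softc->write_cache start out at -1 ("not set"), so
 * ADA_RA/ADA_WC fall back to the global ada_read_ahead/ada_write_cache
 * defaults unless the per-unit tunable or sysctl has been set to a
 * non-negative value.
 */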
/*
* Most platforms map firmware geometry to actual, but some don't. If
* not overridden, default to nothing.
*/
#ifndef ata_disk_firmware_geom_adjust
#define ata_disk_firmware_geom_adjust(disk)
#endif
static int ada_retry_count = ADA_DEFAULT_RETRY;
static int ada_default_timeout = ADA_DEFAULT_TIMEOUT;
static int ada_send_ordered = ADA_DEFAULT_SEND_ORDERED;
static int ada_spindown_shutdown = ADA_DEFAULT_SPINDOWN_SHUTDOWN;
static int ada_spindown_suspend = ADA_DEFAULT_SPINDOWN_SUSPEND;
static int ada_read_ahead = ADA_DEFAULT_READ_AHEAD;
static int ada_write_cache = ADA_DEFAULT_WRITE_CACHE;
static SYSCTL_NODE(_kern_cam, OID_AUTO, ada, CTLFLAG_RD, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&ada_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&ada_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&ada_send_ordered, 0, "Send Ordered Tags");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_shutdown, CTLFLAG_RWTUN,
&ada_spindown_shutdown, 0, "Spin down upon shutdown");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_suspend, CTLFLAG_RWTUN,
&ada_spindown_suspend, 0, "Spin down upon suspend");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, read_ahead, CTLFLAG_RWTUN,
&ada_read_ahead, 0, "Enable disk read-ahead");
SYSCTL_INT(_kern_cam_ada, OID_AUTO, write_cache, CTLFLAG_RWTUN,
&ada_write_cache, 0, "Enable disk write cache");
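/*
 * The knobs above are CTLFLAG_RWTUN, so they can be set both as loader
 * tunables and at runtime, for example:
 *   sysctl kern.cam.ada.write_cache=0
 * Per-unit tunables such as kern.cam.ada.%d.read_ahead and
 * kern.cam.ada.%d.write_cache are fetched in adaregister() below.
 */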
/*
* ADA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* still passes the "don't send an ordered tag" test, so it can take
* us two intervals to determine that a tag must be sent.
*/
#ifndef ADA_ORDEREDTAG_INTERVAL
#define ADA_ORDEREDTAG_INTERVAL 4
#endif
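/*
 * Worked example: with the defaults above (ada_default_timeout = 30 seconds
 * and ADA_ORDEREDTAG_INTERVAL = 4), adaregister() arms the sendordered
 * callout every (ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL ticks,
 * i.e. one check roughly every 7.5 seconds.
 */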
static struct periph_driver adadriver =
{
adainit, "ada",
TAILQ_HEAD_INITIALIZER(adadriver.units), /* generation */ 0
};
static int adadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
PERIPHDRIVER_DECLARE(ada, adadriver);
static MALLOC_DEFINE(M_ATADA, "ata_da", "ata_da buffers");
static int
adaopen(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
return(ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaopen\n"));
softc = (struct ada_softc *)periph->softc;
softc->flags |= ADA_FLAG_OPEN;
cam_periph_unhold(periph);
cam_periph_unlock(periph);
return (0);
}
static int
adaclose(struct disk *dp)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("adaclose\n"));
/* We only sync the cache if the drive is capable of it. */
if ((softc->flags & ADA_FLAG_DIRTY) != 0 &&
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) != 0 &&
(periph->flags & CAM_PERIPH_INVALID) == 0 &&
cam_periph_hold(periph, PRIBIO) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/0, softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
softc->flags &= ~ADA_FLAG_DIRTY;
xpt_release_ccb(ccb);
cam_periph_unhold(periph);
}
softc->flags &= ~ADA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "adaclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
adaschedule(struct cam_periph *periph)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
if (softc->state != ADA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
adastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastrategy(%p)\n", bp));
/*
* If the device has been made invalid, error out
*/
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
adaschedule(periph);
cam_periph_unlock(periph);
return;
}
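/*
 * Kernel crash-dump hook (disk d_dump).  Writes "length" bytes of dump data
 * at byte offset "offset" with WRITE DMA (48-bit when the LBA or sector
 * count requires it); a zero-length call, as issued at the end of a dump,
 * flushes the drive's write cache when the drive supports it.
 */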
static int
adadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct ada_softc *softc;
u_int secsize;
struct ccb_ataio ataio;
struct disk *dp;
uint64_t lba;
uint16_t count;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct ada_softc *)periph->softc;
cam_periph_lock(periph);
secsize = softc->params.secsize;
lba = offset / secsize;
count = length / secsize;
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_unlock(periph);
return (ENXIO);
}
memset(&ataio, 0, sizeof(ataio));
if (length > 0) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
adadone,
CAM_DIR_OUT,
0,
(u_int8_t *) virtual,
length,
ada_default_timeout*1000);
if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count >= 256)) {
ata_48bit_cmd(&ataio, ATA_WRITE_DMA48,
0, lba, count);
} else {
ata_28bit_cmd(&ataio, ATA_WRITE_DMA,
0, lba, count);
}
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
cam_periph_unlock(periph);
return (error);
}
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) {
xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
/*
* Tell the drive to flush its internal cache. If we
* can't flush in 5s we have big problems. No need to
* wait the full default timeout to detect problems.
*/
ataio.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&ataio,
0,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
5*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&ataio, adaerror,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
cam_periph_unlock(periph);
return (error);
}
static void
adainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, adaasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("ada: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (ada_send_ordered) {
/* Register our event handlers */
if ((EVENTHANDLER_REGISTER(power_suspend, adasuspend,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(power_resume, adaresume,
NULL, EVENTHANDLER_PRI_LAST)) == NULL)
printf("adainit: power event registration failed!\n");
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, adashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("adainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
adadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
adaoninvalidate(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, adaasync, periph, periph->path);
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
disk_gone(softc->disk);
}
static void
adacleanup(struct cam_periph *periph)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
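/*
 * Pick the BIO_DELETE implementation in order of preference: NCQ DSM TRIM,
 * then plain DSM TRIM, then CFA ERASE (only for CFA devices without 48-bit
 * support), otherwise no delete support.
 */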
static void
adasetdeletemethod(struct ada_softc *softc)
{
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM;
else if (softc->flags & ADA_FLAG_CAN_TRIM)
softc->delete_method = ADA_DELETE_DSM_TRIM;
else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))
softc->delete_method = ADA_DELETE_CFA_ERASE;
else
softc->delete_method = ADA_DELETE_NONE;
}
static void
adaasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct ccb_getdev cgd;
struct cam_periph *periph;
struct ada_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_ATA)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(adaregister, adaoninvalidate,
adacleanup, adastart,
"ada", CAM_PERIPH_BIO,
path, adaasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("adaasync: Unable to attach to new device "
"due to status 0x%x\n", status);
break;
}
case AC_GETDEV_CHANGED:
{
softc = (struct ada_softc *)periph->softc;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
/*
* Set/clear support flags based on the new Identify data.
*/
adasetflags(softc, &cgd);
cam_periph_async(periph, code, path, arg);
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct ada_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_SENT_BDR:
case AC_BUS_RESET:
{
softc = (struct ada_softc *)periph->softc;
cam_periph_async(periph, code, path, arg);
if (softc->state != ADA_STATE_NORMAL)
break;
xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD)
softc->state = ADA_STATE_RAHEAD;
else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE)
softc->state = ADA_STATE_WCACHE;
else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE))
softc->state = ADA_STATE_LOGDIR;
else
break;
if (cam_periph_acquire(periph) != CAM_REQ_CMP)
softc->state = ADA_STATE_NORMAL;
else
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static int
adazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct ada_softc *softc;
int error;
softc = (struct ada_softc *)arg1;
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case ADA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case ADA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case ADA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
adazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct ada_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct ada_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(ada_zone_desc_table) /
sizeof(ada_zone_desc_table[0]); i++) {
if (softc->zone_flags & ada_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, ada_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
static void
adasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct ada_softc *softc;
char tmpstr[32], tmpstr2[16];
periph = (struct cam_periph *)context;
/* periph was held for us when this task was enqueued */
if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
cam_periph_release(periph);
return;
}
softc = (struct ada_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM ADA unit %d",periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= ADA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_ada), OID_AUTO, tmpstr2,
CTLFLAG_RD, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("adasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RW,
softc, 0, adadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "read_ahead", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->read_ahead, 0, "Enable disk read ahead.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->write_cache, 0, "Enable disk write cache.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->unmappedio, 0, "Unmapped I/O leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->rotating, 0, "Rotating media");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, adazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, adazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
#ifdef ADA_TEST_FAILURE
/*
* Add a 'door bell' sysctl which allows one to set it from userland
* and cause something bad to happen. For the moment, we only allow
* whacking the next read or write.
*/
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_read_error, 0,
"Force a read error for the next N reads.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "force_write_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->force_write_error, 0,
"Force a write error for the next N writes.");
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "periodic_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->periodic_read_error, 0,
"Force a read error every N reads (don't set too low).");
#endif
#ifdef CAM_IO_STATS
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->timeouts, 0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->errors, 0,
"Transport errors reported by the SIM.");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
&softc->invalidations, 0,
"Device pack invalidations.");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
adagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
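/*
 * sysctl handler for kern.cam.ada.%d.delete_method: reports the current
 * BIO_DELETE method by name and only accepts methods the drive actually
 * supports (plus the disable method).
 */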
static int
adadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct ada_softc *softc;
int i, error, value, methods;
softc = (struct ada_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > ADA_DELETE_MAX)
p = "UNKNOWN";
else
p = ada_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
methods = 1 << ADA_DELETE_DISABLE;
if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT))
methods |= 1 << ADA_DELETE_CFA_ERASE;
if (softc->flags & ADA_FLAG_CAN_TRIM)
methods |= 1 << ADA_DELETE_DSM_TRIM;
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
methods |= 1 << ADA_DELETE_NCQ_DSM_TRIM;
for (i = 0; i <= ADA_DELETE_MAX; i++) {
if (!(methods & (1 << i)) ||
strcmp(buf, ada_delete_method_names[i]) != 0)
continue;
softc->delete_method = i;
return (0);
}
return (EINVAL);
}
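/*
 * Derive the ADA_FLAG_* capability flags and the zone mode from the ATA
 * identify data (and the quirks), then re-pick the delete method.  Called
 * at registration time and again on AC_GETDEV_CHANGED.
 */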
static void
adasetflags(struct ada_softc *softc, struct ccb_getdev *cgd)
{
if ((cgd->ident_data.capabilities1 & ATA_SUPPORT_DMA) &&
(cgd->inq_flags & SID_DMA))
softc->flags |= ADA_FLAG_CAN_DMA;
else
softc->flags &= ~ADA_FLAG_CAN_DMA;
if (cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) {
softc->flags |= ADA_FLAG_CAN_48BIT;
if (cgd->inq_flags & SID_DMA48)
softc->flags |= ADA_FLAG_CAN_DMA48;
else
softc->flags &= ~ADA_FLAG_CAN_DMA48;
} else
softc->flags &= ~(ADA_FLAG_CAN_48BIT | ADA_FLAG_CAN_DMA48);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_FLUSHCACHE)
softc->flags |= ADA_FLAG_CAN_FLUSHCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_FLUSHCACHE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_POWERMGT)
softc->flags |= ADA_FLAG_CAN_POWERMGT;
else
softc->flags &= ~ADA_FLAG_CAN_POWERMGT;
if ((cgd->ident_data.satacapabilities & ATA_SUPPORT_NCQ) &&
(cgd->inq_flags & SID_DMA) && (cgd->inq_flags & SID_CmdQue))
softc->flags |= ADA_FLAG_CAN_NCQ;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ;
if ((cgd->ident_data.support_dsm & ATA_SUPPORT_DSM_TRIM) &&
(cgd->inq_flags & SID_DMA)) {
softc->flags |= ADA_FLAG_CAN_TRIM;
softc->trim_max_ranges = TRIM_MAX_RANGES;
if (cgd->ident_data.max_dsm_blocks != 0) {
softc->trim_max_ranges =
min(cgd->ident_data.max_dsm_blocks *
ATA_DSM_BLK_RANGES, softc->trim_max_ranges);
}
/*
* If we can do RCVSND_FPDMA_QUEUED commands, we may be able
* to do NCQ trims, if we support trims at all. We also need
* support from the SIM to do things properly. Perhaps we
* should also check whether log 13 dword 0 bit 0 and dword 1
* bit 0 are set...
*/
if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
(softc->flags & ADA_FLAG_PIM_ATA_EXT) != 0 &&
(cgd->ident_data.satacapabilities2 &
ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
(softc->flags & ADA_FLAG_CAN_TRIM) != 0)
softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
else
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
} else
softc->flags &= ~(ADA_FLAG_CAN_TRIM | ADA_FLAG_CAN_NCQ_TRIM);
if (cgd->ident_data.support.command2 & ATA_SUPPORT_CFA)
softc->flags |= ADA_FLAG_CAN_CFA;
else
softc->flags &= ~ADA_FLAG_CAN_CFA;
/*
* Now that we've set the appropriate flags, setup the delete
* method.
*/
adasetdeletemethod(softc);
if ((cgd->ident_data.support.extension & ATA_SUPPORT_GENLOG)
&& ((softc->quirks & ADA_Q_LOG_BROKEN) == 0))
softc->flags |= ADA_FLAG_CAN_LOG;
else
softc->flags &= ~ADA_FLAG_CAN_LOG;
if ((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode = ADA_ZONE_HOST_AWARE;
else if (((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
|| (softc->quirks & ADA_Q_SMR_DM))
softc->zone_mode = ADA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = ADA_ZONE_NONE;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_LOOKAHEAD)
softc->flags |= ADA_FLAG_CAN_RAHEAD;
else
softc->flags &= ~ADA_FLAG_CAN_RAHEAD;
if (cgd->ident_data.support.command1 & ATA_SUPPORT_WRITECACHE)
softc->flags |= ADA_FLAG_CAN_WCACHE;
else
softc->flags &= ~ADA_FLAG_CAN_WCACHE;
}
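/*
 * Periph constructor: allocate the softc and I/O scheduler, apply quirks
 * and per-unit tunables, size and create the disk(9) provider from the
 * identify data, and kick off the optional RAHEAD/WCACHE/LOGDIR probe
 * states.
 */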
static cam_status
adaregister(struct cam_periph *periph, void *arg)
{
struct ada_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
struct disk_params *dp;
struct sbuf sb;
char *announce_buf;
caddr_t match;
u_int maxio;
int quirks;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("adaregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct ada_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
announce_buf = softc->announce_temp;
bzero(announce_buf, ADA_ANNOUNCETMP_SZ);
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->ident_data,
(caddr_t)ada_quirk_table,
nitems(ada_quirk_table),
sizeof(*ada_quirk_table), ata_identify_match);
if (match != NULL)
softc->quirks = ((struct ada_quirk_entry *)match)->quirks;
else
softc->quirks = ADA_Q_NONE;
xpt_path_inq(&cpi, periph->path);
TASK_INIT(&softc->sysctl_task, 0, adasysctlinit, periph);
/*
* Register this media as a disk
*/
(void)cam_periph_hold(periph, PRIBIO);
cam_periph_unlock(periph);
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.quirks", periph->unit_number);
quirks = softc->quirks;
TUNABLE_INT_FETCH(announce_buf, &quirks);
softc->quirks = quirks;
softc->read_ahead = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.read_ahead", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->read_ahead);
softc->write_cache = -1;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"kern.cam.ada.%d.write_cache", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
/*
* Set support flags based on the Identify data and quirks.
*/
adasetflags(softc, cgd);
/* Disable queue sorting for non-rotational media by default. */
if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) {
softc->rotating = 0;
} else {
softc->rotating = 1;
}
cam_iosched_set_sort_queue(softc->cam_iosched, softc->rotating ? -1 : 0);
adagetparams(periph, cgd);
softc->disk = disk_alloc();
softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, softc->params.secsize,
DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = adaopen;
softc->disk->d_close = adaclose;
softc->disk->d_strategy = adastrategy;
softc->disk->d_getattr = adagetattr;
softc->disk->d_dump = adadump;
softc->disk->d_gone = adadiskgonecb;
softc->disk->d_name = "ada";
softc->disk->d_drv1 = periph;
maxio = cpi.maxio; /* Honor max I/O size of SIM */
if (maxio == 0)
maxio = DFLTPHYS; /* traditional default */
else if (maxio > MAXPHYS)
maxio = MAXPHYS; /* for safety */
if (softc->flags & ADA_FLAG_CAN_48BIT)
maxio = min(maxio, 65536 * softc->params.secsize);
else /* 28bit ATA command limit */
maxio = min(maxio, 256 * softc->params.secsize);
softc->disk->d_maxsize = maxio;
softc->disk->d_unit = periph->unit_number;
softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE)
softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if (softc->flags & ADA_FLAG_CAN_TRIM) {
softc->disk->d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = softc->params.secsize *
ATA_DSM_RANGE_MAX *
softc->trim_max_ranges;
} else if ((softc->flags & ADA_FLAG_CAN_CFA) &&
!(softc->flags & ADA_FLAG_CAN_48BIT)) {
softc->disk->d_flags |= DISKFLAG_CANDELETE;
softc->disk->d_delmaxsize = 256 * softc->params.secsize;
} else
softc->disk->d_delmaxsize = maxio;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
softc->unmappedio = 1;
}
if (cpi.hba_misc & PIM_ATA_EXT)
softc->flags |= ADA_FLAG_PIM_ATA_EXT;
strlcpy(softc->disk->d_descr, cgd->ident_data.model,
MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model)));
strlcpy(softc->disk->d_ident, cgd->ident_data.serial,
MIN(sizeof(softc->disk->d_ident), sizeof(cgd->ident_data.serial)));
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = (off_t)softc->params.sectors *
softc->params.secsize;
if (ata_physical_sector_size(&cgd->ident_data) !=
softc->params.secsize) {
softc->disk->d_stripesize =
ata_physical_sector_size(&cgd->ident_data);
softc->disk->d_stripeoffset = (softc->disk->d_stripesize -
ata_logical_sector_offset(&cgd->ident_data)) %
softc->disk->d_stripesize;
} else if (softc->quirks & ADA_Q_4K) {
softc->disk->d_stripesize = 4096;
softc->disk->d_stripeoffset = 0;
}
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
ata_disk_firmware_geom_adjust(softc->disk);
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* adadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
dp = &softc->params;
snprintf(announce_buf, ADA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
sbuf_new(&sb, softc->announce_buffer, ADA_ANNOUNCE_SZ, SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, ADA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
if (cam_periph_acquire(periph) == CAM_REQ_CMP)
taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
/*
* Add async callbacks for bus reset and
* bus device reset calls. I don't bother
* checking if this fails as, in most cases,
* the system will function just fine without
* them and the only alternative would be to
* not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_GETDEV_CHANGED | AC_ADVINFO_CHANGED,
adaasync, periph, periph->path);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD) {
softc->state = ADA_STATE_RAHEAD;
} else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE) {
softc->state = ADA_STATE_WCACHE;
} else if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
softc->state = ADA_STATE_LOGDIR;
} else {
/*
* Nothing to probe, so we can just transition to the
* normal state.
*/
adaprobedone(periph, NULL);
return(CAM_REQ_CMP);
}
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
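/*
 * Build a DATA SET MANAGEMENT (TRIM) payload from the current BIO_DELETE
 * and any further delete requests the I/O scheduler hands us, as long as
 * they fit within trim_max_ranges.  Each 8-byte range entry is laid out as
 * bytes 0-5: 48-bit starting LBA (little endian), bytes 6-7: sector count
 * (at most ATA_DSM_RANGE_MAX); a request that starts exactly where the
 * previous one ended extends the previous range when possible.  Returns
 * the number of ranges filled in.
 */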
static int
ada_dsmtrim_req_create(struct ada_softc *softc, struct bio *bp, struct trim_request *req)
{
uint64_t lastlba = (uint64_t)-1;
int c, lastcount = 0, off, ranges = 0;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
do {
uint64_t lba = bp->bio_pblkno;
int count = bp->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = min(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * ATA_DSM_RANGE_SIZE;
req->data[off + 6] = lastcount & 0xff;
req->data[off + 7] =
(lastcount >> 8) & 0xff;
count -= c;
lba += c;
}
while (count > 0) {
c = min(count, ATA_DSM_RANGE_MAX);
off = ranges * ATA_DSM_RANGE_SIZE;
req->data[off + 0] = lba & 0xff;
req->data[off + 1] = (lba >> 8) & 0xff;
req->data[off + 2] = (lba >> 16) & 0xff;
req->data[off + 3] = (lba >> 24) & 0xff;
req->data[off + 4] = (lba >> 32) & 0xff;
req->data[off + 5] = (lba >> 40) & 0xff;
req->data[off + 6] = c & 0xff;
req->data[off + 7] = (c >> 8) & 0xff;
lba += c;
count -= c;
lastcount = c;
ranges++;
/*
* It's the caller's responsibility to ensure the
* request will fit, so we don't need to check for
* overrun here.
*/
}
lastlba = lba;
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
bp = cam_iosched_next_trim(softc->cam_iosched);
if (bp == NULL)
break;
if (bp->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp);
break;
}
} while (1);
return (ranges);
}
static void
ada_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_48bit_cmd(ataio, ATA_DATA_SET_MANAGEMENT,
ATA_DSM_TRIM, 0, howmany(ranges, ATA_DSM_BLK_RANGES));
}
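/*
 * Same as ada_dsmtrim(), but queue the payload as an NCQ command
 * (SEND FPDMA QUEUED with the DSM subcommand) so the trim can be
 * dispatched alongside other queued I/O.
 */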
static void
ada_ncq_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
int ranges;
ranges = ada_dsmtrim_req_create(softc, bp, req);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_OUT,
0,
req->data,
howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE,
ada_default_timeout * 1000);
ata_ncq_cmd(ataio,
ATA_SEND_FPDMA_QUEUED,
0,
howmany(ranges, ATA_DSM_BLK_RANGES));
ataio->cmd.sector_count_exp = ATA_SFPDMA_DSM;
ataio->ata_flags |= ATA_FLAG_AUX;
ataio->aux = 1;
}
static void
ada_cfaerase(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio)
{
struct trim_request *req = &softc->trim_req;
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
bzero(req, sizeof(*req));
TAILQ_INIT(&req->bps);
TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue);
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
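/*
 * In a 28-bit command a sector count of 0 means 256 sectors; CFA erase
 * requests are already capped at 256 sectors via d_delmaxsize in
 * adaregister().
 */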
if (count >= 256)
count = 0;
ata_28bit_cmd(ataio, ATA_CFA_ERASE, 0, lba, count);
}
static int
ada_zone_bio_to_ata(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ATA_ZM_OPEN_ZONE;
case DISK_ZONE_CLOSE:
return ATA_ZM_CLOSE_ZONE;
case DISK_ZONE_FINISH:
return ATA_ZM_FINISH_ZONE;
case DISK_ZONE_RWP:
return ATA_ZM_RWP;
}
return -1;
}
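/*
 * Translate a BIO_ZONE request into the corresponding ZAC management CCB
 * (open/close/finish/reset write pointer, or REPORT ZONES), or fill in the
 * zone parameters directly for DISK_ZONE_GET_PARAMS.  Sets *queue_ccb when
 * the CCB actually needs to be sent to the drive.
 */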
static int
ada_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct ada_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = ada_zone_bio_to_ata(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to ATA\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
ata_zac_mgmt_out(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*sector_count*/ 0,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*timeout*/ ada_default_timeout * 1000);
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_ATADA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
ata_zac_mgmt_in(&ccb->ataio,
/*retries*/ ada_retry_count,
/*cbfcnp*/ adadone,
/*use_ncq*/ (softc->flags &
ADA_FLAG_PIM_ATA_EXT) ? 1 : 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*timeout*/ ada_default_timeout * 1000);
/*
* For BIO_ZONE, bio_bcount isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case ADA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case ADA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case ADA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case ADA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & ADA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & ADA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & ADA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
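/*
 * Periph start routine, invoked with a CCB after xpt_schedule().
 * ADA_STATE_NORMAL issues buffered I/O, trims, flushes and zone commands
 * pulled from the I/O scheduler; the RAHEAD/WCACHE states apply the
 * read-ahead/write-cache settings; the LOGDIR/IDDIR/SUP_CAP/ZONE states
 * read the ATA logs used to probe zoned-device capabilities.
 */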
static void
adastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
struct ccb_ataio *ataio = &start_ccb->ataio;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastart\n"));
switch (softc->state) {
case ADA_STATE_NORMAL:
{
struct bio *bp;
u_int8_t tag_code;
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
xpt_release_ccb(start_ccb);
break;
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
softc->flags &= ~ADA_FLAG_NEED_OTAG;
softc->flags |= ADA_FLAG_WAS_OTAG;
tag_code = 0;
} else {
tag_code = 1;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
uint64_t lba = bp->bio_pblkno;
uint16_t count = bp->bio_bcount / softc->params.secsize;
void *data_ptr;
int rw_op;
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= ADA_FLAG_DIRTY;
rw_op = CAM_DIR_OUT;
} else {
rw_op = CAM_DIR_IN;
}
data_ptr = bp->bio_data;
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= CAM_DATA_BIO;
data_ptr = bp;
}
#ifdef ADA_TEST_FAILURE
int fail = 0;
/*
* Support the failure sysctls. If the command is a
* read and there are pending forced read errors, or if
* it is a write and there are pending forced write
* errors, then fail this operation with EIO. This is
* useful for testing
* purposes. Also, support having every Nth read fail.
*
* This is a rather blunt tool.
*/
if (bp->bio_cmd == BIO_READ) {
if (softc->force_read_error) {
softc->force_read_error--;
fail = 1;
}
if (softc->periodic_read_error > 0) {
if (++softc->periodic_read_count >=
softc->periodic_read_error) {
softc->periodic_read_count = 0;
fail = 1;
}
}
} else {
if (softc->force_write_error) {
softc->force_write_error--;
fail = 1;
}
}
if (fail) {
biofinish(bp, NULL, EIO);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
#endif
KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
round_page(bp->bio_bcount + bp->bio_ma_offset) /
PAGE_SIZE == bp->bio_ma_n,
("Short bio %p", bp));
cam_fill_ataio(ataio,
ada_retry_count,
adadone,
rw_op,
0,
data_ptr,
bp->bio_bcount,
ada_default_timeout*1000);
if ((softc->flags & ADA_FLAG_CAN_NCQ) && tag_code) {
if (bp->bio_cmd == BIO_READ) {
ata_ncq_cmd(ataio, ATA_READ_FPDMA_QUEUED,
lba, count);
} else {
ata_ncq_cmd(ataio, ATA_WRITE_FPDMA_QUEUED,
lba, count);
}
} else if ((softc->flags & ADA_FLAG_CAN_48BIT) &&
(lba + count >= ATA_MAX_28BIT_LBA ||
count > 256)) {
if (softc->flags & ADA_FLAG_CAN_DMA48) {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_DMA48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_DMA48,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_48bit_cmd(ataio, ATA_READ_MUL48,
0, lba, count);
} else {
ata_48bit_cmd(ataio, ATA_WRITE_MUL48,
0, lba, count);
}
}
} else {
if (count == 256)
count = 0;
if (softc->flags & ADA_FLAG_CAN_DMA) {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_DMA,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_DMA,
0, lba, count);
}
} else {
if (bp->bio_cmd == BIO_READ) {
ata_28bit_cmd(ataio, ATA_READ_MUL,
0, lba, count);
} else {
ata_28bit_cmd(ataio, ATA_WRITE_MUL,
0, lba, count);
}
}
}
break;
}
case BIO_DELETE:
switch (softc->delete_method) {
case ADA_DELETE_NCQ_DSM_TRIM:
ada_ncq_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_DSM_TRIM:
ada_dsmtrim(softc, bp, ataio);
break;
case ADA_DELETE_CFA_ERASE:
ada_cfaerase(softc, bp, ataio);
break;
default:
biofinish(bp, NULL, EOPNOTSUPP);
xpt_release_ccb(start_ccb);
adaschedule(periph);
return;
}
start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
goto out;
case BIO_FLUSH:
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(ataio, ATA_FLUSHCACHE, 0, 0, 0);
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = ada_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
}
start_ccb->ccb_h.ccb_state = ADA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
out:
start_ccb->ccb_h.ccb_bp = bp;
softc->outstanding_cmds++;
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
softc->refcount--;
/* May have more work to do, so ensure we stay scheduled */
adaschedule(periph);
break;
}
case ADA_STATE_RAHEAD:
case ADA_STATE_WCACHE:
{
cam_fill_ataio(ataio,
1,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->state == ADA_STATE_RAHEAD) {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_RA ?
ATA_SF_ENAB_RCACHE : ATA_SF_DIS_RCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_RAHEAD;
} else {
ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_WC ?
ATA_SF_ENAB_WCACHE : ATA_SF_DIS_WCACHE, 0, 0);
start_ccb->ccb_h.ccb_state = ADA_CCB_WCACHE;
}
start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
xpt_action(start_ccb);
break;
}
case ADA_STATE_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
if ((softc->flags & ADA_FLAG_CAN_LOG) == 0) {
adaprobedone(periph, start_ccb);
break;
}
log_dir = malloc(sizeof(*log_dir), M_ATADA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
softc->state = ADA_STATE_NORMAL;
xpt_release_ccb(start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/sizeof(*log_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_LOGDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_IDDIR:
{
struct ata_identify_log_pages *id_dir;
id_dir = malloc(sizeof(*id_dir), M_ATADA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_IDDIR;
xpt_action(start_ccb);
break;
}
case ADA_STATE_SUP_CAP:
{
struct ata_identify_log_sup_cap *sup_cap;
sup_cap = malloc(sizeof(*sup_cap), M_ATADA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_SUP_CAP;
xpt_action(start_ccb);
break;
}
case ADA_STATE_ZONE:
{
struct ata_zoned_info_log *ata_zone;
ata_zone = malloc(sizeof(*ata_zone), M_ATADA, M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
adaprobedone(periph, start_ccb);
break;
}
ata_read_log(ataio,
/*retries*/1,
/*cbfcnp*/adadone,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ?
CAM_ATAIO_DMA : 0,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*timeout*/ada_default_timeout*1000);
start_ccb->ccb_h.ccb_state = ADA_CCB_ZONE;
xpt_action(start_ccb);
break;
}
}
}
static void
adaprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct ada_softc *softc;
softc = (struct ada_softc *)periph->softc;
if (ccb != NULL)
xpt_release_ccb(ccb);
softc->state = ADA_STATE_NORMAL;
softc->flags |= ADA_FLAG_PROBED;
adaschedule(periph);
if ((softc->flags & ADA_FLAG_ANNOUNCED) == 0) {
softc->flags |= ADA_FLAG_ANNOUNCED;
cam_periph_unhold(periph);
} else {
cam_periph_release_locked(periph);
}
}
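/*
 * Completion handling for zone CCBs.  For REPORT ZONES, convert the
 * ATA/SCSI report header and descriptors into the disk_zone_rep_entry
 * structures the upper layers expect and free the report buffer; the other
 * zone commands need no translation.
 */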
static void
adazonedone(struct cam_periph *periph, union ccb *ccb)
{
- struct ada_softc *softc;
struct bio *bp;
- softc = periph->softc;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
- uint32_t num_alloced, hdr_len, num_avail;
+ uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->ataio.dxfer_len - ccb->ataio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
- num_alloced = rep->entries_allocated;
hdr = (struct scsi_report_zones_hdr *)ccb->ataio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
hdr_len = le32dec(hdr->length);
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = le64dec(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
bp->bio_resid = bp->bio_bcount;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
* definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length = le64dec(desc->zone_length);
entry->zone_start_lba = le64dec(desc->zone_start_lba);
entry->write_pointer_lba =
le64dec(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
/*
* Note that this residual is accurate from the user's
* standpoint, but the amount transferred isn't accurate
* from the standpoint of what actually came back from the
* drive.
*/
bp->bio_resid = bp->bio_bcount - (num_to_fill * sizeof(*entry));
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->ataio.data_ptr, M_ATADA);
}
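/*
 * Main CCB completion callback.  Dispatches on the ADA_CCB_* state stored
 * in ccb_state: I/O and trim completions are handed back to the I/O
 * scheduler and biodone()d, while the probe states (RAHEAD, WCACHE, LOGDIR,
 * IDDIR, ...) parse their results and advance the probe state machine.
 */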
static void
adadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct ada_softc *softc;
struct ccb_ataio *ataio;
struct cam_path *path;
uint32_t priority;
int state;
softc = (struct ada_softc *)periph->softc;
ataio = &done_ccb->ataio;
path = done_ccb->ccb_h.path;
priority = done_ccb->ccb_h.pinfo.priority;
CAM_DEBUG(path, CAM_DEBUG_TRACE, ("adadone\n"));
state = ataio->ccb_h.ccb_state & ADA_CCB_TYPE_MASK;
switch (state) {
case ADA_CCB_BUFFER_IO:
case ADA_CCB_TRIM:
{
struct bio *bp;
int error;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
error = adaerror(done_ccb, 0, 0);
if (error == ERESTART) {
/* A retry was scheduled, so just return. */
cam_periph_unlock(periph);
return;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
/*
* If we get an error on an NCQ DSM TRIM, fall back
* to a non-NCQ DSM TRIM forever. Please note that if
* CAN_NCQ_TRIM is set, CAN_TRIM is necessarily set too.
* However, for this one trim, we treat it as advisory
* and return success up the stack.
*/
if (state == ADA_CCB_TRIM &&
error != 0 &&
(softc->flags & ADA_FLAG_CAN_NCQ_TRIM) != 0) {
softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM;
error = 0;
adasetdeletemethod(softc);
}
} else {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
error = 0;
}
bp->bio_error = error;
if (error != 0) {
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
} else {
if (bp->bio_cmd == BIO_ZONE)
adazonedone(periph, done_ccb);
else if (state == ADA_CCB_TRIM)
bp->bio_resid = 0;
else
bp->bio_resid = ataio->resid;
if ((bp->bio_resid > 0)
&& (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
}
softc->outstanding_cmds--;
if (softc->outstanding_cmds == 0)
softc->flags |= ADA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
if (state == ADA_CCB_TRIM) {
TAILQ_HEAD(, bio) queue;
struct bio *bp1;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue);
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* trim_running set to 0 before the call above to allow
* other I/O to progress when many BIO_DELETE requests
* are pushed down. We set trim_running to 0 and call
* adaschedule again so that we don't stall if there are
* no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
adaschedule(periph);
cam_periph_unlock(periph);
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = error;
if (error != 0) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
adaschedule(periph);
cam_periph_unlock(periph);
biodone(bp);
}
return;
}
case ADA_CCB_RAHEAD:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the reference on the peripheral.
* The peripheral will only go away once the last reference
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_WCACHE;
xpt_schedule(periph, priority);
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
}
case ADA_CCB_WCACHE:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (adaerror(done_ccb, 0, 0) == ERESTART) {
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
return;
} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
cam_release_devq(path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
/* Drop freeze taken due to CAM_DEV_QFREEZE */
cam_release_devq(path, 0, 0, 0, FALSE);
if ((softc->flags & ADA_FLAG_CAN_LOG)
&& (softc->zone_mode != ADA_ZONE_NONE)) {
xpt_release_ccb(done_ccb);
softc->state = ADA_STATE_LOGDIR;
xpt_schedule(periph, priority);
} else {
adaprobedone(periph, done_ccb);
}
return;
}
case ADA_CCB_LOGDIR:
{
int error;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_logdir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= ADA_FLAG_CAN_IDLOG;
} else {
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(ADA_FLAG_CAN_LOG |
ADA_FLAG_CAN_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_IDLOG)) {
softc->state = ADA_STATE_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_IDDIR: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(ADA_FLAG_CAN_SUPCAP |
ADA_FLAG_CAN_ZONE);
softc->valid_iddir_len =
ataio->dxfer_len - ataio->resid;
if (softc->valid_iddir_len > 0)
bcopy(ataio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages,entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) ==
ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries &&
i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |=
ADA_FLAG_CAN_SUPCAP;
else if (softc->ata_iddir.entries[i]==
ATA_IDL_ZDI)
softc->flags |=
ADA_FLAG_CAN_ZONE;
if ((softc->flags &
ADA_FLAG_CAN_SUPCAP)
&& (softc->flags &
ADA_FLAG_CAN_ZONE))
break;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory
* lists a non-zero number of pages present for
* this log.
*/
softc->flags &= ~ADA_FLAG_CAN_IDLOG;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_SUPCAP)) {
softc->state = ADA_STATE_SUP_CAP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_SUP_CAP: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)
ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size =
__offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode =
ADA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
ADA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
ADA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |=
ADA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~ADA_FLAG_CAN_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~ADA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
if ((error == 0)
&& (softc->flags & ADA_FLAG_CAN_ZONE)) {
softc->state = ADA_STATE_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
} else
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_ZONE: {
int error;
if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)ataio->data_ptr;
valid_len = ataio->dxfer_len - ataio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
ADA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~ADA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar = le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |=
ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &=
~ADA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = adaerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~ADA_FLAG_CAN_ZONE;
softc->flags &= ~ADA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ataio->data_ptr, M_ATADA);
adaprobedone(periph, done_ccb);
return;
}
case ADA_CCB_DUMP:
/* No-op. We're polling */
return;
default:
break;
}
xpt_release_ccb(done_ccb);
}
static int
adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
#ifdef CAM_IO_STATS
struct ada_softc *softc;
struct cam_periph *periph;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct ada_softc *)periph->softc;
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
case CAM_ATA_STATUS_ERROR:
softc->errors++;
break;
default:
break;
}
#endif
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
adagetparams(struct cam_periph *periph, struct ccb_getdev *cgd)
{
struct ada_softc *softc = (struct ada_softc *)periph->softc;
struct disk_params *dp = &softc->params;
u_int64_t lbasize48;
u_int32_t lbasize;
dp->secsize = ata_logical_sector_size(&cgd->ident_data);
if ((cgd->ident_data.atavalid & ATA_FLAG_54_58) &&
cgd->ident_data.current_heads && cgd->ident_data.current_sectors) {
dp->heads = cgd->ident_data.current_heads;
dp->secs_per_track = cgd->ident_data.current_sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = (u_int32_t)cgd->ident_data.current_size_1 |
((u_int32_t)cgd->ident_data.current_size_2 << 16);
} else {
dp->heads = cgd->ident_data.heads;
dp->secs_per_track = cgd->ident_data.sectors;
dp->cylinders = cgd->ident_data.cylinders;
dp->sectors = cgd->ident_data.cylinders *
(u_int32_t)(dp->heads * dp->secs_per_track);
}
lbasize = (u_int32_t)cgd->ident_data.lba_size_1 |
((u_int32_t)cgd->ident_data.lba_size_2 << 16);
/* use the 28bit LBA size if valid or bigger than the CHS mapping */
if (cgd->ident_data.cylinders == 16383 || dp->sectors < lbasize)
dp->sectors = lbasize;
/* use the 48bit LBA size if valid */
lbasize48 = ((u_int64_t)cgd->ident_data.lba_size48_1) |
((u_int64_t)cgd->ident_data.lba_size48_2 << 16) |
((u_int64_t)cgd->ident_data.lba_size48_3 << 32) |
((u_int64_t)cgd->ident_data.lba_size48_4 << 48);
if ((cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) &&
lbasize48 > ATA_MAX_28BIT_LBA)
dp->sectors = lbasize48;
}
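/*
 * Worked example of the 48-bit size assembly above (hypothetical drive, not
 * taken from any real identify data): a 2 TiB disk with 512-byte sectors
 * reports 4294967296 (0x100000000) sectors, so the identify words would be
 *
 *	lba_size48_1 = 0x0000, lba_size48_2 = 0x0000,
 *	lba_size48_3 = 0x0001, lba_size48_4 = 0x0000
 *	lbasize48    = (uint64_t)0x0001 << 32 = 4294967296
 *
 * Since that exceeds ATA_MAX_28BIT_LBA and the drive advertises
 * ATA_SUPPORT_ADDRESS48, adagetparams() takes the 48-bit count for
 * dp->sectors.
 */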
static void
adasendorderedtag(void *arg)
{
struct ada_softc *softc = arg;
if (ada_send_ordered) {
if (softc->outstanding_cmds > 0) {
if ((softc->flags & ADA_FLAG_WAS_OTAG) == 0)
softc->flags |= ADA_FLAG_NEED_OTAG;
softc->flags &= ~ADA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL,
adasendorderedtag, softc);
}
/*
* Step through all ADA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
adaflush(void)
{
struct cam_periph *periph;
struct ada_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
softc = (struct ada_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & ADA_FLAG_OPEN)) {
adadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & ADA_FLAG_OPEN) == 0) ||
(softc->flags & ADA_FLAG_CAN_FLUSHCACHE) == 0) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
cam_fill_ataio(&ccb->ataio,
0,
adadone,
CAM_DIR_NONE,
0,
NULL,
0,
ada_default_timeout*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0);
else
ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0);
error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
static void
adaspindown(uint8_t cmd, int flags)
{
struct cam_periph *periph;
struct ada_softc *softc;
struct ccb_ataio local_ccb;
int error;
CAM_PERIPH_FOREACH(periph, &adadriver) {
/* If we panicked with the lock held, do not recurse here. */
if (cam_periph_owned(periph))
continue;
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "spin-down\n");
memset(&local_ccb, 0, sizeof(local_ccb));
xpt_setup_ccb(&local_ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
local_ccb.ccb_h.ccb_state = ADA_CCB_DUMP;
cam_fill_ataio(&local_ccb,
0,
adadone,
CAM_DIR_NONE | flags,
0,
NULL,
0,
ada_default_timeout*1000);
ata_28bit_cmd(&local_ccb, cmd, 0, 0, 0);
error = cam_periph_runccb((union ccb *)&local_ccb, adaerror,
/*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Spin-down disk failed\n");
cam_periph_unlock(periph);
}
}
static void
adashutdown(void *arg, int howto)
{
int how;
adaflush();
/*
* STANDBY IMMEDIATE saves any volatile data to the drive. It also spins
* down hard drives. IDLE IMMEDIATE also saves the volatile data without
* a spindown. We send the former when we expect to lose power soon. For
* a warm boot, we send the latter to avoid a thundering herd of spinups
* just after the kernel loads while probing. We have to do something to
* flush the data because the BIOS in many systems resets the HBA
* causing a COMINIT/COMRESET negotiation, which some drives interpret
* as license to toss the volatile data, and which others count as an
* unclean shutdown in their SMART attributes when in the Active PM state.
*
* adaspindown will ensure that we don't send this to a drive that
* doesn't support it.
*/
if (ada_spindown_shutdown != 0) {
how = (howto & (RB_HALT | RB_POWEROFF | RB_POWERCYCLE)) ?
ATA_STANDBY_IMMEDIATE : ATA_IDLE_IMMEDIATE;
adaspindown(how, 0);
}
}
static void
adasuspend(void *arg)
{
adaflush();
/*
* SLEEP also flushes any volatile data, like STANDBY IMMEDIATE,
* so we don't need to send it as well.
*/
if (ada_spindown_suspend != 0)
adaspindown(ATA_SLEEP, CAM_DEV_QFREEZE);
}
static void
adaresume(void *arg)
{
struct cam_periph *periph;
struct ada_softc *softc;
if (ada_spindown_suspend == 0)
return;
CAM_PERIPH_FOREACH(periph, &adadriver) {
cam_periph_lock(periph);
softc = (struct ada_softc *)periph->softc;
/*
* We only spin down the drive if it is capable of it.
*/
if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) {
cam_periph_unlock(periph);
continue;
}
if (bootverbose)
xpt_print(periph->path, "resume\n");
/*
* Drop freeze taken due to CAM_DEV_QFREEZE flag set on
* sleep request.
*/
cam_release_devq(periph->path,
/*relsim_flags*/0,
/*openings*/0,
/*timeout*/0,
/*getcount_only*/0);
cam_periph_unlock(periph);
}
}
#endif /* _KERNEL */
Index: head/sys/cam/scsi/scsi_da.c
===================================================================
--- head/sys/cam/scsi/scsi_da.c (revision 327172)
+++ head/sys/cam/scsi/scsi_da.c (revision 327173)
@@ -1,6056 +1,6053 @@
/*-
* Implementation of SCSI Direct Access Peripheral driver for CAM.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 Justin T. Gibbs.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/cons.h>
#include <sys/endian.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#endif /* _KERNEL */
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#endif /* _KERNEL */
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_sim.h>
#include <cam/cam_iosched.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/scsi_da.h>
#ifdef _KERNEL
/*
* Note that there are probe ordering dependencies here. The order isn't
* controlled by this enumeration, but by explicit state transitions in
* dastart() and dadone(). Here are some of the dependencies:
*
* 1. RC should come first, before RC16, unless there is evidence that RC16
* is supported.
* 2. BDC needs to come before any of the ATA probes, or the ZONE probe.
* 3. The ATA probes should go in this order:
* ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE
*/
typedef enum {
DA_STATE_PROBE_RC,
DA_STATE_PROBE_RC16,
DA_STATE_PROBE_LBP,
DA_STATE_PROBE_BLK_LIMITS,
DA_STATE_PROBE_BDC,
DA_STATE_PROBE_ATA,
DA_STATE_PROBE_ATA_LOGDIR,
DA_STATE_PROBE_ATA_IDDIR,
DA_STATE_PROBE_ATA_SUP,
DA_STATE_PROBE_ATA_ZONE,
DA_STATE_PROBE_ZONE,
DA_STATE_NORMAL
} da_state;
typedef enum {
DA_FLAG_PACK_INVALID = 0x000001,
DA_FLAG_NEW_PACK = 0x000002,
DA_FLAG_PACK_LOCKED = 0x000004,
DA_FLAG_PACK_REMOVABLE = 0x000008,
DA_FLAG_NEED_OTAG = 0x000020,
DA_FLAG_WAS_OTAG = 0x000040,
DA_FLAG_RETRY_UA = 0x000080,
DA_FLAG_OPEN = 0x000100,
DA_FLAG_SCTX_INIT = 0x000200,
DA_FLAG_CAN_RC16 = 0x000400,
DA_FLAG_PROBED = 0x000800,
DA_FLAG_DIRTY = 0x001000,
DA_FLAG_ANNOUNCED = 0x002000,
DA_FLAG_CAN_ATA_DMA = 0x004000,
DA_FLAG_CAN_ATA_LOG = 0x008000,
DA_FLAG_CAN_ATA_IDLOG = 0x010000,
DA_FLAG_CAN_ATA_SUPCAP = 0x020000,
DA_FLAG_CAN_ATA_ZONE = 0x040000
} da_flags;
typedef enum {
DA_Q_NONE = 0x00,
DA_Q_NO_SYNC_CACHE = 0x01,
DA_Q_NO_6_BYTE = 0x02,
DA_Q_NO_PREVENT = 0x04,
DA_Q_4K = 0x08,
DA_Q_NO_RC16 = 0x10,
DA_Q_NO_UNMAP = 0x20,
DA_Q_RETRY_BUSY = 0x40,
DA_Q_SMR_DM = 0x80,
DA_Q_STRICT_UNMAP = 0x100
} da_quirks;
#define DA_Q_BIT_STRING \
"\020" \
"\001NO_SYNC_CACHE" \
"\002NO_6_BYTE" \
"\003NO_PREVENT" \
"\0044K" \
"\005NO_RC16" \
"\006NO_UNMAP" \
"\007RETRY_BUSY" \
"\010SMR_DM" \
"\011STRICT_UNMAP"
typedef enum {
DA_CCB_PROBE_RC = 0x01,
DA_CCB_PROBE_RC16 = 0x02,
DA_CCB_PROBE_LBP = 0x03,
DA_CCB_PROBE_BLK_LIMITS = 0x04,
DA_CCB_PROBE_BDC = 0x05,
DA_CCB_PROBE_ATA = 0x06,
DA_CCB_BUFFER_IO = 0x07,
DA_CCB_DUMP = 0x0A,
DA_CCB_DELETE = 0x0B,
DA_CCB_TUR = 0x0C,
DA_CCB_PROBE_ZONE = 0x0D,
DA_CCB_PROBE_ATA_LOGDIR = 0x0E,
DA_CCB_PROBE_ATA_IDDIR = 0x0F,
DA_CCB_PROBE_ATA_SUP = 0x10,
DA_CCB_PROBE_ATA_ZONE = 0x11,
DA_CCB_TYPE_MASK = 0x1F,
DA_CCB_RETRY_UA = 0x20
} da_ccb_state;
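/*
 * Minimal sketch of how this encoding is meant to be used (hypothetical
 * lines, not copied from dastart()/dadone()): the low bits carry one
 * DA_CCB_* type and DA_CCB_RETRY_UA is OR'd in as a flag, e.g.
 *
 *	ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO | DA_CCB_RETRY_UA;
 *	state = ccb->ccb_h.ccb_state & DA_CCB_TYPE_MASK;    -> DA_CCB_BUFFER_IO
 *	retry_ua = (ccb->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0;
 */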
/*
* Order here is important for method choice
*
* We prefer ATA_TRIM as tests run against a Sandforce 2281 SSD attached to an
* LSI 2008 (mps) controller (FW: v12, Drv: v14) resulted in 20% quicker deletes
* using ATA_TRIM than the corresponding UNMAP results for a real-world MySQL
* import taking 5 minutes.
*
*/
typedef enum {
DA_DELETE_NONE,
DA_DELETE_DISABLE,
DA_DELETE_ATA_TRIM,
DA_DELETE_UNMAP,
DA_DELETE_WS16,
DA_DELETE_WS10,
DA_DELETE_ZERO,
DA_DELETE_MIN = DA_DELETE_ATA_TRIM,
DA_DELETE_MAX = DA_DELETE_ZERO
} da_delete_methods;
/*
* For SCSI, host managed drives show up as a separate device type. For
* ATA, host managed drives also have a different device signature.
* XXX KDM figure out the ATA host managed signature.
*/
typedef enum {
DA_ZONE_NONE = 0x00,
DA_ZONE_DRIVE_MANAGED = 0x01,
DA_ZONE_HOST_AWARE = 0x02,
DA_ZONE_HOST_MANAGED = 0x03
} da_zone_mode;
/*
* We distinguish between these interface cases in addition to the drive type:
* o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC
* o ATA drive behind a SCSI translation layer that does not know about
* ZBC/ZAC, and so needs to be managed via ATA passthrough. In this
* case, we would need to share the ATA code with the ada(4) driver.
* o SCSI drive.
*/
typedef enum {
DA_ZONE_IF_SCSI,
DA_ZONE_IF_ATA_PASS,
DA_ZONE_IF_ATA_SAT,
} da_zone_interface;
typedef enum {
DA_ZONE_FLAG_RZ_SUP = 0x0001,
DA_ZONE_FLAG_OPEN_SUP = 0x0002,
DA_ZONE_FLAG_CLOSE_SUP = 0x0004,
DA_ZONE_FLAG_FINISH_SUP = 0x0008,
DA_ZONE_FLAG_RWP_SUP = 0x0010,
DA_ZONE_FLAG_SUP_MASK = (DA_ZONE_FLAG_RZ_SUP |
DA_ZONE_FLAG_OPEN_SUP |
DA_ZONE_FLAG_CLOSE_SUP |
DA_ZONE_FLAG_FINISH_SUP |
DA_ZONE_FLAG_RWP_SUP),
DA_ZONE_FLAG_URSWRZ = 0x0020,
DA_ZONE_FLAG_OPT_SEQ_SET = 0x0040,
DA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080,
DA_ZONE_FLAG_MAX_SEQ_SET = 0x0100,
DA_ZONE_FLAG_SET_MASK = (DA_ZONE_FLAG_OPT_SEQ_SET |
DA_ZONE_FLAG_OPT_NONSEQ_SET |
DA_ZONE_FLAG_MAX_SEQ_SET)
} da_zone_flags;
static struct da_zone_desc {
da_zone_flags value;
const char *desc;
} da_zone_desc_table[] = {
{DA_ZONE_FLAG_RZ_SUP, "Report Zones" },
{DA_ZONE_FLAG_OPEN_SUP, "Open" },
{DA_ZONE_FLAG_CLOSE_SUP, "Close" },
{DA_ZONE_FLAG_FINISH_SUP, "Finish" },
{DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};
typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb,
struct bio *bp);
static da_delete_func_t da_delete_trim;
static da_delete_func_t da_delete_unmap;
static da_delete_func_t da_delete_ws;
static const void * da_delete_functions[] = {
NULL,
NULL,
da_delete_trim,
da_delete_unmap,
da_delete_ws,
da_delete_ws,
da_delete_ws
};
static const char *da_delete_method_names[] =
{ "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" };
static const char *da_delete_method_desc[] =
{ "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP",
"WRITE SAME(10) with UNMAP", "ZERO" };
/* Offsets into our private area for storing information */
#define ccb_state ppriv_field0
#define ccb_bp ppriv_ptr1
struct disk_params {
u_int8_t heads;
u_int32_t cylinders;
u_int8_t secs_per_track;
u_int32_t secsize; /* Number of bytes/sector */
u_int64_t sectors; /* total number sectors */
u_int stripesize;
u_int stripeoffset;
};
#define UNMAP_RANGE_MAX 0xffffffff
#define UNMAP_HEAD_SIZE 8
#define UNMAP_RANGE_SIZE 16
#define UNMAP_MAX_RANGES 2048 /* Protocol Max is 4095 */
#define UNMAP_BUF_SIZE ((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \
UNMAP_HEAD_SIZE)
#define WS10_MAX_BLKS 0xffff
#define WS16_MAX_BLKS 0xffffffff
#define ATA_TRIM_MAX_RANGES ((UNMAP_BUF_SIZE / \
(ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE)
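/*
 * Worked example of the sizing above, assuming the usual sys/ata.h values of
 * ATA_DSM_RANGE_SIZE = 8 and ATA_DSM_BLK_SIZE = 512 (check the headers in
 * your tree):
 *
 *	UNMAP_BUF_SIZE      = (2048 * 16) + 8 = 32776 bytes
 *	ATA_TRIM_MAX_RANGES = (32776 / (8 * 512)) * 512 = 8 * 512 = 4096
 *
 * i.e. one unmap_buf holds up to 2048 UNMAP descriptors, or 4096 eight-byte
 * DSM TRIM range entries packed 64 per 512-byte DSM block.
 */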
#define DA_WORK_TUR (1 << 16)
struct da_softc {
struct cam_iosched_softc *cam_iosched;
struct bio_queue_head delete_run_queue;
LIST_HEAD(, ccb_hdr) pending_ccbs;
int refcount; /* Active xpt_action() calls */
da_state state;
da_flags flags;
da_quirks quirks;
int minimum_cmd_size;
int error_inject;
int trim_max_ranges;
int delete_available; /* Delete methods possibly available */
da_zone_mode zone_mode;
da_zone_interface zone_interface;
da_zone_flags zone_flags;
struct ata_gp_log_dir ata_logdir;
int valid_logdir_len;
struct ata_identify_log_pages ata_iddir;
int valid_iddir_len;
uint64_t optimal_seq_zones;
uint64_t optimal_nonseq_zones;
uint64_t max_seq_zones;
u_int maxio;
uint32_t unmap_max_ranges;
uint32_t unmap_max_lba; /* Max LBAs in UNMAP req */
uint32_t unmap_gran;
uint32_t unmap_gran_align;
uint64_t ws_max_blks;
da_delete_methods delete_method_pref;
da_delete_methods delete_method;
da_delete_func_t *delete_func;
int unmappedio;
int rotating;
struct disk_params params;
struct disk *disk;
union ccb saved_ccb;
struct task sysctl_task;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
uint64_t wwpn;
uint8_t unmap_buf[UNMAP_BUF_SIZE];
struct scsi_read_capacity_data_long rcaplong;
struct callout mediapoll_c;
#ifdef CAM_IO_STATS
struct sysctl_ctx_list sysctl_stats_ctx;
struct sysctl_oid *sysctl_stats_tree;
u_int errors;
u_int timeouts;
u_int invalidations;
#endif
#define DA_ANNOUNCETMP_SZ 80
char announce_temp[DA_ANNOUNCETMP_SZ];
#define DA_ANNOUNCE_SZ 400
char announcebuf[DA_ANNOUNCE_SZ];
};
#define dadeleteflag(softc, delete_method, enable) \
if (enable) { \
softc->delete_available |= (1 << delete_method); \
} else { \
softc->delete_available &= ~(1 << delete_method); \
}
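/*
 * Usage sketch (hypothetical call sites): dadeleteflag() keeps one bit per
 * delete method in softc->delete_available, e.g.
 *
 *	dadeleteflag(softc, DA_DELETE_UNMAP, 1);  sets   (1 << DA_DELETE_UNMAP)
 *	dadeleteflag(softc, DA_DELETE_WS16, 0);   clears (1 << DA_DELETE_WS16)
 */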
struct da_quirk_entry {
struct scsi_inquiry_pattern inq_pat;
da_quirks quirks;
};
static const char quantum[] = "QUANTUM";
static const char microp[] = "MICROP";
static struct da_quirk_entry da_quirk_table[] =
{
/* SPI, FC devices */
{
/*
* Fujitsu M2513A MO drives.
* Tested devices: M2513A2 firmware versions 1200 & 1300.
* (dip switch selects whether T_DIRECT or T_OPTICAL device)
* Reported by: W.Scholten <whs@xs4all.nl>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/* See above. */
{T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This particular Fujitsu drive doesn't like the
* synchronize cache command.
* Reported by: Tom Jackson <toj@gorilla.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Matthew Jacob <mjacob@feral.com>
* in NetBSD PR kern/6027, August 24, 1998.
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* This drive doesn't like the synchronize cache command
* either. Reported by: Hellmuth Michaelis (hm@kts.org)
* (PR 8882).
*/
{T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: Blaz Zupan <blaz@gold.amis.net>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Doesn't work correctly with 6 byte reads/writes.
* Returns illegal request, and points to byte 9 of the
* 6-byte CDB.
* Reported by: Adam McDougall <bsdx@spawnet.com>
*/
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/* See above. */
{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"},
/*quirks*/ DA_Q_NO_6_BYTE
},
{
/*
* Doesn't like the synchronize cache command.
* Reported by: walter@pelissero.de
*/
{T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The CISS RAID controllers do not support SYNC_CACHE
*/
{T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* The STEC SSDs sometimes hang on UNMAP.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"},
/*quirks*/ DA_Q_NO_UNMAP
},
{
/*
* VMware returns BUSY status when storage has transient
* connectivity problems, so better wait.
* Also VMware returns odd errors on misaligned UNMAPs.
*/
{T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"},
/*quirks*/ DA_Q_RETRY_BUSY | DA_Q_STRICT_UNMAP
},
/* USB mass storage devices supported by umass(4) */
{
/*
* EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player
* PR: kern/51675
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Power Quotient Int. (PQI) USB flash key
* PR: kern/53067
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative Nomad MUVO mp3 player (USB)
* PR: kern/53094
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Jungsoft NEXDISK USB flash key
* PR: kern/54737
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* FreeDik USB Mini Data Drive
* PR: kern/54786
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sigmatel USB Flash MP3 Player
* PR: kern/57046
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* Neuros USB Digital Audio Computer
* PR: kern/63645
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SEAGRAND NP-900 MP3 Player
* PR: kern/64563
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* iRiver iFP MP3 player (with UMS Firmware)
* PR: kern/54881, i386/63941, kern/66124
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01
* PR: kern/70158
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ZICPlay USB MP3 Player with FM
* PR: kern/75057
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TEAC USB floppy mechanisms
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler II+ USB Pen-Drive.
* Reported by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* USB DISK Pro PMAP
* Reported by: jhs
* PR: usb/96381
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Motorola E398 Mobile Phone (TransFlash memory card).
* Reported by: Wojciech A. Koszek <dunstan@FreeBSD.czest.pl>
* PR: usb/89889
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Qware BeatZkey! Pro
* PR: usb/79164
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Time DPA20B 1GB MP3 Player
* PR: usb/81846
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung USB key 128Mb
* PR: usb/90081
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Kingston DataTraveler 2.0 USB Flash memory.
* PR: usb/89196
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Creative MUVO Slim mp3 player (USB)
* PR: usb/86131
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
},
{
/*
* United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3)
* PR: usb/80487
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SanDisk Micro Cruzer 128MB
* PR: usb/75970
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* TOSHIBA TransMemory USB sticks
* PR: kern/94660
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* PNY USB 3.0 Flash Drives
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16
},
{
/*
* PNY USB Flash keys
* PR: usb/75578, usb/72344, usb/65436
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Genesys GL3224
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"120?"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_4K | DA_Q_NO_RC16
},
{
/*
* Genesys 6-in-1 Card Reader
* PR: usb/94647
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Rekam Digital CAMERA
* PR: usb/98713
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver H10 MP3 player
* PR: usb/102547
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* iRiver U10 MP3 player
* PR: usb/92306
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* X-Micro Flash Disk
* PR: usb/96901
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* EasyMP3 EM732X USB 2.0 Flash MP3 Player
* PR: usb/96546
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*",
"1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Denver MP3 player
* PR: usb/107101
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Philips USB Key Audio KEY013
* PR: usb/68412
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
/*
* JNC MP3 Player
* PR: usb/94439
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* SAMSUNG MP0402H
* PR: usb/108427
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* I/O Magic USB flash - Giga Bank
* PR: usb/108810
*/
{T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* JoyFly 128mb USB Flash Drive
* PR: 96133
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* ChipsBnk usb stick
* PR: 103702
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A
* PR: 129858
*/
{T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Samsung YP-U3 mp3-player
* PR: 125398
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*",
"2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Sony Cyber-Shot DSC cameras
* PR: usb/137035
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
},
{
{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3",
"1.00"}, /*quirks*/ DA_Q_NO_PREVENT
},
{
/* At least several Transcend USB sticks lie about RC16. */
{T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
{
/*
* I-O Data USB Flash Disk
* PR: usb/211716
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*",
"*"}, /*quirks*/ DA_Q_NO_RC16
},
/* ATA/SATA devices over SAS/USB/... */
{
/* Hitachi Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Micron Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Micron 5100 MTFDDAK*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Samsung Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Barracuda Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* Seagate Momentus Thin Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Caviar Green Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Black Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/* WDC Scorpio Blue Advanced Format (4k) drives */
{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Olympus FE-210 camera
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* LG UP3S MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* Laser MP3-2GA13 MP3 player
*/
{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk",
"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
},
{
/*
* LaCie external 250GB hard drive, design by Porsche
* Submitted by: Ben Stuyts <ben@altesco.nl>
* PR: 121474
*/
{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"},
/*quirks*/ DA_Q_NO_SYNC_CACHE
},
/* SATA SSDs */
{
/*
* Corsair Force 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Neutron GTX SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Corsair Force GT & GS SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial M4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*",
"*" }, /*quirks*/DA_Q_4K
},
{
/*
* Intel 320 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 330 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 510 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel 520 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel S3610 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BX*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Intel X25-M Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston E100 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Kingston HyperX 3k SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Marvell SSDs (entry taken from OpenSolaris)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Agility 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Deneva R Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 2 SSDs (inc pro series)
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 3 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* OCZ Vertex 4 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 750 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 750*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 830 Series SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 840 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 845 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 845*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 850 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Samsung 843T Series SSDs (MZ7WD*)
* Samsung PM851 Series SSDs (MZ7TE*)
* Samsung PM853T Series SSDs (MZ7GE*)
* Samsung SM863 Series SSDs (MZ7KM*)
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Same as for SAMSUNG MZ7*, but also enable the quirks for SSDs
* whose model name starts with MZ7*
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MZ7*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* SuperTalent TeraDrive CT SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* XceedIOPS SATA SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" },
/*quirks*/DA_Q_4K
},
{
/*
* Hama Innostor USB-Stick
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" },
/*quirks*/DA_Q_NO_RC16
},
{
/*
* Seagate Lamarr 8TB Shingled Magnetic Recording (SMR)
* Drive Managed SATA hard drive. This drive doesn't report
* in firmware that it is a drive managed SMR drive.
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS000[23]*", "*" },
/*quirks*/DA_Q_SMR_DM
},
{
/*
* MX-ES USB Drive by Mach Xtreme
*/
{ T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"},
/*quirks*/DA_Q_NO_RC16
},
};
static disk_strategy_t dastrategy;
static dumper_t dadump;
static periph_init_t dainit;
static void daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void dasysctlinit(void *context, int pending);
static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS);
static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
static int dazonemodesysctl(SYSCTL_HANDLER_ARGS);
static int dazonesupsysctl(SYSCTL_HANDLER_ARGS);
static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS);
static void dadeletemethodset(struct da_softc *softc,
da_delete_methods delete_method);
static off_t dadeletemaxsize(struct da_softc *softc,
da_delete_methods delete_method);
static void dadeletemethodchoose(struct da_softc *softc,
da_delete_methods default_method);
static void daprobedone(struct cam_periph *periph, union ccb *ccb);
static periph_ctor_t daregister;
static periph_dtor_t dacleanup;
static periph_start_t dastart;
static periph_oninv_t daoninvalidate;
static void dazonedone(struct cam_periph *periph, union ccb *ccb);
static void dadone(struct cam_periph *periph,
union ccb *done_ccb);
static int daerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static void daprevent(struct cam_periph *periph, int action);
static void dareprobe(struct cam_periph *periph);
static void dasetgeom(struct cam_periph *periph, uint32_t block_len,
uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong,
size_t rcap_size);
static timeout_t dasendorderedtag;
static void dashutdown(void *arg, int howto);
static timeout_t damediapoll;
#ifndef DA_DEFAULT_POLL_PERIOD
#define DA_DEFAULT_POLL_PERIOD 3
#endif
#ifndef DA_DEFAULT_TIMEOUT
#define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */
#endif
#ifndef DA_DEFAULT_SOFTTIMEOUT
#define DA_DEFAULT_SOFTTIMEOUT 0
#endif
#ifndef DA_DEFAULT_RETRY
#define DA_DEFAULT_RETRY 4
#endif
#ifndef DA_DEFAULT_SEND_ORDERED
#define DA_DEFAULT_SEND_ORDERED 1
#endif
static int da_poll_period = DA_DEFAULT_POLL_PERIOD;
static int da_retry_count = DA_DEFAULT_RETRY;
static int da_default_timeout = DA_DEFAULT_TIMEOUT;
static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT;
static int da_send_ordered = DA_DEFAULT_SEND_ORDERED;
static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0,
"CAM Direct Access Disk driver");
SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN,
&da_poll_period, 0, "Media polling period in seconds");
SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN,
&da_retry_count, 0, "Normal I/O retry count");
SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN,
&da_default_timeout, 0, "Normal I/O timeout (in seconds)");
SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN,
&da_send_ordered, 0, "Send Ordered Tags");
SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout,
CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, dasysctlsofttimeout, "I",
"Soft I/O timeout (ms)");
TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout);
/*
* DA_ORDEREDTAG_INTERVAL determines how often, relative
* to the default timeout, we check to see whether an ordered
* tagged transaction is appropriate to prevent simple tag
* starvation. Since we'd like to ensure that there is at least
* 1/2 of the timeout length left for a starved transaction to
* complete after we've sent an ordered tag, we must poll at least
* four times in every timeout period. This takes care of the worst
* case where a starved transaction starts during an interval that
* passes the "don't send an ordered tag" test, so it takes
* us two intervals to determine that a tag must be sent.
*/
#ifndef DA_ORDEREDTAG_INTERVAL
#define DA_ORDEREDTAG_INTERVAL 4
#endif
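/*
 * Worked example with the defaults in this file (da_default_timeout = 60
 * seconds, DA_ORDEREDTAG_INTERVAL = 4), assuming dasendorderedtag() re-arms
 * its callout the same way adasendorderedtag() does in ata_da.c:
 *
 *	(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL = 15 * hz ticks
 *
 * i.e. the ordered-tag check runs every 15 seconds, so a transaction that is
 * starved for two consecutive intervals still has roughly half of the
 * 60-second timeout left once the ordered tag is sent.
 */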
static struct periph_driver dadriver =
{
dainit, "da",
TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(da, dadriver);
static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");
static int
daopen(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
int error;
periph = (struct cam_periph *)dp->d_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
return (ENXIO);
}
cam_periph_lock(periph);
if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
cam_periph_unlock(periph);
cam_periph_release(periph);
return (error);
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daopen\n"));
softc = (struct da_softc *)periph->softc;
dareprobe(periph);
/* Wait for the disk size update. */
error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO,
"dareprobe", 0);
if (error != 0)
xpt_print(periph->path, "unable to retrieve capacity data\n");
if (periph->flags & CAM_PERIPH_INVALID)
error = ENXIO;
if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_PREVENT);
if (error == 0) {
softc->flags &= ~DA_FLAG_PACK_INVALID;
softc->flags |= DA_FLAG_OPEN;
}
cam_periph_unhold(periph);
cam_periph_unlock(periph);
if (error != 0)
cam_periph_release(periph);
return (error);
}
static int
daclose(struct disk *dp)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
- int error;
periph = (struct cam_periph *)dp->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
("daclose\n"));
if (cam_periph_hold(periph, PRIBIO) == 0) {
/* Flush disk cache. */
if ((softc->flags & DA_FLAG_DIRTY) != 0 &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 &&
(softc->flags & DA_FLAG_PACK_INVALID) == 0) {
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio, /*retries*/1,
/*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE,
5 * 60 * 1000);
- error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
+ cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
softc->disk->d_devstat);
softc->flags &= ~DA_FLAG_DIRTY;
xpt_release_ccb(ccb);
}
/* Allow medium removal. */
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
(softc->quirks & DA_Q_NO_PREVENT) == 0)
daprevent(periph, PR_ALLOW);
cam_periph_unhold(periph);
}
/*
* If we've got removable media, mark the blocksize as
* unavailable, since it could change when new media is
* inserted.
*/
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0)
softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
softc->flags &= ~DA_FLAG_OPEN;
while (softc->refcount != 0)
cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1);
cam_periph_unlock(periph);
cam_periph_release(periph);
return (0);
}
static void
daschedule(struct cam_periph *periph)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
if (softc->state != DA_STATE_NORMAL)
return;
cam_iosched_schedule(softc->cam_iosched, periph);
}
/*
* Actually translate the requested transfer into one the physical driver
* can understand. The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
dastrategy(struct bio *bp)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
/*
* If the device has been made invalid, error out
*/
if ((softc->flags & DA_FLAG_PACK_INVALID)) {
cam_periph_unlock(periph);
biofinish(bp, NULL, ENXIO);
return;
}
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp));
/*
* Zone commands must be ordered, because they can depend on the
* effects of previously issued commands, and they may affect
* commands after them.
*/
if (bp->bio_cmd == BIO_ZONE)
bp->bio_flags |= BIO_ORDERED;
/*
* Place it in the queue of disk activities for this disk
*/
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
* Schedule ourselves for performing the work.
*/
daschedule(periph);
cam_periph_unlock(periph);
return;
}
static int
dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct cam_periph *periph;
struct da_softc *softc;
u_int secsize;
struct ccb_scsiio csio;
struct disk *dp;
int error = 0;
dp = arg;
periph = dp->d_drv1;
softc = (struct da_softc *)periph->softc;
cam_periph_lock(periph);
secsize = softc->params.secsize;
if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
cam_periph_unlock(periph);
return (ENXIO);
}
memset(&csio, 0, sizeof(csio));
if (length > 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_read_write(&csio,
/*retries*/0,
dadone,
MSG_ORDERED_Q_TAG,
/*read*/SCSI_RW_WRITE,
/*byte2*/0,
/*minimum_cmd_size*/ softc->minimum_cmd_size,
offset / secsize,
length / secsize,
/*data_ptr*/(u_int8_t *) virtual,
/*dxfer_len*/length,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
printf("Aborting dump due to I/O error.\n");
cam_periph_unlock(periph);
return (error);
}
/*
* Sync the disk cache contents to the physical media.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
csio.ccb_h.ccb_state = DA_CCB_DUMP;
scsi_synchronize_cache(&csio,
/*retries*/0,
/*cbfcnp*/dadone,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0,/* Cover the whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
5 * 1000);
error = cam_periph_runccb((union ccb *)&csio, cam_periph_error,
0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
}
cam_periph_unlock(periph);
return (error);
}
static int
dagetattr(struct bio *bp)
{
int ret;
struct cam_periph *periph;
periph = (struct cam_periph *)bp->bio_disk->d_drv1;
cam_periph_lock(periph);
ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
periph->path);
cam_periph_unlock(periph);
if (ret == 0)
bp->bio_completed = bp->bio_length;
return ret;
}
static void
dainit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("da: Failed to attach master async callback "
"due to status 0x%x!\n", status);
} else if (da_send_ordered) {
/* Register our shutdown event handler */
if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown,
NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
printf("dainit: shutdown event registration failed!\n");
}
}
/*
* Callback from GEOM, called when it has finished cleaning up its
* resources.
*/
static void
dadiskgonecb(struct disk *dp)
{
struct cam_periph *periph;
periph = (struct cam_periph *)dp->d_drv1;
cam_periph_release(periph);
}
static void
daoninvalidate(struct cam_periph *periph)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, daasync, periph, periph->path);
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
/*
* Return all queued I/O with ENXIO.
* XXX Handle any transactions queued to the card
* with XPT_ABORT_CCB.
*/
cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
/*
* Tell GEOM that we've gone away, we'll get a callback when it is
* done cleaning up its resources.
*/
disk_gone(softc->disk);
}
static void
dacleanup(struct cam_periph *periph)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
cam_periph_unlock(periph);
cam_iosched_fini(softc->cam_iosched);
/*
* If we can't free the sysctl tree, oh well...
*/
if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) {
#ifdef CAM_IO_STATS
if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl stats context\n");
#endif
if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
xpt_print(periph->path,
"can't remove sysctl context\n");
}
callout_drain(&softc->mediapoll_c);
disk_destroy(softc->disk);
callout_drain(&softc->sendordered_c);
free(softc, M_DEVBUF);
cam_periph_lock(periph);
}
static void
daasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
struct da_softc *softc;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
if (cgd->protocol != PROTO_SCSI)
break;
if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED)
break;
if (SID_TYPE(&cgd->inq_data) != T_DIRECT
&& SID_TYPE(&cgd->inq_data) != T_RBC
&& SID_TYPE(&cgd->inq_data) != T_OPTICAL
&& SID_TYPE(&cgd->inq_data) != T_ZBC_HM)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(daregister, daoninvalidate,
dacleanup, dastart,
"da", CAM_PERIPH_BIO,
path, daasync,
AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG)
printf("daasync: Unable to attach to new device "
"due to status 0x%x\n", status);
return;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct da_softc *softc;
softc = periph->softc;
disk_attr_changed(softc->disk, "GEOM::physpath",
M_NOWAIT);
}
break;
}
case AC_UNIT_ATTENTION:
{
union ccb *ccb;
int error_code, sense_key, asc, ascq;
softc = (struct da_softc *)periph->softc;
ccb = (union ccb *)arg;
/*
* Handle all UNIT ATTENTIONs except our own,
* as they will be handled by daerror().
*/
if (xpt_path_periph(ccb->ccb_h.path) != periph &&
scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (asc == 0x2A && ascq == 0x09) {
xpt_print(ccb->ccb_h.path,
"Capacity data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
} else if (asc == 0x28 && ascq == 0x00) {
softc->flags &= ~DA_FLAG_PROBED;
disk_media_changed(softc->disk, M_NOWAIT);
} else if (asc == 0x3F && ascq == 0x03) {
xpt_print(ccb->ccb_h.path,
"INQUIRY data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
}
}
break;
}
case AC_SCSI_AEN:
softc = (struct da_softc *)periph->softc;
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* FALLTHROUGH */
case AC_SENT_BDR:
case AC_BUS_RESET:
{
struct ccb_hdr *ccbh;
softc = (struct da_softc *)periph->softc;
/*
* Don't fail on the expected unit attention
* that will occur.
*/
softc->flags |= DA_FLAG_RETRY_UA;
LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
ccbh->ccb_state |= DA_CCB_RETRY_UA;
break;
}
case AC_INQ_CHANGED:
softc = (struct da_softc *)periph->softc;
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
break;
default:
break;
}
cam_periph_async(periph, code, path, arg);
}
static void
dasysctlinit(void *context, int pending)
{
struct cam_periph *periph;
struct da_softc *softc;
char tmpstr[32], tmpstr2[16];
struct ccb_trans_settings cts;
periph = (struct cam_periph *)context;
/*
* periph was held for us when this task was enqueued
*/
if (periph->flags & CAM_PERIPH_INVALID) {
cam_periph_release(periph);
return;
}
softc = (struct da_softc *)periph->softc;
snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number);
snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
sysctl_ctx_init(&softc->sysctl_ctx);
softc->flags |= DA_FLAG_SCTX_INIT;
softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2,
CTLFLAG_RD, 0, tmpstr, "device_index");
if (softc->sysctl_tree == NULL) {
printf("dasysctlinit: unable to allocate sysctl tree\n");
cam_periph_release(periph);
return;
}
/*
* Now register the sysctl handler, so the user can change the value on
* the fly.
*/
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RWTUN,
softc, 0, dadeletemethodsysctl, "A",
"BIO_DELETE execution method");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "delete_max", CTLTYPE_U64 | CTLFLAG_RW,
softc, 0, dadeletemaxsysctl, "Q",
"Maximum BIO_DELETE size");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW,
&softc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
"Minimum CDB size");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, dazonemodesysctl, "A",
"Zone Mode");
SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD,
softc, 0, dazonesupsysctl, "A",
"Zone Support");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones,
"Optimal Number of Open Sequential Write Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"optimal_nonseq_zones", CTLFLAG_RD,
&softc->optimal_nonseq_zones,
"Optimal Number of Non-Sequentially Written Sequential Write "
"Preferred Zones");
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
"max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones,
"Maximum Number of Open Sequential Write Required Zones");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"error_inject",
CTLFLAG_RW,
&softc->error_inject,
0,
"error_inject leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"unmapped_io",
CTLFLAG_RD,
&softc->unmappedio,
0,
"Unmapped I/O leaf");
SYSCTL_ADD_INT(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO,
"rotating",
CTLFLAG_RD,
&softc->rotating,
0,
"Rotating media");
/*
* Add some addressing info.
*/
memset(&cts, 0, sizeof (cts));
xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
cts.type = CTS_TYPE_CURRENT_SETTINGS;
cam_periph_lock(periph);
xpt_action((union ccb *)&cts);
cam_periph_unlock(periph);
if (cts.ccb_h.status != CAM_REQ_CMP) {
cam_periph_release(periph);
return;
}
if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) {
struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc;
if (fc->valid & CTS_FC_VALID_WWPN) {
softc->wwpn = fc->wwpn;
SYSCTL_ADD_UQUAD(&softc->sysctl_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "wwpn", CTLFLAG_RD,
&softc->wwpn, "World Wide Port Name");
}
}
#ifdef CAM_IO_STATS
/*
* Now add some useful stats.
* XXX These should live in cam_periph and be common to all periphs
*/
softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
CTLFLAG_RD, 0, "Statistics");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"errors",
CTLFLAG_RD,
&softc->errors,
0,
"Transport errors reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"timeouts",
CTLFLAG_RD,
&softc->timeouts,
0,
"Device timeouts reported by the SIM");
SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
SYSCTL_CHILDREN(softc->sysctl_stats_tree),
OID_AUTO,
"pack_invalidations",
CTLFLAG_RD,
&softc->invalidations,
0,
"Device pack invalidations");
#endif
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
cam_periph_release(periph);
}
static int
dadeletemaxsysctl(SYSCTL_HANDLER_ARGS)
{
int error;
uint64_t value;
struct da_softc *softc;
softc = (struct da_softc *)arg1;
value = softc->disk->d_delmaxsize;
error = sysctl_handle_64(oidp, &value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* Only accept values no larger than the calculated maximum. */
if (value > dadeletemaxsize(softc, softc->delete_method)) {
return (EINVAL);
}
softc->disk->d_delmaxsize = value;
return (0);
}
static int
dacmdsizesysctl(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if ((error != 0)
|| (req->newptr == NULL))
return (error);
/*
* Acceptable values here are 6, 10, 12 or 16.
*/
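/*
 * Any other value is rounded up to the next permissible size,
 * e.g. a write of 8 is rounded to 10 and 13 becomes 16.
 */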
if (value < 6)
value = 6;
else if ((value > 6)
&& (value <= 10))
value = 10;
else if ((value > 10)
&& (value <= 12))
value = 12;
else if (value > 12)
value = 16;
*(int *)arg1 = value;
return (0);
}
static int
dasysctlsofttimeout(SYSCTL_HANDLER_ARGS)
{
sbintime_t value;
int error;
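/* The sysctl value is in milliseconds; convert to/from sbintime_t. */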
value = da_default_softtimeout / SBT_1MS;
error = sysctl_handle_int(oidp, (int *)&value, 0, req);
if ((error != 0) || (req->newptr == NULL))
return (error);
/* XXX Should clip this to a reasonable level */
if (value > da_default_timeout * 1000)
return (EINVAL);
da_default_softtimeout = value * SBT_1MS;
return (0);
}
static void
dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method)
{
softc->delete_method = delete_method;
softc->disk->d_delmaxsize = dadeletemaxsize(softc, delete_method);
softc->delete_func = da_delete_functions[delete_method];
if (softc->delete_method > DA_DELETE_DISABLE)
softc->disk->d_flags |= DISKFLAG_CANDELETE;
else
softc->disk->d_flags &= ~DISKFLAG_CANDELETE;
}
static off_t
dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method)
{
off_t sectors;
switch(delete_method) {
case DA_DELETE_UNMAP:
sectors = (off_t)softc->unmap_max_lba;
break;
case DA_DELETE_ATA_TRIM:
sectors = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges;
break;
case DA_DELETE_WS16:
sectors = omin(softc->ws_max_blks, WS16_MAX_BLKS);
break;
case DA_DELETE_ZERO:
case DA_DELETE_WS10:
sectors = omin(softc->ws_max_blks, WS10_MAX_BLKS);
break;
default:
return 0;
}
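/*
 * Scale the per-method LBA limit to bytes, capped at the media size;
 * e.g. with 512-byte sectors an unmap_max_lba of 0x400000 yields a
 * 2 GiB d_delmaxsize.
 */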
return (off_t)softc->params.secsize *
omin(sectors, softc->params.sectors);
}
static void
daprobedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
dadeletemethodchoose(softc, DA_DELETE_NONE);
if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) {
char buf[80];
int i, sep;
snprintf(buf, sizeof(buf), "Delete methods: <");
sep = 0;
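/*
 * Builds a line like "Delete methods: <UNMAP(*),WS16>", listing the
 * available methods and marking the active one with "(*)".
 */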
for (i = 0; i <= DA_DELETE_MAX; i++) {
if ((softc->delete_available & (1 << i)) == 0 &&
i != softc->delete_method)
continue;
if (sep)
strlcat(buf, ",", sizeof(buf));
strlcat(buf, da_delete_method_names[i],
sizeof(buf));
if (i == softc->delete_method)
strlcat(buf, "(*)", sizeof(buf));
sep = 1;
}
strlcat(buf, ">", sizeof(buf));
printf("%s%d: %s\n", periph->periph_name,
periph->unit_number, buf);
}
/*
* Since our peripheral may be invalidated by an error
* above or an external event, we must release our CCB
* before releasing the probe lock on the peripheral.
* The peripheral will only go away once the last lock
* is removed, and we need it around for the CCB release
* operation.
*/
xpt_release_ccb(ccb);
softc->state = DA_STATE_NORMAL;
softc->flags |= DA_FLAG_PROBED;
daschedule(periph);
wakeup(&softc->disk->d_mediasize);
if ((softc->flags & DA_FLAG_ANNOUNCED) == 0) {
softc->flags |= DA_FLAG_ANNOUNCED;
cam_periph_unhold(periph);
} else
cam_periph_release_locked(periph);
}
static void
dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method)
{
int i, methods;
/* If available, prefer the method requested by user. */
i = softc->delete_method_pref;
methods = softc->delete_available | (1 << DA_DELETE_DISABLE);
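/* DA_DELETE_DISABLE is always selectable so deletes can be turned off. */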
if (methods & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
/* Use the pre-defined order to choose the best performing delete. */
for (i = DA_DELETE_MIN; i <= DA_DELETE_MAX; i++) {
if (i == DA_DELETE_ZERO)
continue;
if (softc->delete_available & (1 << i)) {
dadeletemethodset(softc, i);
return;
}
}
/* Fallback to default. */
dadeletemethodset(softc, default_method);
}
static int
dadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
const char *p;
struct da_softc *softc;
- int i, error, methods, value;
+ int i, error, value;
softc = (struct da_softc *)arg1;
value = softc->delete_method;
if (value < 0 || value > DA_DELETE_MAX)
p = "UNKNOWN";
else
p = da_delete_method_names[value];
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
- methods = softc->delete_available | (1 << DA_DELETE_DISABLE);
for (i = 0; i <= DA_DELETE_MAX; i++) {
if (strcmp(buf, da_delete_method_names[i]) == 0)
break;
}
if (i > DA_DELETE_MAX)
return (EINVAL);
softc->delete_method_pref = i;
dadeletemethodchoose(softc, DA_DELETE_NONE);
return (0);
}
static int
dazonemodesysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[40];
struct da_softc *softc;
int error;
softc = (struct da_softc *)arg1;
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed");
break;
case DA_ZONE_HOST_AWARE:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware");
break;
case DA_ZONE_HOST_MANAGED:
snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed");
break;
case DA_ZONE_NONE:
default:
snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned");
break;
}
error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req);
return (error);
}
static int
dazonesupsysctl(SYSCTL_HANDLER_ARGS)
{
char tmpbuf[180];
struct da_softc *softc;
struct sbuf sb;
int error, first;
unsigned int i;
softc = (struct da_softc *)arg1;
error = 0;
first = 1;
sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0);
for (i = 0; i < sizeof(da_zone_desc_table) /
sizeof(da_zone_desc_table[0]); i++) {
if (softc->zone_flags & da_zone_desc_table[i].value) {
if (first == 0)
sbuf_printf(&sb, ", ");
else
first = 0;
sbuf_cat(&sb, da_zone_desc_table[i].desc);
}
}
if (first == 1)
sbuf_printf(&sb, "None");
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
return (error);
}
static cam_status
daregister(struct cam_periph *periph, void *arg)
{
struct da_softc *softc;
struct ccb_pathinq cpi;
struct ccb_getdev *cgd;
char tmpstr[80];
caddr_t match;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("daregister: no getdev CCB, can't register device\n");
return(CAM_REQ_CMP_ERR);
}
softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF,
M_NOWAIT|M_ZERO);
if (softc == NULL) {
printf("daregister: Unable to probe new device. "
"Unable to allocate softc\n");
return(CAM_REQ_CMP_ERR);
}
if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("daregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
free(softc, M_DEVBUF);
return(CAM_REQ_CMP_ERR);
}
LIST_INIT(&softc->pending_ccbs);
softc->state = DA_STATE_PROBE_RC;
bioq_init(&softc->delete_run_queue);
if (SID_IS_REMOVABLE(&cgd->inq_data))
softc->flags |= DA_FLAG_PACK_REMOVABLE;
softc->unmap_max_ranges = UNMAP_MAX_RANGES;
softc->unmap_max_lba = UNMAP_RANGE_MAX;
softc->unmap_gran = 0;
softc->unmap_gran_align = 0;
softc->ws_max_blks = WS16_MAX_BLKS;
softc->trim_max_ranges = ATA_TRIM_MAX_RANGES;
softc->rotating = 1;
periph->softc = softc;
/*
* See if this device has any quirks.
*/
match = cam_quirkmatch((caddr_t)&cgd->inq_data,
(caddr_t)da_quirk_table,
nitems(da_quirk_table),
sizeof(*da_quirk_table), scsi_inquiry_match);
if (match != NULL)
softc->quirks = ((struct da_quirk_entry *)match)->quirks;
else
softc->quirks = DA_Q_NONE;
/* Check if the SIM does not want 6 byte commands */
xpt_path_inq(&cpi, periph->path);
if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
softc->quirks |= DA_Q_NO_6_BYTE;
if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM)
softc->zone_mode = DA_ZONE_HOST_MANAGED;
else if (softc->quirks & DA_Q_SMR_DM)
softc->zone_mode = DA_ZONE_DRIVE_MANAGED;
else
softc->zone_mode = DA_ZONE_NONE;
if (softc->zone_mode != DA_ZONE_NONE) {
if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC))
softc->zone_interface = DA_ZONE_IF_ATA_SAT;
else
softc->zone_interface = DA_ZONE_IF_ATA_PASS;
} else
softc->zone_interface = DA_ZONE_IF_SCSI;
}
TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph);
/*
* Take an exclusive refcount on the periph while dastart is called
* to finish the probe. The reference will be dropped in dadone at
* the end of probe.
*/
(void)cam_periph_hold(periph, PRIBIO);
/*
* Schedule a periodic event to occasionally send an
* ordered tag to a device.
*/
callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0);
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, softc);
cam_periph_unlock(periph);
/*
* RBC devices don't have to support READ(6), only READ(10).
*/
if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/*
* Load the user's default, if any.
*/
snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size",
periph->unit_number);
TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size);
/*
* 6, 10, 12 and 16 are the currently permissible values.
*/
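/* Map any other tunable value onto one of the permissible sizes. */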
if (softc->minimum_cmd_size > 12)
softc->minimum_cmd_size = 16;
else if (softc->minimum_cmd_size > 10)
softc->minimum_cmd_size = 12;
else if (softc->minimum_cmd_size > 6)
softc->minimum_cmd_size = 10;
else
softc->minimum_cmd_size = 6;
/* Predict whether device may support READ CAPACITY(16). */
if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 &&
(softc->quirks & DA_Q_NO_RC16) == 0) {
softc->flags |= DA_FLAG_CAN_RC16;
softc->state = DA_STATE_PROBE_RC16;
}
/*
* Register this media as a disk.
*/
softc->disk = disk_alloc();
softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
periph->unit_number, 0,
DEVSTAT_BS_UNAVAILABLE,
SID_TYPE(&cgd->inq_data) |
XPORT_DEVSTAT_TYPE(cpi.transport),
DEVSTAT_PRIORITY_DISK);
softc->disk->d_open = daopen;
softc->disk->d_close = daclose;
softc->disk->d_strategy = dastrategy;
softc->disk->d_dump = dadump;
softc->disk->d_getattr = dagetattr;
softc->disk->d_gone = dadiskgonecb;
softc->disk->d_name = "da";
softc->disk->d_drv1 = periph;
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
else if (cpi.maxio > MAXPHYS)
softc->maxio = MAXPHYS; /* for safety */
else
softc->maxio = cpi.maxio;
softc->disk->d_maxsize = softc->maxio;
softc->disk->d_unit = periph->unit_number;
softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE;
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0)
softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->unmappedio = 1;
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
}
cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
cgd->inq_data.product, sizeof(cgd->inq_data.product),
sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
softc->disk->d_hba_vendor = cpi.hba_vendor;
softc->disk->d_hba_device = cpi.hba_device;
softc->disk->d_hba_subvendor = cpi.hba_subvendor;
softc->disk->d_hba_subdevice = cpi.hba_subdevice;
/*
* Acquire a reference to the periph before we register with GEOM.
* We'll release this reference once GEOM calls us back (via
* dadiskgonecb()) telling us that our provider has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
disk_create(softc->disk, DISK_VERSION);
cam_periph_lock(periph);
/*
* Add async callbacks for events of interest.
* I don't bother checking if this fails as,
* in most cases, the system will function just
* fine without them and the only alternative
* would be to not attach the device on failure.
*/
xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION |
AC_INQ_CHANGED, daasync, periph, periph->path);
/*
* Emit an attribute changed notification just in case
* physical path information arrived before our async
* event handler was registered, but after anyone attaching
* to our disk device polled it.
*/
disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT);
/*
* Schedule periodic media polling events.
*/
callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0);
if ((softc->flags & DA_FLAG_PACK_REMOVABLE) &&
(cgd->inq_flags & SID_AEN) == 0 &&
da_poll_period != 0)
callout_reset(&softc->mediapoll_c, da_poll_period * hz,
damediapoll, periph);
xpt_schedule(periph, CAM_PRIORITY_DEV);
return(CAM_REQ_CMP);
}
static int
da_zone_bio_to_scsi(int disk_zone_cmd)
{
switch (disk_zone_cmd) {
case DISK_ZONE_OPEN:
return ZBC_OUT_SA_OPEN;
case DISK_ZONE_CLOSE:
return ZBC_OUT_SA_CLOSE;
case DISK_ZONE_FINISH:
return ZBC_OUT_SA_FINISH;
case DISK_ZONE_RWP:
return ZBC_OUT_SA_RWP;
}
return -1;
}
static int
da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp,
int *queue_ccb)
{
struct da_softc *softc;
int error;
error = 0;
if (bp->bio_cmd != BIO_ZONE) {
error = EINVAL;
goto bailout;
}
softc = periph->softc;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP: {
int zone_flags;
int zone_sa;
uint64_t lba;
zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd);
if (zone_sa == -1) {
xpt_print(periph->path, "Cannot translate zone "
"cmd %#x to SCSI\n", bp->bio_zone.zone_cmd);
error = EINVAL;
goto bailout;
}
zone_flags = 0;
lba = bp->bio_zone.zone_params.rwp.id;
if (bp->bio_zone.zone_params.rwp.flags &
DISK_ZONE_RWP_FLAG_ALL)
zone_flags |= ZBC_OUT_ALL;
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_out(&ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ zone_sa,
/*zone_id*/ lba,
/*zone_flags*/ zone_flags,
/*data_ptr*/ NULL,
/*dxfer_len*/ 0,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_out() returned an "
"error!");
goto bailout;
}
}
*queue_ccb = 1;
break;
}
case DISK_ZONE_REPORT_ZONES: {
uint8_t *rz_ptr;
uint32_t num_entries, alloc_size;
struct disk_zone_report *rep;
rep = &bp->bio_zone.zone_params.report;
num_entries = rep->entries_allocated;
if (num_entries == 0) {
xpt_print(periph->path, "No entries allocated for "
"Report Zones request\n");
error = EINVAL;
goto bailout;
}
alloc_size = sizeof(struct scsi_report_zones_hdr) +
(sizeof(struct scsi_report_zones_desc) * num_entries);
alloc_size = min(alloc_size, softc->disk->d_maxsize);
rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO);
if (rz_ptr == NULL) {
xpt_print(periph->path, "Unable to allocate memory "
"for Report Zones request\n");
error = ENOMEM;
goto bailout;
}
if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) {
scsi_zbc_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*service_action*/ ZBC_IN_SA_REPORT_ZONES,
/*zone_start_lba*/ rep->starting_id,
/*zone_options*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
} else {
/*
* Note that in this case, even though we can
* technically use NCQ, we don't bother for several
* reasons:
* 1. It hasn't been tested on a SAT layer that
* supports it. This is new as of SAT-4.
* 2. Even when there is a SAT layer that supports
* it, that SAT layer will also probably support
* ZBC -> ZAC translation, since they are both
* in the SAT-4 spec.
* 3. Translation will likely be preferable to ATA
* passthrough. LSI / Avago at least single
* steps ATA passthrough commands in the HBA,
* regardless of protocol, so unless that
* changes, there is a performance penalty for
* doing ATA passthrough no matter whether
* you're using NCQ/FPDMA, DMA or PIO.
* 4. It requires a 32-byte CDB, which at least at
* this point in CAM requires a CDB pointer, which
* would require us to allocate an additional bit
* of storage separate from the CCB.
*/
error = scsi_ata_zac_mgmt_in(&ccb->csio,
/*retries*/ da_retry_count,
/*cbcfnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*use_ncq*/ 0,
/*zm_action*/ ATA_ZM_REPORT_ZONES,
/*zone_id*/ rep->starting_id,
/*zone_flags*/ rep->rep_options,
/*data_ptr*/ rz_ptr,
/*dxfer_len*/ alloc_size,
/*cdb_storage*/ NULL,
/*cdb_storage_len*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (error != 0) {
error = EINVAL;
xpt_print(periph->path,
"scsi_ata_zac_mgmt_in() returned an "
"error!");
goto bailout;
}
}
/*
* For BIO_ZONE, this isn't normally needed. However, it
* is used by devstat_end_transaction_bio() to determine
* how much data was transferred.
*/
/*
* XXX KDM we have a problem. But I'm not sure how to fix
* it. devstat uses bio_bcount - bio_resid to calculate
* the amount of data transferred. The GEOM disk code
* uses bio_length - bio_resid to calculate the amount of
* data in bio_completed. We have different structure
* sizes above and below the ada(4) driver. So, if we
* use the sizes above, the amount transferred won't be
* quite accurate for devstat. If we use different sizes
* for bio_bcount and bio_length (above and below
* respectively), then the residual needs to match one or
* the other. Everything is calculated after the bio
* leaves the driver, so changing the values around isn't
* really an option. For now, just set the count to the
* passed in length. This means that the calculations
* above (e.g. bio_completed) will be correct, but the
* amount of data reported to devstat will be slightly
* under- or overstated.
*/
bp->bio_bcount = bp->bio_length;
*queue_ccb = 1;
break;
}
case DISK_ZONE_GET_PARAMS: {
struct disk_zone_disk_params *params;
params = &bp->bio_zone.zone_params.disk_params;
bzero(params, sizeof(*params));
switch (softc->zone_mode) {
case DA_ZONE_DRIVE_MANAGED:
params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED;
break;
case DA_ZONE_HOST_AWARE:
params->zone_mode = DISK_ZONE_MODE_HOST_AWARE;
break;
case DA_ZONE_HOST_MANAGED:
params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED;
break;
default:
case DA_ZONE_NONE:
params->zone_mode = DISK_ZONE_MODE_NONE;
break;
}
if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ)
params->flags |= DISK_ZONE_DISK_URSWRZ;
if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) {
params->optimal_seq_zones = softc->optimal_seq_zones;
params->flags |= DISK_ZONE_OPT_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) {
params->optimal_nonseq_zones =
softc->optimal_nonseq_zones;
params->flags |= DISK_ZONE_OPT_NONSEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) {
params->max_seq_zones = softc->max_seq_zones;
params->flags |= DISK_ZONE_MAX_SEQ_SET;
}
if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP)
params->flags |= DISK_ZONE_RZ_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP)
params->flags |= DISK_ZONE_OPEN_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP)
params->flags |= DISK_ZONE_CLOSE_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP)
params->flags |= DISK_ZONE_FINISH_SUP;
if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP)
params->flags |= DISK_ZONE_RWP_SUP;
break;
}
default:
break;
}
bailout:
return (error);
}
static void
dastart(struct cam_periph *periph, union ccb *start_ccb)
{
struct da_softc *softc;
softc = (struct da_softc *)periph->softc;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n"));
skipstate:
switch (softc->state) {
case DA_STATE_NORMAL:
{
struct bio *bp;
uint8_t tag_code;
more:
bp = cam_iosched_next_bio(softc->cam_iosched);
if (bp == NULL) {
if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
scsi_test_unit_ready(&start_ccb->csio,
/*retries*/ da_retry_count,
dadone,
MSG_SIMPLE_Q_TAG,
SSD_FULL_SIZE,
da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_TUR;
xpt_action(start_ccb);
} else
xpt_release_ccb(start_ccb);
break;
}
if (bp->bio_cmd == BIO_DELETE) {
if (softc->delete_func != NULL) {
softc->delete_func(periph, start_ccb, bp);
goto out;
} else {
/* Not sure this is possible, but failsafe by lying and saying "sure, done." */
biofinish(bp, NULL, 0);
goto more;
}
}
if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) {
cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR);
cam_periph_release_locked(periph); /* XXX is this still valid? I think so but unverified */
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
(softc->flags & DA_FLAG_NEED_OTAG) != 0) {
softc->flags &= ~DA_FLAG_NEED_OTAG;
softc->flags |= DA_FLAG_WAS_OTAG;
tag_code = MSG_ORDERED_Q_TAG;
} else {
tag_code = MSG_SIMPLE_Q_TAG;
}
switch (bp->bio_cmd) {
case BIO_WRITE:
case BIO_READ:
{
void *data_ptr;
int rw_op;
biotrack(bp, __func__);
if (bp->bio_cmd == BIO_WRITE) {
softc->flags |= DA_FLAG_DIRTY;
rw_op = SCSI_RW_WRITE;
} else {
rw_op = SCSI_RW_READ;
}
data_ptr = bp->bio_data;
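/*
 * Unmapped and vector-list bios are handed to the SIM as the bio
 * itself (SCSI_RW_BIO) rather than a mapped kernel buffer.
 */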
if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) {
rw_op |= SCSI_RW_BIO;
data_ptr = bp;
}
scsi_read_write(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/tag_code,
rw_op,
/*byte2*/0,
softc->minimum_cmd_size,
/*lba*/bp->bio_pblkno,
/*block_count*/bp->bio_bcount /
softc->params.secsize,
data_ptr,
/*dxfer_len*/ bp->bio_bcount,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
start_ccb->csio.bio = bp;
#endif
break;
}
case BIO_FLUSH:
/*
* If we don't support sync cache, or the disk
* isn't dirty, FLUSH is a no-op. Use the
* allocated CCB for the next bio if one is
* available.
*/
if ((softc->quirks & DA_Q_NO_SYNC_CACHE) != 0 ||
(softc->flags & DA_FLAG_DIRTY) == 0) {
biodone(bp);
goto skipstate;
}
/*
* BIO_FLUSH doesn't currently communicate
* range data, so we synchronize the cache
* over the whole disk. We also force
* ordered tag semantics so the flush applies
* to all previously queued I/O.
*/
scsi_synchronize_cache(&start_ccb->csio,
/*retries*/1,
/*cbfcnp*/dadone,
MSG_ORDERED_Q_TAG,
/*begin_lba*/0,
/*lb_count*/0,
SSD_FULL_SIZE,
da_default_timeout*1000);
/*
* Clear the dirty flag before sending the command.
* Either this sync cache will be successful, or it
* will fail after a retry. If it fails, it is
* unlikely to be successful if retried later, so
* we'll save ourselves time by just marking the
* device clean.
*/
softc->flags &= ~DA_FLAG_DIRTY;
break;
case BIO_ZONE: {
int error, queue_ccb;
queue_ccb = 0;
error = da_zone_cmd(periph, start_ccb, bp, &queue_ccb);
if ((error != 0)
|| (queue_ccb == 0)) {
biofinish(bp, NULL, error);
xpt_release_ccb(start_ccb);
return;
}
break;
}
}
start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout);
out:
LIST_INSERT_HEAD(&softc->pending_ccbs,
&start_ccb->ccb_h, periph_links.le);
/* We expect a unit attention from this device */
if ((softc->flags & DA_FLAG_RETRY_UA) != 0) {
start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA;
softc->flags &= ~DA_FLAG_RETRY_UA;
}
start_ccb->ccb_h.ccb_bp = bp;
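/*
 * Hold a reference across the unlocked xpt_action() call; daclose()
 * sleeps until refcount drains back to zero before tearing down.
 */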
softc->refcount++;
cam_periph_unlock(periph);
xpt_action(start_ccb);
cam_periph_lock(periph);
softc->refcount--;
/* May have more work to do, so ensure we stay scheduled */
daschedule(periph);
break;
}
case DA_STATE_PROBE_RC:
{
struct scsi_read_capacity_data *rcap;
rcap = (struct scsi_read_capacity_data *)
malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcap == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity(&start_ccb->csio,
/*retries*/da_retry_count,
dadone,
MSG_SIMPLE_Q_TAG,
rcap,
SSD_FULL_SIZE,
/*timeout*/5000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_RC16:
{
struct scsi_read_capacity_data_long *rcaplong;
rcaplong = (struct scsi_read_capacity_data_long *)
malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO);
if (rcaplong == NULL) {
printf("dastart: Couldn't malloc read_capacity data\n");
/* da_free_periph??? */
break;
}
scsi_read_capacity_16(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*lba*/ 0,
/*reladr*/ 0,
/*pmi*/ 0,
/*rcap_buf*/ (uint8_t *)rcaplong,
/*rcap_buf_len*/ sizeof(*rcaplong),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_LBP:
{
struct scsi_vpd_logical_block_prov *lbp;
if (!scsi_vpd_supported_page(periph, SVPD_LBP)) {
/*
* If we get here we don't support any SBC-3 delete
* methods with UNMAP as the Logical Block Provisioning
* VPD page support is required for devices which
* support it according to T10/1799-D Revision 31
* however older revisions of the spec don't mandate
* this so we currently don't remove these methods
* from the available set.
*/
softc->state = DA_STATE_PROBE_BLK_LIMITS;
goto skipstate;
}
lbp = (struct scsi_vpd_logical_block_prov *)
malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO);
if (lbp == NULL) {
printf("dastart: Couldn't malloc lbp data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)lbp,
/*inq_len*/sizeof(*lbp),
/*evpd*/TRUE,
/*page_code*/SVPD_LBP,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BLK_LIMITS:
{
struct scsi_vpd_block_limits *block_limits;
if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) {
/* Not supported, skip to next probe */
softc->state = DA_STATE_PROBE_BDC;
goto skipstate;
}
block_limits = (struct scsi_vpd_block_limits *)
malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO);
if (block_limits == NULL) {
printf("dastart: Couldn't malloc block_limits data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)block_limits,
/*inq_len*/sizeof(*block_limits),
/*evpd*/TRUE,
/*page_code*/SVPD_BLOCK_LIMITS,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_BDC:
{
struct scsi_vpd_block_characteristics *bdc;
if (!scsi_vpd_supported_page(periph, SVPD_BDC)) {
softc->state = DA_STATE_PROBE_ATA;
goto skipstate;
}
bdc = (struct scsi_vpd_block_characteristics *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
printf("dastart: Couldn't malloc bdc data\n");
/* da_free_periph??? */
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_BDC,
/*sense_len*/SSD_MIN_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA:
{
struct ata_params *ata_params;
if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) {
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* Note that if the ATA VPD page isn't
* supported, we aren't talking to an ATA
* device anyway. Support for that VPD
* page is mandatory for SCSI to ATA (SAT)
* translation layers.
*/
softc->state = DA_STATE_PROBE_ZONE;
goto skipstate;
}
daprobedone(periph, start_ccb);
break;
}
ata_params = (struct ata_params*)
malloc(sizeof(*ata_params), M_SCSIDA, M_NOWAIT|M_ZERO);
if (ata_params == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_params "
"data\n");
/* da_free_periph??? */
break;
}
scsi_ata_identify(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*data_ptr*/(u_int8_t *)ata_params,
/*dxfer_len*/sizeof(*ata_params),
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_LOGDIR:
{
struct ata_gp_log_dir *log_dir;
int retval;
retval = 0;
if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) {
/*
* If we don't have log support, not much point in
* trying to probe zone support.
*/
daprobedone(periph, start_ccb);
break;
}
/*
* If we have an ATA device (the SCSI ATA Information VPD
* page should be present and the ATA identify should have
* succeeded) and it supports logs, ask for the log directory.
*/
log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO);
if (log_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc log_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_LOG_DIRECTORY,
/*page_number*/ 0,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)log_dir,
/*dxfer_len*/ sizeof(*log_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(log_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_IDDIR:
{
struct ata_identify_log_pages *id_dir;
int retval;
retval = 0;
/*
* Check here to see whether the Identify Device log is
* supported in the directory of logs. If so, continue
* with requesting the log of identify device pages.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) {
daprobedone(periph, start_ccb);
break;
}
id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO);
if (id_dir == NULL) {
xpt_print(periph->path, "Couldn't malloc id_dir "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_PAGE_LIST,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)id_dir,
/*dxfer_len*/ sizeof(*id_dir),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(id_dir, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_SUP:
{
struct ata_identify_log_sup_cap *sup_cap;
int retval;
retval = 0;
/*
* Check here to see whether the Supported Capabilities log
* is in the list of Identify Device logs.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) {
daprobedone(periph, start_ccb);
break;
}
sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO);
if (sup_cap == NULL) {
xpt_print(periph->path, "Couldn't malloc sup_cap "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_SUP_CAP,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)sup_cap,
/*dxfer_len*/ sizeof(*sup_cap),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(sup_cap, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ATA_ZONE:
{
struct ata_zoned_info_log *ata_zone;
int retval;
retval = 0;
/*
* Check here to see whether the zoned device information
* page is supported. If so, continue on to request it.
* If not, skip to DA_STATE_PROBE_LOG or done.
*/
if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) {
daprobedone(periph, start_ccb);
break;
}
ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA,
M_NOWAIT|M_ZERO);
if (ata_zone == NULL) {
xpt_print(periph->path, "Couldn't malloc ata_zone "
"data\n");
daprobedone(periph, start_ccb);
break;
}
retval = scsi_ata_read_log(&start_ccb->csio,
/*retries*/ da_retry_count,
/*cbfcnp*/ dadone,
/*tag_action*/ MSG_SIMPLE_Q_TAG,
/*log_address*/ ATA_IDENTIFY_DATA_LOG,
/*page_number*/ ATA_IDL_ZDI,
/*block_count*/ 1,
/*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ?
AP_PROTO_DMA : AP_PROTO_PIO_IN,
/*data_ptr*/ (uint8_t *)ata_zone,
/*dxfer_len*/ sizeof(*ata_zone),
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ da_default_timeout * 1000);
if (retval != 0) {
xpt_print(periph->path, "scsi_ata_read_log() failed!");
free(ata_zone, M_SCSIDA);
daprobedone(periph, start_ccb);
break;
}
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE;
xpt_action(start_ccb);
break;
}
case DA_STATE_PROBE_ZONE:
{
struct scsi_vpd_zoned_bdc *bdc;
/*
* Note that this page will be supported for SCSI protocol
* devices that support ZBC (SMR devices), as well as ATA
* protocol devices that are behind a SAT (SCSI to ATA
* Translation) layer that supports converting ZBC commands
* to their ZAC equivalents.
*/
if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) {
daprobedone(periph, start_ccb);
break;
}
bdc = (struct scsi_vpd_zoned_bdc *)
malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO);
if (bdc == NULL) {
xpt_release_ccb(start_ccb);
xpt_print(periph->path, "Couldn't malloc zone VPD "
"data\n");
break;
}
scsi_inquiry(&start_ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*inq_buf*/(u_int8_t *)bdc,
/*inq_len*/sizeof(*bdc),
/*evpd*/TRUE,
/*page_code*/SVPD_ZONED_BDC,
/*sense_len*/SSD_FULL_SIZE,
/*timeout*/da_default_timeout * 1000);
start_ccb->ccb_h.ccb_bp = NULL;
start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE;
xpt_action(start_ccb);
break;
}
}
}
/*
* In each of the methods below, while it's the caller's
* responsibility to ensure the request will fit into a
* single device request, we might have changed the delete
* method due to the device incorrectly advertising either
* its supported methods or limits.
*
* To prevent this causing further issues, we validate the
* request against the method's limits and warn, which would
* otherwise be unnecessary.
*/
static void
da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
struct scsi_unmap_desc *d = (void *)&buf[UNMAP_HEAD_SIZE];
uint64_t lba, lastlba = (uint64_t)-1;
uint64_t totalcount = 0;
uint64_t count;
uint32_t c, lastcount = 0, ranges = 0;
/*
* Currently this doesn't take the UNMAP
* Granularity and Granularity Alignment
* fields into account.
*
* This could result in both suboptimal unmap
* requests as well as UNMAP calls unmapping
* fewer LBAs than requested.
*/
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
bp1 = bp;
do {
/*
* Note: ada and da are different in how they store the
* pending bp's in a trim. ada stores all of them in the
* trim_req.bps. da stores all but the first one in the
* delete_run_queue. ada then completes all the bps in
* its adadone() loop. da completes all the bps in the
* delete_run_queue in dadone, and relies on the biodone
* after to complete. This should be reconciled since there's
* no real reason to do it differently. XXX
*/
if (bp1 != bp)
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, UNMAP_RANGE_MAX - lastcount);
lastlba += c;
lastcount += c;
scsi_ulto4b(lastcount, d[ranges - 1].length);
count -= c;
lba += c;
totalcount += c;
} else if ((softc->quirks & DA_Q_STRICT_UNMAP) &&
softc->unmap_gran != 0) {
/* Align length of the previous range. */
if ((c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c) {
totalcount -= lastcount;
lastlba = (uint64_t)-1;
lastcount = 0;
ranges--;
} else {
totalcount -= c;
lastlba -= c;
lastcount -= c;
scsi_ulto4b(lastcount, d[ranges - 1].length);
}
}
/* Align beginning of the new range. */
c = (lba - softc->unmap_gran_align) % softc->unmap_gran;
if (c != 0) {
c = softc->unmap_gran - c;
if (count <= c) {
count = 0;
} else {
lba += c;
count -= c;
}
}
}
while (count > 0) {
c = omin(count, UNMAP_RANGE_MAX);
if (totalcount + c > softc->unmap_max_lba ||
ranges >= softc->unmap_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld"
"|| %d >= %d",
da_delete_method_desc[softc->delete_method],
totalcount + c, softc->unmap_max_lba,
ranges, softc->unmap_max_ranges);
break;
}
scsi_u64to8b(lba, d[ranges].lba);
scsi_ulto4b(c, d[ranges].length);
lba += c;
totalcount += c;
ranges++;
count -= c;
lastlba = lba;
lastcount = c;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (ranges >= softc->unmap_max_ranges ||
totalcount + bp1->bio_bcount /
softc->params.secsize > softc->unmap_max_lba) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
/* Align length of the last range. */
if ((softc->quirks & DA_Q_STRICT_UNMAP) && softc->unmap_gran != 0 &&
(c = lastcount % softc->unmap_gran) != 0) {
if (lastcount <= c)
ranges--;
else
scsi_ulto4b(lastcount - c, d[ranges - 1].length);
}
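/*
 * Fill in the 8-byte UNMAP parameter list header: bytes 0-1 hold the
 * length of the data that follows them, bytes 2-3 the total block
 * descriptor length. Each of the 'ranges' descriptors is 16 bytes:
 * an 8-byte LBA, a 4-byte block count, and 4 reserved bytes.
 */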
scsi_ulto2b(ranges * 16 + 6, &buf[0]);
scsi_ulto2b(ranges * 16, &buf[2]);
scsi_unmap(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/0,
/*data_ptr*/ buf,
/*dxfer_len*/ ranges * 16 + 8,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
static void
da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc = (struct da_softc *)periph->softc;
struct bio *bp1;
uint8_t *buf = softc->unmap_buf;
uint64_t lastlba = (uint64_t)-1;
uint64_t count;
uint64_t lba;
uint32_t lastcount = 0, c, requestcount;
int ranges = 0, off, block_count;
bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
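/*
 * Each ATA DSM TRIM range entry is 8 bytes: a 48-bit LBA in
 * little-endian order followed by a 16-bit block count, packed
 * ATA_DSM_BLK_RANGES entries per ATA_DSM_BLK_SIZE-byte block.
 */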
bp1 = bp;
do {
if (bp1 != bp) /* XXX imp XXX */
bioq_insert_tail(&softc->delete_run_queue, bp1);
lba = bp1->bio_pblkno;
count = bp1->bio_bcount / softc->params.secsize;
requestcount = count;
/* Try to extend the previous range. */
if (lba == lastlba) {
c = omin(count, ATA_DSM_RANGE_MAX - lastcount);
lastcount += c;
off = (ranges - 1) * 8;
buf[off + 6] = lastcount & 0xff;
buf[off + 7] = (lastcount >> 8) & 0xff;
count -= c;
lba += c;
}
while (count > 0) {
c = omin(count, ATA_DSM_RANGE_MAX);
off = ranges * 8;
buf[off + 0] = lba & 0xff;
buf[off + 1] = (lba >> 8) & 0xff;
buf[off + 2] = (lba >> 16) & 0xff;
buf[off + 3] = (lba >> 24) & 0xff;
buf[off + 4] = (lba >> 32) & 0xff;
buf[off + 5] = (lba >> 40) & 0xff;
buf[off + 6] = c & 0xff;
buf[off + 7] = (c >> 8) & 0xff;
lba += c;
ranges++;
count -= c;
lastcount = c;
if (count != 0 && ranges == softc->trim_max_ranges) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
requestcount,
(softc->trim_max_ranges - ranges) *
ATA_DSM_RANGE_MAX);
break;
}
}
lastlba = lba;
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (bp1->bio_bcount / softc->params.secsize >
(softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
block_count = howmany(ranges, ATA_DSM_BLK_RANGES);
scsi_ata_trim(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
block_count,
/*data_ptr*/buf,
/*dxfer_len*/block_count * ATA_DSM_BLK_SIZE,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
/*
* We calculate ws_max_blks here based off d_delmaxsize instead
* of using softc->ws_max_blks as it is absolute max for the
* device not the protocol max which may well be lower.
*/
static void
da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
struct da_softc *softc;
struct bio *bp1;
uint64_t ws_max_blks;
uint64_t lba;
uint64_t count; /* forward compat with WS32 */
softc = (struct da_softc *)periph->softc;
ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize;
lba = bp->bio_pblkno;
count = 0;
bp1 = bp;
do {
if (bp1 != bp) /* XXX imp XXX */
bioq_insert_tail(&softc->delete_run_queue, bp1);
count += bp1->bio_bcount / softc->params.secsize;
if (count > ws_max_blks) {
xpt_print(periph->path,
"%s issuing short delete %ld > %ld\n",
da_delete_method_desc[softc->delete_method],
count, ws_max_blks);
count = omin(count, ws_max_blks);
break;
}
bp1 = cam_iosched_next_trim(softc->cam_iosched);
if (bp1 == NULL)
break;
if (lba + count != bp1->bio_pblkno ||
count + bp1->bio_bcount /
softc->params.secsize > ws_max_blks) {
cam_iosched_put_back_trim(softc->cam_iosched, bp1);
break;
}
} while (1);
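/*
 * Issue a single-sector WRITE SAME covering the accumulated range;
 * only one sector of zeroes (zero_region) is transferred and the
 * device replicates it, or unmaps with SWS_UNMAP, across 'count'
 * blocks starting at 'lba'.
 */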
scsi_write_same(&ccb->csio,
/*retries*/da_retry_count,
/*cbfcnp*/dadone,
/*tag_action*/MSG_SIMPLE_Q_TAG,
/*byte2*/softc->delete_method ==
DA_DELETE_ZERO ? 0 : SWS_UNMAP,
softc->delete_method == DA_DELETE_WS16 ? 16 : 10,
/*lba*/lba,
/*block_count*/count,
/*data_ptr*/ __DECONST(void *, zero_region),
/*dxfer_len*/ softc->params.secsize,
/*sense_len*/SSD_FULL_SIZE,
da_default_timeout * 1000);
ccb->ccb_h.ccb_state = DA_CCB_DELETE;
ccb->ccb_h.flags |= CAM_UNLOCKED;
cam_iosched_submit_trim(softc->cam_iosched);
}
static int
cmd6workaround(union ccb *ccb)
{
struct scsi_rw_6 cmd6;
struct scsi_rw_10 *cmd10;
struct da_softc *softc;
u_int8_t *cdb;
struct bio *bp;
int frozen;
cdb = ccb->csio.cdb_io.cdb_bytes;
softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
da_delete_methods old_method = softc->delete_method;
/*
* Typically there are two reasons for failure here:
* 1. Delete method was detected as supported but isn't
* 2. Delete failed due to invalid params, e.g. too big
*
* While we will attempt to choose an alternative delete method,
* this may result in short deletes if the existing delete
* requests from GEOM are too big for the new method chosen.
*
* This method assumes that the error which triggered this
* will not retry the I/O; otherwise a panic will occur.
*/
dadeleteflag(softc, old_method, 0);
dadeletemethodchoose(softc, DA_DELETE_DISABLE);
if (softc->delete_method == DA_DELETE_DISABLE)
xpt_print(ccb->ccb_h.path,
"%s failed, disabling BIO_DELETE\n",
da_delete_method_desc[old_method]);
else
xpt_print(ccb->ccb_h.path,
"%s failed, switching to %s BIO_DELETE\n",
da_delete_method_desc[old_method],
da_delete_method_desc[softc->delete_method]);
while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL)
cam_iosched_queue_work(softc->cam_iosched, bp);
cam_iosched_queue_work(softc->cam_iosched,
(struct bio *)ccb->ccb_h.ccb_bp);
ccb->ccb_h.ccb_bp = NULL;
return (0);
}
/* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == PREVENT_ALLOW) &&
(softc->quirks & DA_Q_NO_PREVENT) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"PREVENT ALLOW MEDIUM REMOVAL not supported.\n");
softc->quirks |= DA_Q_NO_PREVENT;
return (0);
}
/* Detect unsupported SYNCHRONIZE CACHE(10). */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
(*cdb == SYNCHRONIZE_CACHE) &&
(softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
if (bootverbose)
xpt_print(ccb->ccb_h.path,
"SYNCHRONIZE CACHE(10) not supported.\n");
softc->quirks |= DA_Q_NO_SYNC_CACHE;
softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE;
return (0);
}
/* Translation only possible if CDB is an array and cmd is R/W6 */
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
(*cdb != READ_6 && *cdb != WRITE_6))
return 0;
xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
"increasing minimum_cmd_size to 10.\n");
softc->minimum_cmd_size = 10;
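/*
 * Rewrite the 6-byte CDB in place as its 10-byte equivalent; e.g. a
 * READ(6) of 8 blocks at LBA 0x12345 becomes a READ(10) with the
 * 3-byte LBA widened to 4 bytes (00 01 23 45) and the 1-byte transfer
 * length widened to 2 bytes (00 08).
 */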
bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
cmd10 = (struct scsi_rw_10 *)cdb;
cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
cmd10->byte2 = 0;
scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
cmd10->reserved = 0;
scsi_ulto2b(cmd6.length, cmd10->length);
cmd10->control = cmd6.control;
ccb->csio.cdb_len = sizeof(*cmd10);
/* Requeue request, unfreezing queue if necessary */
frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_action(ccb);
if (frozen) {
cam_release_devq(ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
return (ERESTART);
}
static void
dazonedone(struct cam_periph *periph, union ccb *ccb)
{
struct da_softc *softc;
struct bio *bp;
softc = periph->softc;
bp = (struct bio *)ccb->ccb_h.ccb_bp;
switch (bp->bio_zone.zone_cmd) {
case DISK_ZONE_OPEN:
case DISK_ZONE_CLOSE:
case DISK_ZONE_FINISH:
case DISK_ZONE_RWP:
break;
case DISK_ZONE_REPORT_ZONES: {
uint32_t avail_len;
struct disk_zone_report *rep;
struct scsi_report_zones_hdr *hdr;
struct scsi_report_zones_desc *desc;
struct disk_zone_rep_entry *entry;
- uint32_t num_alloced, hdr_len, num_avail;
+ uint32_t hdr_len, num_avail;
uint32_t num_to_fill, i;
int ata;
rep = &bp->bio_zone.zone_params.report;
avail_len = ccb->csio.dxfer_len - ccb->csio.resid;
/*
* Note that bio_resid isn't normally used for zone
* commands, but it is used by devstat_end_transaction_bio()
* to determine how much data was transferred. Because
* the size of the SCSI/ATA data structures is different
* than the size of the BIO interface structures, the
* amount of data actually transferred from the drive will
* be different than the amount of data transferred to
* the user.
*/
bp->bio_resid = ccb->csio.resid;
- num_alloced = rep->entries_allocated;
hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr;
if (avail_len < sizeof(*hdr)) {
/*
* Is there a better error than EIO here? We asked
* for at least the header, and we got less than
* that.
*/
bp->bio_error = EIO;
bp->bio_flags |= BIO_ERROR;
bp->bio_resid = bp->bio_bcount;
break;
}
if (softc->zone_interface == DA_ZONE_IF_ATA_PASS)
ata = 1;
else
ata = 0;
hdr_len = ata ? le32dec(hdr->length) :
scsi_4btoul(hdr->length);
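/*
* The header's length field counts descriptor bytes only
* (it does not include the header itself), so dividing by
* the descriptor size gives the number of zones the drive
* could report.
*/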
if (hdr_len > 0)
rep->entries_available = hdr_len / sizeof(*desc);
else
rep->entries_available = 0;
/*
* NOTE: using the same values for the BIO version of the
* same field as the SCSI/ATA values. This means we could
* get some additional values that aren't defined in bio.h
* if more values of the same field are defined later.
*/
rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
rep->header.maximum_lba = ata ? le64dec(hdr->maximum_lba) :
scsi_8btou64(hdr->maximum_lba);
/*
* If the drive reports no entries that match the query,
* we're done.
*/
if (hdr_len == 0) {
rep->entries_filled = 0;
break;
}
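/*
* Clamp the descriptor count to what actually fit in our
* data buffer; the header may describe more zones than
* were transferred.
*/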
num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
hdr_len / sizeof(*desc));
/*
* If the drive didn't return any data, then we're done.
*/
if (num_avail == 0) {
rep->entries_filled = 0;
break;
}
num_to_fill = min(num_avail, rep->entries_allocated);
/*
* If the user didn't allocate any entries for us to fill,
* we're done.
*/
if (num_to_fill == 0) {
rep->entries_filled = 0;
break;
}
for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
i < num_to_fill; i++, desc++, entry++) {
/*
* NOTE: we're mapping the values here directly
* from the SCSI/ATA bit definitions to the bio.h
definitions. There is also a warning in
* disk_zone.h, but the impact is that if
* additional values are added in the SCSI/ATA
* specs these will be visible to consumers of
* this interface.
*/
entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
entry->zone_condition =
(desc->zone_flags & SRZ_ZONE_COND_MASK) >>
SRZ_ZONE_COND_SHIFT;
entry->zone_flags |= desc->zone_flags &
(SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
entry->zone_length =
ata ? le64dec(desc->zone_length) :
scsi_8btou64(desc->zone_length);
entry->zone_start_lba =
ata ? le64dec(desc->zone_start_lba) :
scsi_8btou64(desc->zone_start_lba);
entry->write_pointer_lba =
ata ? le64dec(desc->write_pointer_lba) :
scsi_8btou64(desc->write_pointer_lba);
}
rep->entries_filled = num_to_fill;
break;
}
case DISK_ZONE_GET_PARAMS:
default:
/*
* In theory we should not get a GET_PARAMS bio, since it
* should be handled without queueing the command to the
* drive.
*/
panic("%s: Invalid zone command %d", __func__,
bp->bio_zone.zone_cmd);
break;
}
if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
free(ccb->csio.data_ptr, M_SCSIDA);
}
static void
dadone(struct cam_periph *periph, union ccb *done_ccb)
{
struct da_softc *softc;
struct ccb_scsiio *csio;
u_int32_t priority;
da_ccb_state state;
softc = (struct da_softc *)periph->softc;
priority = done_ccb->ccb_h.pinfo.priority;
CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n"));
csio = &done_ccb->csio;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (csio->bio != NULL)
biotrack(csio->bio, __func__);
#endif
state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
switch (state) {
case DA_CCB_BUFFER_IO:
case DA_CCB_DELETE:
{
struct bio *bp, *bp1;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
int error;
int sf;
if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0)
sf = SF_RETRY_UA;
else
sf = 0;
error = daerror(done_ccb, CAM_RETRY_SELTO, sf);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
cam_periph_unlock(periph);
return;
}
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
if (error != 0) {
int queued_error;
/*
* return all queued I/O with EIO, so that
* the client can retry these I/Os in the
* proper order should it attempt to recover.
*/
queued_error = EIO;
if (error == ENXIO
&& (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
/*
* Catastrophic error. Mark our pack as
* invalid.
*/
/*
* XXX See if this is really a media
* XXX change first?
*/
xpt_print(periph->path,
"Invalidating pack\n");
softc->flags |= DA_FLAG_PACK_INVALID;
#ifdef CAM_IO_STATS
softc->invalidations++;
#endif
queued_error = ENXIO;
}
cam_iosched_flush(softc->cam_iosched, NULL,
queued_error);
if (bp != NULL) {
bp->bio_error = error;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
}
} else if (bp != NULL) {
if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
bp->bio_error = 0;
if (bp->bio_resid != 0)
bp->bio_flags |= BIO_ERROR;
}
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
} else if (bp != NULL) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
panic("REQ_CMP with QFRZN");
if (bp->bio_cmd == BIO_ZONE)
dazonedone(periph, done_ccb);
else if (state == DA_CCB_DELETE)
bp->bio_resid = 0;
else
bp->bio_resid = csio->resid;
if ((csio->resid > 0)
&& (bp->bio_cmd != BIO_ZONE))
bp->bio_flags |= BIO_ERROR;
if (softc->error_inject != 0) {
bp->bio_error = softc->error_inject;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
softc->error_inject = 0;
}
}
if (bp != NULL)
biotrack(bp, __func__);
LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
if (LIST_EMPTY(&softc->pending_ccbs))
softc->flags |= DA_FLAG_WAS_OTAG;
/*
* We need to call cam_iosched before we call biodone so that we
* don't measure any activity that happens in the completion
* routine, which in the case of sendfile can be quite
* extensive.
*/
cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
if (state == DA_CCB_DELETE) {
TAILQ_HEAD(, bio) queue;
TAILQ_INIT(&queue);
TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue);
softc->delete_run_queue.insert_point = NULL;
/*
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
* delete_running set to 0 before the call above to
* allow other I/O to progress when many BIO_DELETE
* requests are pushed down. We set delete_running to 0
* and call daschedule again so that we don't stall if
* there are no other I/Os pending apart from BIO_DELETEs.
*/
cam_iosched_trim_done(softc->cam_iosched);
daschedule(periph);
cam_periph_unlock(periph);
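/*
* Complete every bio that was collapsed into this trim
* request with the same status as the trim itself.
*/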
while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
TAILQ_REMOVE(&queue, bp1, bio_queue);
bp1->bio_error = bp->bio_error;
if (bp->bio_flags & BIO_ERROR) {
bp1->bio_flags |= BIO_ERROR;
bp1->bio_resid = bp1->bio_bcount;
} else
bp1->bio_resid = 0;
biodone(bp1);
}
} else {
daschedule(periph);
cam_periph_unlock(periph);
}
if (bp != NULL)
biodone(bp);
return;
}
case DA_CCB_PROBE_RC:
case DA_CCB_PROBE_RC16:
{
struct scsi_read_capacity_data *rdcap;
struct scsi_read_capacity_data_long *rcaplong;
char *announce_buf;
int lbp;
lbp = 0;
rdcap = NULL;
rcaplong = NULL;
/* XXX TODO: can this be a malloc? */
announce_buf = softc->announce_temp;
bzero(announce_buf, DA_ANNOUNCETMP_SZ);
if (state == DA_CCB_PROBE_RC)
rdcap =(struct scsi_read_capacity_data *)csio->data_ptr;
else
rcaplong = (struct scsi_read_capacity_data_long *)
csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct disk_params *dp;
uint32_t block_size;
uint64_t maxsector;
u_int lalba; /* Lowest aligned LBA. */
if (state == DA_CCB_PROBE_RC) {
block_size = scsi_4btoul(rdcap->length);
maxsector = scsi_4btoul(rdcap->addr);
lalba = 0;
/*
* According to SBC-2, if the standard 10
* byte READ CAPACITY command returns 2^32,
* we should issue the 16 byte version of
* the command, since the device in question
* has more sectors than can be represented
* with the short version of the command.
*/
if (maxsector == 0xffffffff) {
free(rdcap, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_RC16;
xpt_schedule(periph, priority);
return;
}
} else {
block_size = scsi_4btoul(rcaplong->length);
maxsector = scsi_8btou64(rcaplong->addr);
lalba = scsi_2btoul(rcaplong->lalba_lbp);
}
/*
* Because the GEOM code will just panic us if we
* give it an 'illegal' value, we'll avoid that
* here.
*/
if (block_size == 0) {
block_size = 512;
if (maxsector == 0)
maxsector = -1;
}
if (block_size >= MAXPHYS) {
xpt_print(periph->path,
"unsupportable block size %ju\n",
(uintmax_t) block_size);
announce_buf = NULL;
cam_periph_invalidate(periph);
} else {
/*
* We pass rcaplong into dasetgeom(),
* because it will only use it if it is
* non-NULL.
*/
dasetgeom(periph, block_size, maxsector,
rcaplong, sizeof(*rcaplong));
lbp = (lalba & SRC16_LBPME_A);
dp = &softc->params;
snprintf(announce_buf, DA_ANNOUNCETMP_SZ,
"%juMB (%ju %u byte sectors)",
((uintmax_t)dp->secsize * dp->sectors) /
(1024 * 1024),
(uintmax_t)dp->sectors, dp->secsize);
}
} else {
int error;
/*
* Retry any UNIT ATTENTION type errors. They
* are expected at boot.
*/
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
} else if (error != 0) {
int asc, ascq;
int sense_key, error_code;
int have_sense;
cam_status status;
struct ccb_getdev cgd;
/* Don't wedge this device's queue */
status = done_ccb->ccb_h.status;
if ((status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
xpt_setup_ccb(&cgd.ccb_h,
done_ccb->ccb_h.path,
CAM_PRIORITY_NORMAL);
cgd.ccb_h.func_code = XPT_GDEV_TYPE;
xpt_action((union ccb *)&cgd);
if (scsi_extract_sense_ccb(done_ccb,
&error_code, &sense_key, &asc, &ascq))
have_sense = TRUE;
else
have_sense = FALSE;
/*
* If we tried READ CAPACITY(16) and failed,
* fallback to READ CAPACITY(10).
*/
if ((state == DA_CCB_PROBE_RC16) &&
(softc->flags & DA_FLAG_CAN_RC16) &&
(((csio->ccb_h.status & CAM_STATUS_MASK) ==
CAM_REQ_INVALID) ||
((have_sense) &&
(error_code == SSD_CURRENT_ERROR) &&
(sense_key == SSD_KEY_ILLEGAL_REQUEST)))) {
softc->flags &= ~DA_FLAG_CAN_RC16;
free(rdcap, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_RC;
xpt_schedule(periph, priority);
return;
}
/*
* Attach to anything that claims to be a
* direct access or optical disk device,
* as long as it doesn't return a "Logical
* unit not supported" (0x25) error.
* "Internal Target Failure" (0x44) is also
* special and typically means that the
* device is a SATA drive behind a SATL
* translation that's fallen into a
* terminally fatal state.
*/
if ((have_sense)
&& (asc != 0x25) && (asc != 0x44)
&& (error_code == SSD_CURRENT_ERROR)) {
const char *sense_key_desc;
const char *asc_desc;
dasetgeom(periph, 512, -1, NULL, 0);
scsi_sense_desc(sense_key, asc, ascq,
&cgd.inq_data,
&sense_key_desc,
&asc_desc);
snprintf(announce_buf,
DA_ANNOUNCETMP_SZ,
"Attempt to query device "
"size failed: %s, %s",
sense_key_desc, asc_desc);
} else {
if (have_sense)
scsi_sense_print(
&done_ccb->csio);
else {
xpt_print(periph->path,
"got CAM status %#x\n",
done_ccb->ccb_h.status);
}
xpt_print(periph->path, "fatal error, "
"failed to attach to device\n");
announce_buf = NULL;
/*
* Free up resources.
*/
cam_periph_invalidate(periph);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if (announce_buf != NULL &&
((softc->flags & DA_FLAG_ANNOUNCED) == 0)) {
struct sbuf sb;
sbuf_new(&sb, softc->announcebuf, DA_ANNOUNCE_SZ,
SBUF_FIXEDLEN);
xpt_announce_periph_sbuf(periph, &sb, announce_buf);
xpt_announce_quirks_sbuf(periph, &sb, softc->quirks,
DA_Q_BIT_STRING);
sbuf_finish(&sb);
sbuf_putbuf(&sb);
/*
* Create our sysctl variables, now that we know
* we have successfully attached.
*/
/* increase the refcount */
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
taskqueue_enqueue(taskqueue_thread,
&softc->sysctl_task);
} else {
/* XXX This message is useless! */
xpt_print(periph->path, "fatal error, "
"could not acquire reference count\n");
}
}
/* We already probed the device. */
if (softc->flags & DA_FLAG_PROBED) {
daprobedone(periph, done_ccb);
return;
}
/* Ensure re-probe doesn't see old delete. */
softc->delete_available = 0;
dadeleteflag(softc, DA_DELETE_ZERO, 1);
if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) {
/*
* Based on older SBC-3 spec revisions,
* any of the UNMAP methods "may" be
* available via LBP given this flag, so
* we flag all of them as available and
* then remove those which further
* probes confirm aren't available
* later.
*
* We could also check the readcap(16)
* p_type flag to exclude one or more
* invalid write same (X) types here.
*/
dadeleteflag(softc, DA_DELETE_WS16, 1);
dadeleteflag(softc, DA_DELETE_WS10, 1);
dadeleteflag(softc, DA_DELETE_UNMAP, 1);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_LBP;
xpt_schedule(periph, priority);
return;
}
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BDC;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_LBP:
{
struct scsi_vpd_logical_block_prov *lbp;
lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
/*
* T10/1799-D Revision 31 states at least one of these
* must be supported but we don't currently enforce this.
*/
dadeleteflag(softc, DA_DELETE_WS16,
(lbp->flags & SVPD_LBP_WS16));
dadeleteflag(softc, DA_DELETE_WS10,
(lbp->flags & SVPD_LBP_WS10));
dadeleteflag(softc, DA_DELETE_UNMAP,
(lbp->flags & SVPD_LBP_UNMAP));
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure indicates we don't support any SBC-3
* delete methods with UNMAP
*/
}
}
free(lbp, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BLK_LIMITS;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_BLK_LIMITS:
{
struct scsi_vpd_block_limits *block_limits;
block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t max_txfer_len = scsi_4btoul(
block_limits->max_txfer_len);
uint32_t max_unmap_lba_cnt = scsi_4btoul(
block_limits->max_unmap_lba_cnt);
uint32_t max_unmap_blk_cnt = scsi_4btoul(
block_limits->max_unmap_blk_cnt);
uint32_t unmap_gran = scsi_4btoul(
block_limits->opt_unmap_grain);
uint32_t unmap_gran_align = scsi_4btoul(
block_limits->unmap_grain_align);
uint64_t ws_max_blks = scsi_8btou64(
block_limits->max_write_same_length);
if (max_txfer_len != 0) {
softc->disk->d_maxsize = MIN(softc->maxio,
(off_t)max_txfer_len * softc->params.secsize);
}
/*
* We should already support UNMAP but we check lba
* and block count to be sure
*/
if (max_unmap_lba_cnt != 0x00L &&
max_unmap_blk_cnt != 0x00L) {
softc->unmap_max_lba = max_unmap_lba_cnt;
softc->unmap_max_ranges = min(max_unmap_blk_cnt,
UNMAP_MAX_RANGES);
if (unmap_gran > 1) {
softc->unmap_gran = unmap_gran;
if (unmap_gran_align & 0x80000000) {
softc->unmap_gran_align =
unmap_gran_align &
0x7fffffff;
}
}
} else {
/*
* Unexpected UNMAP limits which means the
* device doesn't actually support UNMAP
*/
dadeleteflag(softc, DA_DELETE_UNMAP, 0);
}
if (ws_max_blks != 0x00L)
softc->ws_max_blks = ws_max_blks;
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
/*
* Failure here doesn't mean UNMAP is not
* supported as this is an optional page.
*/
softc->unmap_max_lba = 1;
softc->unmap_max_ranges = 1;
}
}
free(block_limits, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_BDC;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_BDC:
{
struct scsi_vpd_block_device_characteristics *bdc;
bdc = (struct scsi_vpd_block_device_characteristics *)
csio->data_ptr;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
/*
* Disable queue sorting for non-rotational media
* by default.
*/
u_int16_t old_rate = softc->disk->d_rotation_rate;
valid_len = csio->dxfer_len - csio->resid;
if (SBDC_IS_PRESENT(bdc, valid_len,
medium_rotation_rate)) {
softc->disk->d_rotation_rate =
scsi_2btoul(bdc->medium_rotation_rate);
if (softc->disk->d_rotation_rate ==
SVPD_BDC_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(
softc->cam_iosched, 0);
softc->rotating = 0;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
}
if ((SBDC_IS_PRESENT(bdc, valid_len, flags))
&& (softc->zone_mode == DA_ZONE_NONE)) {
int ata_proto;
if (scsi_vpd_supported_page(periph,
SVPD_ATA_INFORMATION))
ata_proto = 1;
else
ata_proto = 0;
/*
* The Zoned field will only be set for
* Drive Managed and Host Aware drives. If
* they are Host Managed, the device type
* in the standard INQUIRY data should be
* set to T_ZBC_HM (0x14).
*/
if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_HAW_ZBC) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) ==
SVPD_DM_ZBC) {
softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
softc->zone_interface = (ata_proto) ?
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI;
} else if ((bdc->flags & SVPD_ZBC_MASK) !=
SVPD_ZBC_NR) {
xpt_print(periph->path, "Unknown zoned "
"type %#x",
bdc->flags & SVPD_ZBC_MASK);
}
}
} else {
int error;
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(bdc, M_SCSIDA);
xpt_release_ccb(done_ccb);
softc->state = DA_STATE_PROBE_ATA;
xpt_schedule(periph, priority);
return;
}
case DA_CCB_PROBE_ATA:
{
int i;
struct ata_params *ata_params;
int continue_probe;
int error;
uint16_t *ptr;
ata_params = (struct ata_params *)csio->data_ptr;
ptr = (uint16_t *)ata_params;
continue_probe = 0;
error = 0;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint16_t old_rate;
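/*
* The IDENTIFY data is an array of little-endian 16-bit
* words; byte-swap it in place for the host before
* looking at any fields.
*/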
for (i = 0; i < sizeof(*ata_params) / 2; i++)
ptr[i] = le16toh(ptr[i]);
if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM &&
(softc->quirks & DA_Q_NO_UNMAP) == 0) {
dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1);
if (ata_params->max_dsm_blocks != 0)
softc->trim_max_ranges = min(
softc->trim_max_ranges,
ata_params->max_dsm_blocks *
ATA_DSM_BLK_RANGES);
}
/*
* Disable queue sorting for non-rotational media
* by default.
*/
old_rate = softc->disk->d_rotation_rate;
softc->disk->d_rotation_rate =
ata_params->media_rotation_rate;
if (softc->disk->d_rotation_rate ==
ATA_RATE_NON_ROTATING) {
cam_iosched_set_sort_queue(softc->cam_iosched, 0);
softc->rotating = 0;
}
if (softc->disk->d_rotation_rate != old_rate) {
disk_attr_changed(softc->disk,
"GEOM::rotation_rate", M_NOWAIT);
}
if (ata_params->capabilities1 & ATA_SUPPORT_DMA)
softc->flags |= DA_FLAG_CAN_ATA_DMA;
if (ata_params->support.extension &
ATA_SUPPORT_GENLOG)
softc->flags |= DA_FLAG_CAN_ATA_LOG;
/*
* At this point, if we have a SATA host aware drive,
* we communicate via ATA passthrough unless the
* SAT layer supports ZBC -> ZAC translation. In
* that case, we prefer to use the ZBC -> ZAC
* translation provided by the SAT layer.
*/
/*
* XXX KDM figure out how to detect a host managed
* SATA drive.
*/
if (softc->zone_mode == DA_ZONE_NONE) {
/*
* Note that we don't override the zone
* mode or interface if it has already been
* set. This is because it has either been
* set as a quirk, or when we probed the
* SCSI Block Device Characteristics page,
* the zoned field was set. The latter
* means that the SAT layer supports ZBC to
* ZAC translation, and we would prefer to
* use that if it is available.
*/
if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE) {
softc->zone_mode = DA_ZONE_HOST_AWARE;
softc->zone_interface =
DA_ZONE_IF_ATA_PASS;
} else if ((ata_params->support3 &
ATA_SUPPORT_ZONE_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED) {
softc->zone_mode =DA_ZONE_DRIVE_MANAGED;
softc->zone_interface =
DA_ZONE_IF_ATA_PASS;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(ata_params, M_SCSIDA);
if ((softc->zone_mode == DA_ZONE_HOST_AWARE)
|| (softc->zone_mode == DA_ZONE_HOST_MANAGED)) {
/*
* If the ATA IDENTIFY failed, we could be talking
* to a SCSI drive, although that seems unlikely,
* since the drive did report that it supported the
* ATA Information VPD page. If the ATA IDENTIFY
* succeeded, and the SAT layer doesn't support
* ZBC -> ZAC translation, continue on to get the
* directory of ATA logs, and complete the rest of
* the ZAC probe. If the SAT layer does support
* ZBC -> ZAC translation, we want to use that,
* and we'll probe the SCSI Zoned Block Device
* Characteristics VPD page next.
*/
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_LOG)
&& (softc->zone_interface == DA_ZONE_IF_ATA_PASS))
softc->state = DA_STATE_PROBE_ATA_LOGDIR;
else
softc->state = DA_STATE_PROBE_ZONE;
continue_probe = 1;
}
if (continue_probe != 0) {
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
} else
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_LOGDIR:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
error = 0;
softc->valid_logdir_len = 0;
bzero(&softc->ata_logdir, sizeof(softc->ata_logdir));
softc->valid_logdir_len =
csio->dxfer_len - csio->resid;
if (softc->valid_logdir_len > 0)
bcopy(csio->data_ptr, &softc->ata_logdir,
min(softc->valid_logdir_len,
sizeof(softc->ata_logdir)));
/*
* Figure out whether the Identify Device log is
* supported. The General Purpose log directory
* has a header, and lists the number of pages
* available for each GP log identified by the
* offset into the list.
*/
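/*
* The directory has one 16-bit page-count entry per log
* address; entry 0 is the directory version word, so the
* count for log N lives at byte offset N * 2 - 2 within
* num_pages[].
*/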
if ((softc->valid_logdir_len >=
((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t)))
&& (le16dec(softc->ata_logdir.header) ==
ATA_GP_LOG_DIR_VERSION)
&& (le16dec(&softc->ata_logdir.num_pages[
(ATA_IDENTIFY_DATA_LOG *
sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){
softc->flags |= DA_FLAG_CAN_ATA_IDLOG;
} else {
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA log directory,
* then ATA logs are effectively not
* supported even if the bit is set in the
* identify data.
*/
softc->flags &= ~(DA_FLAG_CAN_ATA_LOG |
DA_FLAG_CAN_ATA_IDLOG);
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) {
softc->state = DA_STATE_PROBE_ATA_IDDIR;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_IDDIR:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
off_t entries_offset, max_entries;
error = 0;
softc->valid_iddir_len = 0;
bzero(&softc->ata_iddir, sizeof(softc->ata_iddir));
softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP |
DA_FLAG_CAN_ATA_ZONE);
softc->valid_iddir_len =
csio->dxfer_len - csio->resid;
if (softc->valid_iddir_len > 0)
bcopy(csio->data_ptr, &softc->ata_iddir,
min(softc->valid_iddir_len,
sizeof(softc->ata_iddir)));
entries_offset =
__offsetof(struct ata_identify_log_pages,entries);
max_entries = softc->valid_iddir_len - entries_offset;
if ((softc->valid_iddir_len > (entries_offset + 1))
&& (le64dec(softc->ata_iddir.header) ==
ATA_IDLOG_REVISION)
&& (softc->ata_iddir.entry_count > 0)) {
int num_entries, i;
num_entries = softc->ata_iddir.entry_count;
num_entries = min(num_entries,
softc->valid_iddir_len - entries_offset);
for (i = 0; i < num_entries &&
i < max_entries; i++) {
if (softc->ata_iddir.entries[i] ==
ATA_IDL_SUP_CAP)
softc->flags |=
DA_FLAG_CAN_ATA_SUPCAP;
else if (softc->ata_iddir.entries[i]==
ATA_IDL_ZDI)
softc->flags |=
DA_FLAG_CAN_ATA_ZONE;
if ((softc->flags &
DA_FLAG_CAN_ATA_SUPCAP)
&& (softc->flags &
DA_FLAG_CAN_ATA_ZONE))
break;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data log
* directory, then it effectively isn't
* supported even if the ATA Log directory
* reports a non-zero number of pages present
* for this log.
*/
softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) {
softc->state = DA_STATE_PROBE_ATA_SUP;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_SUP:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_size;
struct ata_identify_log_sup_cap *sup_cap;
error = 0;
sup_cap = (struct ata_identify_log_sup_cap *)
csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size =
__offsetof(struct ata_identify_log_sup_cap,
sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap);
if (valid_len >= needed_size) {
uint64_t zoned, zac_cap;
zoned = le64dec(sup_cap->zoned_cap);
if (zoned & ATA_ZONED_VALID) {
/*
* This should have already been
* set, because this is also in the
* ATA identify data.
*/
if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_HOST_AWARE)
softc->zone_mode =
DA_ZONE_HOST_AWARE;
else if ((zoned & ATA_ZONED_MASK) ==
ATA_SUPPORT_ZONE_DEV_MANAGED)
softc->zone_mode =
DA_ZONE_DRIVE_MANAGED;
}
zac_cap = le64dec(sup_cap->sup_zac_cap);
if (zac_cap & ATA_SUP_ZAC_CAP_VALID) {
if (zac_cap & ATA_REPORT_ZONES_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RZ_SUP;
if (zac_cap & ATA_ND_OPEN_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_OPEN_SUP;
if (zac_cap & ATA_ND_CLOSE_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_CLOSE_SUP;
if (zac_cap & ATA_ND_FINISH_ZONE_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_FINISH_SUP;
if (zac_cap & ATA_ND_RWP_SUP)
softc->zone_flags |=
DA_ZONE_FLAG_RWP_SUP;
} else {
/*
* This field was introduced in
* ACS-4, r08 on April 28th, 2015.
* If the drive firmware was written
* to an earlier spec, it won't have
* the field. So, assume all
* commands are supported.
*/
softc->zone_flags |=
DA_ZONE_FLAG_SUP_MASK;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
/*
* If we can't get the ATA Identify Data
* Supported Capabilities page, clear the
* flag...
*/
softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP;
/*
* And clear zone capabilities.
*/
softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
if ((error == 0)
&& (softc->flags & DA_FLAG_CAN_ATA_ZONE)) {
softc->state = DA_STATE_PROBE_ATA_ZONE;
xpt_release_ccb(done_ccb);
xpt_schedule(periph, priority);
return;
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ATA_ZONE:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
struct ata_zoned_info_log *zi_log;
uint32_t valid_len;
size_t needed_size;
zi_log = (struct ata_zoned_info_log *)csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_size = __offsetof(struct ata_zoned_info_log,
version_info) + 1 + sizeof(zi_log->version_info);
if (valid_len >= needed_size) {
uint64_t tmpvar;
tmpvar = le64dec(zi_log->zoned_cap);
if (tmpvar & ATA_ZDI_CAP_VALID) {
if (tmpvar & ATA_ZDI_CAP_URSWRZ)
softc->zone_flags |=
DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~DA_ZONE_FLAG_URSWRZ;
}
tmpvar = le64dec(zi_log->optimal_seq_zones);
if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = (tmpvar &
ATA_ZDI_OPT_SEQ_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_seq_zones = 0;
}
tmpvar =le64dec(zi_log->optimal_nonseq_zones);
if (tmpvar & ATA_ZDI_OPT_NS_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones =
(tmpvar & ATA_ZDI_OPT_NS_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->optimal_nonseq_zones = 0;
}
tmpvar = le64dec(zi_log->max_seq_req_zones);
if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) {
softc->zone_flags |=
DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones =
(tmpvar & ATA_ZDI_MAX_SEQ_MASK);
} else {
softc->zone_flags &=
~DA_ZONE_FLAG_MAX_SEQ_SET;
softc->max_seq_zones = 0;
}
}
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
softc->flags &= ~DA_FLAG_CAN_ATA_ZONE;
softc->flags &= ~DA_ZONE_FLAG_SET_MASK;
if ((done_ccb->ccb_h.status &
CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
free(csio->data_ptr, M_SCSIDA);
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_PROBE_ZONE:
{
int error;
if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
uint32_t valid_len;
size_t needed_len;
struct scsi_vpd_zoned_bdc *zoned_bdc;
error = 0;
zoned_bdc = (struct scsi_vpd_zoned_bdc *)
csio->data_ptr;
valid_len = csio->dxfer_len - csio->resid;
needed_len = __offsetof(struct scsi_vpd_zoned_bdc,
max_seq_req_zones) + 1 +
sizeof(zoned_bdc->max_seq_req_zones);
if ((valid_len >= needed_len)
&& (scsi_2btoul(zoned_bdc->page_length) >=
SVPD_ZBDC_PL)) {
if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ)
softc->zone_flags |=
DA_ZONE_FLAG_URSWRZ;
else
softc->zone_flags &=
~DA_ZONE_FLAG_URSWRZ;
softc->optimal_seq_zones =
scsi_4btoul(zoned_bdc->optimal_seq_zones);
softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET;
softc->optimal_nonseq_zones = scsi_4btoul(
zoned_bdc->optimal_nonseq_zones);
softc->zone_flags |=
DA_ZONE_FLAG_OPT_NONSEQ_SET;
softc->max_seq_zones =
scsi_4btoul(zoned_bdc->max_seq_req_zones);
softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET;
}
/*
* All of the zone commands are mandatory for SCSI
* devices.
*
* XXX KDM this is valid as of September 2015.
* Re-check this assumption once the SAT spec is
* updated to support SCSI ZBC to ATA ZAC mapping.
* Since ATA allows zone commands to be reported
* as supported or not, this may not necessarily
* be true for an ATA device behind a SAT (SCSI to
* ATA Translation) layer.
*/
softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK;
} else {
error = daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA|SF_NO_PRINT);
if (error == ERESTART)
return;
else if (error != 0) {
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
/* Don't wedge this device's queue */
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
}
}
daprobedone(periph, done_ccb);
return;
}
case DA_CCB_DUMP:
/* No-op. We're polling */
return;
case DA_CCB_TUR:
{
if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
if (daerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) ==
ERESTART)
return;
if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(done_ccb->ccb_h.path,
/*relsim_flags*/0,
/*reduction*/0,
/*timeout*/0,
/*getcount_only*/0);
}
xpt_release_ccb(done_ccb);
cam_periph_release_locked(periph);
return;
}
default:
break;
}
xpt_release_ccb(done_ccb);
}
static void
dareprobe(struct cam_periph *periph)
{
struct da_softc *softc;
cam_status status;
softc = (struct da_softc *)periph->softc;
/* Probe in progress; don't interfere. */
if (softc->state != DA_STATE_NORMAL)
return;
status = cam_periph_acquire(periph);
KASSERT(status == CAM_REQ_CMP,
("dareprobe: cam_periph_acquire failed"));
if (softc->flags & DA_FLAG_CAN_RC16)
softc->state = DA_STATE_PROBE_RC16;
else
softc->state = DA_STATE_PROBE_RC;
xpt_schedule(periph, CAM_PRIORITY_DEV);
}
static int
daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct da_softc *softc;
struct cam_periph *periph;
int error, error_code, sense_key, asc, ascq;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->csio.bio != NULL)
biotrack(ccb->csio.bio, __func__);
#endif
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct da_softc *)periph->softc;
/*
* Automatically detect devices that do not support
* READ(6)/WRITE(6) and upgrade to using 10 byte cdbs.
*/
error = 0;
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
error = cmd6workaround(ccb);
} else if (scsi_extract_sense_ccb(ccb,
&error_code, &sense_key, &asc, &ascq)) {
if (sense_key == SSD_KEY_ILLEGAL_REQUEST)
error = cmd6workaround(ccb);
/*
* If the target replied with CAPACITY DATA HAS CHANGED UA,
* query the capacity and notify upper layers.
*/
else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x2A && ascq == 0x09) {
xpt_print(periph->path, "Capacity data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x28 && ascq == 0x00) {
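/*
* ASC/ASCQ 0x28/0x00: "Not ready to ready change,
* medium may have changed".
*/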
softc->flags &= ~DA_FLAG_PROBED;
disk_media_changed(softc->disk, M_NOWAIT);
} else if (sense_key == SSD_KEY_UNIT_ATTENTION &&
asc == 0x3F && ascq == 0x03) {
xpt_print(periph->path, "INQUIRY data has changed\n");
softc->flags &= ~DA_FLAG_PROBED;
dareprobe(periph);
sense_flags |= SF_NO_PRINT;
} else if (sense_key == SSD_KEY_NOT_READY &&
asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
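/* ASC 0x3a: medium not present. */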
softc->flags |= DA_FLAG_PACK_INVALID;
disk_media_gone(softc->disk, M_NOWAIT);
}
}
if (error == ERESTART)
return (ERESTART);
#ifdef CAM_IO_STATS
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
case CAM_CMD_TIMEOUT:
softc->timeouts++;
break;
case CAM_REQ_ABORTED:
case CAM_REQ_CMP_ERR:
case CAM_REQ_TERMIO:
case CAM_UNREC_HBA_ERROR:
case CAM_DATA_RUN_ERR:
softc->errors++;
break;
default:
break;
}
#endif
/*
* XXX
* Until we have a better way of doing pack validation,
* don't treat UAs as errors.
*/
sense_flags |= SF_RETRY_UA;
if (softc->quirks & DA_Q_RETRY_BUSY)
sense_flags |= SF_RETRY_BUSY;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
static void
damediapoll(void *arg)
{
struct cam_periph *periph = arg;
struct da_softc *softc = periph->softc;
if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) &&
LIST_EMPTY(&softc->pending_ccbs)) {
if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR);
daschedule(periph);
}
}
/* Queue us up again */
if (da_poll_period != 0)
callout_schedule(&softc->mediapoll_c, da_poll_period * hz);
}
static void
daprevent(struct cam_periph *periph, int action)
{
struct da_softc *softc;
union ccb *ccb;
int error;
softc = (struct da_softc *)periph->softc;
if (((action == PR_ALLOW)
&& (softc->flags & DA_FLAG_PACK_LOCKED) == 0)
|| ((action == PR_PREVENT)
&& (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) {
return;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_prevent(&ccb->csio,
/*retries*/1,
/*cbcfp*/dadone,
MSG_SIMPLE_Q_TAG,
action,
SSD_FULL_SIZE,
5000);
error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat);
if (error == 0) {
if (action == PR_ALLOW)
softc->flags &= ~DA_FLAG_PACK_LOCKED;
else
softc->flags |= DA_FLAG_PACK_LOCKED;
}
xpt_release_ccb(ccb);
}
static void
dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector,
struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len)
{
struct ccb_calc_geometry ccg;
struct da_softc *softc;
struct disk_params *dp;
u_int lbppbe, lalba;
int error;
softc = (struct da_softc *)periph->softc;
dp = &softc->params;
dp->secsize = block_len;
dp->sectors = maxsector + 1;
if (rcaplong != NULL) {
lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
lalba = scsi_2btoul(rcaplong->lalba_lbp);
lalba &= SRC16_LALBA_A;
} else {
lbppbe = 0;
lalba = 0;
}
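/*
* lbppbe is the logical-blocks-per-physical-block exponent
* from READ CAPACITY(16); e.g. 512-byte logical blocks with
* lbppbe == 3 yield a 4096-byte stripe (physical block),
* and lalba gives the alignment of the lowest aligned LBA
* within it.
*/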
if (lbppbe > 0) {
dp->stripesize = block_len << lbppbe;
dp->stripeoffset = (dp->stripesize - block_len * lalba) %
dp->stripesize;
} else if (softc->quirks & DA_Q_4K) {
dp->stripesize = 4096;
dp->stripeoffset = 0;
} else if (softc->unmap_gran != 0) {
dp->stripesize = block_len * softc->unmap_gran;
dp->stripeoffset = (dp->stripesize - block_len *
softc->unmap_gran_align) % dp->stripesize;
} else {
dp->stripesize = 0;
dp->stripeoffset = 0;
}
/*
* Have the controller provide us with a geometry
* for this disk. The only time the geometry
* matters is when we boot and the controller
* is the only one knowledgeable enough to come
* up with something that will make this a bootable
* device.
*/
xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
ccg.ccb_h.func_code = XPT_CALC_GEOMETRY;
ccg.block_size = dp->secsize;
ccg.volume_size = dp->sectors;
ccg.heads = 0;
ccg.secs_per_track = 0;
ccg.cylinders = 0;
xpt_action((union ccb*)&ccg);
if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
/*
* We don't know what went wrong here, but just pick
* a geometry so we don't have nasty things like divide
* by zero.
*/
dp->heads = 255;
dp->secs_per_track = 255;
dp->cylinders = dp->sectors / (255 * 255);
if (dp->cylinders == 0) {
dp->cylinders = 1;
}
} else {
dp->heads = ccg.heads;
dp->secs_per_track = ccg.secs_per_track;
dp->cylinders = ccg.cylinders;
}
/*
* If the user supplied a read capacity buffer, and if it is
* different than the previous buffer, update the data in the EDT.
* If it's the same, we don't bother. This avoids sending an
* update every time someone opens this device.
*/
if ((rcaplong != NULL)
&& (bcmp(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len)) != 0)) {
struct ccb_dev_advinfo cdai;
xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
cdai.buftype = CDAI_TYPE_RCAPLONG;
cdai.flags = CDAI_FLAG_STORE;
cdai.bufsiz = rcap_len;
cdai.buf = (uint8_t *)rcaplong;
xpt_action((union ccb *)&cdai);
if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
if (cdai.ccb_h.status != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: failed to set read "
"capacity advinfo\n", __func__);
/* Use cam_error_print() to decode the status */
cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS,
CAM_EPF_ALL);
} else {
bcopy(rcaplong, &softc->rcaplong,
min(sizeof(softc->rcaplong), rcap_len));
}
}
softc->disk->d_sectorsize = softc->params.secsize;
softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors;
softc->disk->d_stripesize = softc->params.stripesize;
softc->disk->d_stripeoffset = softc->params.stripeoffset;
/* XXX: these are not actually "firmware" values, so they may be wrong */
softc->disk->d_fwsectors = softc->params.secs_per_track;
softc->disk->d_fwheads = softc->params.heads;
softc->disk->d_devstat->block_size = softc->params.secsize;
softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
error = disk_resize(softc->disk, M_NOWAIT);
if (error != 0)
xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error);
}
static void
dasendorderedtag(void *arg)
{
struct da_softc *softc = arg;
if (da_send_ordered) {
if (!LIST_EMPTY(&softc->pending_ccbs)) {
if ((softc->flags & DA_FLAG_WAS_OTAG) == 0)
softc->flags |= DA_FLAG_NEED_OTAG;
softc->flags &= ~DA_FLAG_WAS_OTAG;
}
}
/* Queue us up again */
callout_reset(&softc->sendordered_c,
(da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
dasendorderedtag, softc);
}
/*
* Step through all DA peripheral drivers, and if the device is still open,
* sync the disk cache to physical media.
*/
static void
dashutdown(void * arg, int howto)
{
struct cam_periph *periph;
struct da_softc *softc;
union ccb *ccb;
int error;
CAM_PERIPH_FOREACH(periph, &dadriver) {
softc = (struct da_softc *)periph->softc;
if (SCHEDULER_STOPPED()) {
/* If we panicked with the lock held, do not recurse. */
if (!cam_periph_owned(periph) &&
(softc->flags & DA_FLAG_OPEN)) {
dadump(softc->disk, NULL, 0, 0, 0);
}
continue;
}
cam_periph_lock(periph);
/*
* We only sync the cache if the drive is still open, and
* if the drive is capable of it.
*/
if (((softc->flags & DA_FLAG_OPEN) == 0)
|| (softc->quirks & DA_Q_NO_SYNC_CACHE)) {
cam_periph_unlock(periph);
continue;
}
ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
scsi_synchronize_cache(&ccb->csio,
/*retries*/0,
/*cbfcnp*/dadone,
MSG_SIMPLE_Q_TAG,
/*begin_lba*/0, /* whole disk */
/*lb_count*/0,
SSD_FULL_SIZE,
60 * 60 * 1000);
error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
/*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR,
softc->disk->d_devstat);
if (error != 0)
xpt_print(periph->path, "Synchronize cache failed\n");
xpt_release_ccb(ccb);
cam_periph_unlock(periph);
}
}
#else /* !_KERNEL */
/*
* XXX These are only left out of the kernel build to silence warnings. If,
* for some reason, these functions are used in the kernel, the ifdefs should
* be moved so they are included both in the kernel and userland.
*/
void
scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_format_unit *scsi_cmd;
scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = FORMAT_UNIT;
scsi_cmd->byte2 = byte2;
scsi_ulto2b(ileave, scsi_cmd->interleave);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t list_format,
uint32_t addr_desc_index, uint8_t *data_ptr,
uint32_t dxfer_len, int minimum_cmd_size,
uint8_t sense_len, uint32_t timeout)
{
uint8_t cdb_len;
/*
* These conditions allow using the 10 byte command. Otherwise we
* need to use the 12 byte command.
*/
if ((minimum_cmd_size <= 10)
&& (addr_desc_index == 0)
&& (dxfer_len <= SRDD10_MAX_LENGTH)) {
struct scsi_read_defect_data_10 *cdb10;
cdb10 = (struct scsi_read_defect_data_10 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb10);
bzero(cdb10, cdb_len);
cdb10->opcode = READ_DEFECT_DATA_10;
cdb10->format = list_format;
scsi_ulto2b(dxfer_len, cdb10->alloc_length);
} else {
struct scsi_read_defect_data_12 *cdb12;
cdb12 = (struct scsi_read_defect_data_12 *)
&csio->cdb_io.cdb_bytes;
cdb_len = sizeof(*cdb12);
bzero(cdb12, cdb_len);
cdb12->opcode = READ_DEFECT_DATA_12;
cdb12->format = list_format;
scsi_ulto4b(dxfer_len, cdb12->alloc_length);
scsi_ulto4b(addr_desc_index, cdb12->address_descriptor_index);
}
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
data_ptr,
dxfer_len,
sense_len,
cdb_len,
timeout);
}
void
scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
u_int8_t tag_action, u_int8_t byte2, u_int16_t control,
u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
u_int32_t timeout)
{
struct scsi_sanitize *scsi_cmd;
scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = SANITIZE;
scsi_cmd->byte2 = byte2;
scsi_cmd->control = control;
scsi_ulto2b(dxfer_len, scsi_cmd->length);
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
#endif /* _KERNEL */
void
scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_id,
uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_out *scsi_cmd;
scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_OUT;
scsi_cmd->service_action = service_action;
scsi_u64to8b(zone_id, scsi_cmd->zone_id);
scsi_cmd->zone_flags = zone_flags;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
void
scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba,
uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t sense_len, uint32_t timeout)
{
struct scsi_zbc_in *scsi_cmd;
scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes;
scsi_cmd->opcode = ZBC_IN;
scsi_cmd->service_action = service_action;
scsi_ulto4b(dxfer_len, scsi_cmd->length);
scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba);
scsi_cmd->zone_options = zone_options;
cam_fill_csio(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_IN : CAM_DIR_NONE,
tag_action,
data_ptr,
dxfer_len,
sense_len,
sizeof(*scsi_cmd),
timeout);
}
int
scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol, ata_flags;
uint16_t features_out;
uint32_t sectors_out, auxiliary;
int retval;
retval = 0;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_OUT;
features_out = (zm_action & 0xf) | (zone_flags << 8);
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
protocol = AP_PROTO_NON_DATA;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
sectors_out = 0;
} else {
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT |
AP_FLAG_TDIR_TO_DEV;
sectors_out = ((dxfer_len >> 9) & 0xffff);
}
auxiliary = 0;
} else {
ata_flags = AP_FLAG_BYT_BLOK_BLOCKS;
if (dxfer_len == 0) {
command_out = ATA_NCQ_NON_DATA;
features_out = ATA_NCQ_ZAC_MGMT_OUT;
/*
* We're assuming the SCSI to ATA translation layer
* will set the NCQ tag number in the tag field.
* That isn't clear from the SAT-4 spec (as of rev 05).
*/
sectors_out = 0;
ata_flags |= AP_FLAG_TLEN_NO_DATA;
} else {
command_out = ATA_SEND_FPDMA_QUEUED;
/*
* Note that we're defaulting to normal priority,
* and assuming that the SCSI to ATA translation
* layer will insert the NCQ tag number in the tag
* field. That isn't clear in the SAT-4 spec (as
* of rev 05).
*/
sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8;
ata_flags |= AP_FLAG_TLEN_FEAT |
AP_FLAG_TDIR_TO_DEV;
/*
* For SEND FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse
* the SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
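/*
* e.g. a 1 MiB transfer is 2048 512-byte blocks,
* so features_out becomes 0x0800.
*/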
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ dxfer_len,
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
int
scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries,
void (*cbfcnp)(struct cam_periph *, union ccb *),
uint8_t tag_action, int use_ncq,
uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags,
uint8_t *data_ptr, uint32_t dxfer_len,
uint8_t *cdb_storage, size_t cdb_storage_len,
uint8_t sense_len, uint32_t timeout)
{
uint8_t command_out, protocol;
uint16_t features_out, sectors_out;
uint32_t auxiliary;
int ata_flags;
int retval;
retval = 0;
ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS;
if (use_ncq == 0) {
command_out = ATA_ZAC_MANAGEMENT_IN;
/* XXX KDM put a macro here */
features_out = (zm_action & 0xf) | (zone_flags << 8);
sectors_out = dxfer_len >> 9; /* XXX KDM macro */
protocol = AP_PROTO_DMA;
ata_flags |= AP_FLAG_TLEN_SECT_CNT;
auxiliary = 0;
} else {
ata_flags |= AP_FLAG_TLEN_FEAT;
command_out = ATA_RECV_FPDMA_QUEUED;
sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8;
/*
* For RECEIVE FPDMA QUEUED, the transfer length is
* encoded in the FEATURE register, and 0 means
* that 65536 512-byte blocks are to be transferred.
* In practice, it seems unlikely that we'll see
* a transfer that large, and it may confuse
* the SAT layer, because generally that means that
* 0 bytes should be transferred.
*/
if (dxfer_len == (65536 * 512)) {
features_out = 0;
} else if (dxfer_len <= (65535 * 512)) {
features_out = ((dxfer_len >> 9) & 0xffff);
} else {
/* The transfer is too big. */
retval = 1;
goto bailout;
}
auxiliary = (zm_action & 0xf) | (zone_flags << 8);
protocol = AP_PROTO_FPDMA;
}
protocol |= AP_EXTEND;
retval = scsi_ata_pass(csio,
retries,
cbfcnp,
/*flags*/ CAM_DIR_IN,
tag_action,
/*protocol*/ protocol,
/*ata_flags*/ ata_flags,
/*features*/ features_out,
/*sector_count*/ sectors_out,
/*lba*/ zone_id,
/*command*/ command_out,
/*device*/ 0,
/*icc*/ 0,
/*auxiliary*/ auxiliary,
/*control*/ 0,
/*data_ptr*/ data_ptr,
/*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */
/*cdb_storage*/ cdb_storage,
/*cdb_storage_len*/ cdb_storage_len,
/*minimum_cmd_size*/ 0,
/*sense_len*/ SSD_FULL_SIZE,
/*timeout*/ timeout);
bailout:
return (retval);
}
Index: head/sys/cam/scsi/scsi_pass.c
===================================================================
--- head/sys/cam/scsi/scsi_pass.c (revision 327172)
+++ head/sys/cam/scsi/scsi_pass.c (revision 327173)
@@ -1,2278 +1,2276 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, 1998, 2000 Justin T. Gibbs.
* Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/devicestat.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/sdt.h>
#include <sys/taskqueue.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <machine/bus.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_queue.h>
#include <cam/cam_xpt.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_debug.h>
#include <cam/cam_compat.h>
#include <cam/cam_xpt_periph.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_pass.h>
typedef enum {
PASS_FLAG_OPEN = 0x01,
PASS_FLAG_LOCKED = 0x02,
PASS_FLAG_INVALID = 0x04,
PASS_FLAG_INITIAL_PHYSPATH = 0x08,
PASS_FLAG_ZONE_INPROG = 0x10,
PASS_FLAG_ZONE_VALID = 0x20,
PASS_FLAG_UNMAPPED_CAPABLE = 0x40,
PASS_FLAG_ABANDONED_REF_SET = 0x80
} pass_flags;
typedef enum {
PASS_STATE_NORMAL
} pass_state;
typedef enum {
PASS_CCB_BUFFER_IO,
PASS_CCB_QUEUED_IO
} pass_ccb_types;
#define ccb_type ppriv_field0
#define ccb_ioreq ppriv_ptr1
/*
* The maximum number of memory segments we preallocate.
*/
#define PASS_MAX_SEGS 16
typedef enum {
PASS_IO_NONE = 0x00,
PASS_IO_USER_SEG_MALLOC = 0x01,
PASS_IO_KERN_SEG_MALLOC = 0x02,
PASS_IO_ABANDONED = 0x04
} pass_io_flags;
struct pass_io_req {
union ccb ccb;
union ccb *alloced_ccb;
union ccb *user_ccb_ptr;
camq_entry user_periph_links;
ccb_ppriv_area user_periph_priv;
struct cam_periph_map_info mapinfo;
pass_io_flags flags;
ccb_flags data_flags;
int num_user_segs;
bus_dma_segment_t user_segs[PASS_MAX_SEGS];
int num_kern_segs;
bus_dma_segment_t kern_segs[PASS_MAX_SEGS];
bus_dma_segment_t *user_segptr;
bus_dma_segment_t *kern_segptr;
int num_bufs;
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint8_t *user_bufs[CAM_PERIPH_MAXMAPS];
uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS];
struct bintime start_time;
TAILQ_ENTRY(pass_io_req) links;
};
struct pass_softc {
pass_state state;
pass_flags flags;
u_int8_t pd_type;
union ccb saved_ccb;
int open_count;
u_int maxio;
struct devstat *device_stats;
struct cdev *dev;
struct cdev *alias_dev;
struct task add_physpath_task;
struct task shutdown_kqueue_task;
struct selinfo read_select;
TAILQ_HEAD(, pass_io_req) incoming_queue;
TAILQ_HEAD(, pass_io_req) active_queue;
TAILQ_HEAD(, pass_io_req) abandoned_queue;
TAILQ_HEAD(, pass_io_req) done_queue;
struct cam_periph *periph;
char zone_name[12];
char io_zone_name[12];
uma_zone_t pass_zone;
uma_zone_t pass_io_zone;
size_t io_zone_size;
};
static d_open_t passopen;
static d_close_t passclose;
static d_ioctl_t passioctl;
static d_ioctl_t passdoioctl;
static d_poll_t passpoll;
static d_kqfilter_t passkqfilter;
static void passreadfiltdetach(struct knote *kn);
static int passreadfilt(struct knote *kn, long hint);
static periph_init_t passinit;
static periph_ctor_t passregister;
static periph_oninv_t passoninvalidate;
static periph_dtor_t passcleanup;
static periph_start_t passstart;
static void pass_shutdown_kqueue(void *context, int pending);
static void pass_add_physpath(void *context, int pending);
static void passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg);
static void passdone(struct cam_periph *periph,
union ccb *done_ccb);
static int passcreatezone(struct cam_periph *periph);
static void passiocleanup(struct pass_softc *softc,
struct pass_io_req *io_req);
static int passcopysglist(struct cam_periph *periph,
struct pass_io_req *io_req,
ccb_flags direction);
static int passmemsetup(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passmemdone(struct cam_periph *periph,
struct pass_io_req *io_req);
static int passerror(union ccb *ccb, u_int32_t cam_flags,
u_int32_t sense_flags);
static int passsendccb(struct cam_periph *periph, union ccb *ccb,
union ccb *inccb);
static struct periph_driver passdriver =
{
passinit, "pass",
TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0
};
PERIPHDRIVER_DECLARE(pass, passdriver);
static struct cdevsw pass_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE,
.d_open = passopen,
.d_close = passclose,
.d_ioctl = passioctl,
.d_poll = passpoll,
.d_kqfilter = passkqfilter,
.d_name = "pass",
};
static struct filterops passread_filtops = {
.f_isfd = 1,
.f_detach = passreadfiltdetach,
.f_event = passreadfilt
};
static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers");
static void
passinit(void)
{
cam_status status;
/*
* Install a global async callback. This callback will
* receive async callbacks like "new device found".
*/
status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL);
if (status != CAM_REQ_CMP) {
printf("pass: Failed to attach master async callback "
"due to status 0x%x!\n", status);
}
}
static void
passrejectios(struct cam_periph *periph)
{
struct pass_io_req *io_req, *io_req2;
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* The user can no longer get status for I/O on the done queue, so
* clean up all outstanding I/O on the done queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* The underlying device is gone, so we can't issue these I/Os.
* The devfs node has been shut down, so we can't return status to
* the user. Free any I/O left on the incoming queue.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* Normally we would put I/Os on the abandoned queue and acquire a
* reference when we saw the final close. But, the device went
* away and devfs may have moved everything off to deadfs by the
* time the I/O done callback is called; as a result, we won't see
* any more closes. So, if we have any active I/Os, we need to put
* them on the abandoned queue. When the abandoned queue is empty,
* we'll release the remaining reference (see below) to the peripheral.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links);
}
/*
* If we put any I/O on the abandoned queue, acquire a reference.
*/
if ((!TAILQ_EMPTY(&softc->abandoned_queue))
&& ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
}
static void
passdevgonecb(void *arg)
{
struct cam_periph *periph;
struct mtx *mtx;
struct pass_softc *softc;
int i;
periph = (struct cam_periph *)arg;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = (struct pass_softc *)periph->softc;
KASSERT(softc->open_count >= 0, ("Negative open count %d",
softc->open_count));
/*
* When we get this callback, we will get no more close calls from
* devfs. So if we have any dangling opens, we need to release the
* reference held for that particular context.
*/
for (i = 0; i < softc->open_count; i++)
cam_periph_release_locked(periph);
softc->open_count = 0;
/*
* Release the reference held for the device node, it is gone now.
* Accordingly, inform all queued I/Os of their fate.
*/
cam_periph_release_locked(periph);
passrejectios(periph);
/*
* We reference the SIM lock directly here, instead of using
* cam_periph_unlock(). The reason is that the final call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*/
mtx_unlock(mtx);
/*
* We have to remove our kqueue context from a thread because it
* may sleep. It would be nice if we could get a callback from
* kqueue when it is done cleaning up resources.
*/
taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task);
}
static void
passoninvalidate(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
/*
* De-register any async callbacks.
*/
xpt_register_async(0, passasync, periph, periph->path);
softc->flags |= PASS_FLAG_INVALID;
/*
* Tell devfs this device has gone away, and ask for a callback
* when it has cleaned up its state.
*/
destroy_dev_sched_cb(softc->dev, passdevgonecb, periph);
}
static void
passcleanup(struct cam_periph *periph)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(TAILQ_EMPTY(&softc->active_queue),
("%s called when there are commands on the active queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->abandoned_queue),
("%s called when there are commands on the abandoned queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->incoming_queue),
("%s called when there are commands on the incoming queue!\n",
__func__));
KASSERT(TAILQ_EMPTY(&softc->done_queue),
("%s called when there are commands on the done queue!\n",
__func__));
devstat_remove_entry(softc->device_stats);
cam_periph_unlock(periph);
/*
* We call taskqueue_drain() for the physpath task to make sure it
* is complete. We drop the lock because this can potentially
* sleep. XXX KDM that is bad. Need a way to get a callback when
* a taskqueue is drained.
*
* Note that we don't drain the kqueue shutdown task queue. This
* is because we hold a reference on the periph for kqueue, and
* release that reference from the kqueue shutdown task queue. So
* we cannot come into this routine unless we've released that
* reference. Also, because that could be the last reference, we
* could be called from the cam_periph_release() call in
* pass_shutdown_kqueue(). In that case, the taskqueue_drain()
* would deadlock. It would be preferable if we had a way to
* get a callback when a taskqueue is done.
*/
taskqueue_drain(taskqueue_thread, &softc->add_physpath_task);
cam_periph_lock(periph);
free(softc, M_DEVBUF);
}
static void
pass_shutdown_kqueue(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = context;
softc = periph->softc;
knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0);
knlist_destroy(&softc->read_select.si_note);
/*
* Release the reference we held for kqueue.
*/
cam_periph_release(periph);
}
static void
pass_add_physpath(void *context, int pending)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
char *physpath;
/*
* If we have one, create a devfs alias for our
* physical path.
*/
periph = context;
softc = periph->softc;
physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
if (periph->flags & CAM_PERIPH_INVALID)
goto out;
if (xpt_getattr(physpath, MAXPATHLEN,
"GEOM::physpath", periph->path) == 0
&& strlen(physpath) != 0) {
mtx_unlock(mtx);
make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev,
softc->dev, softc->alias_dev, physpath);
mtx_lock(mtx);
}
out:
/*
* Now that we've made our alias, we no longer have to have a
* reference to the device.
*/
if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0)
softc->flags |= PASS_FLAG_INITIAL_PHYSPATH;
/*
* We always acquire a reference to the periph before queueing this
* task queue function, so it won't go away before we run.
*/
while (pending-- > 0)
cam_periph_release_locked(periph);
mtx_unlock(mtx);
free(physpath, M_DEVBUF);
}
static void
passasync(void *callback_arg, u_int32_t code,
struct cam_path *path, void *arg)
{
struct cam_periph *periph;
periph = (struct cam_periph *)callback_arg;
switch (code) {
case AC_FOUND_DEVICE:
{
struct ccb_getdev *cgd;
cam_status status;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL)
break;
/*
* Allocate a peripheral instance for
* this device and start the probe
* process.
*/
status = cam_periph_alloc(passregister, passoninvalidate,
passcleanup, passstart, "pass",
CAM_PERIPH_BIO, path,
passasync, AC_FOUND_DEVICE, cgd);
if (status != CAM_REQ_CMP
&& status != CAM_REQ_INPROG) {
const struct cam_status_entry *entry;
entry = cam_fetch_status_entry(status);
printf("passasync: Unable to attach new device "
"due to status %#x: %s\n", status, entry ?
entry->status_text : "Unknown");
}
break;
}
case AC_ADVINFO_CHANGED:
{
uintptr_t buftype;
buftype = (uintptr_t)arg;
if (buftype == CDAI_TYPE_PHYS_PATH) {
struct pass_softc *softc;
cam_status status;
softc = (struct pass_softc *)periph->softc;
/*
* Acquire a reference to the periph before we
* start the taskqueue, so that we don't run into
* a situation where the periph goes away before
* the task queue has a chance to run.
*/
status = cam_periph_acquire(periph);
if (status != CAM_REQ_CMP)
break;
taskqueue_enqueue(taskqueue_thread,
&softc->add_physpath_task);
}
break;
}
default:
cam_periph_async(periph, code, path, arg);
break;
}
}
static cam_status
passregister(struct cam_periph *periph, void *arg)
{
struct pass_softc *softc;
struct ccb_getdev *cgd;
struct ccb_pathinq cpi;
struct make_dev_args args;
int error, no_tags;
cgd = (struct ccb_getdev *)arg;
if (cgd == NULL) {
printf("%s: no getdev CCB, can't register device\n", __func__);
return(CAM_REQ_CMP_ERR);
}
softc = (struct pass_softc *)malloc(sizeof(*softc),
M_DEVBUF, M_NOWAIT);
if (softc == NULL) {
printf("%s: Unable to probe new device. "
"Unable to allocate softc\n", __func__);
return(CAM_REQ_CMP_ERR);
}
bzero(softc, sizeof(*softc));
softc->state = PASS_STATE_NORMAL;
if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI)
softc->pd_type = SID_TYPE(&cgd->inq_data);
else if (cgd->protocol == PROTO_SATAPM)
softc->pd_type = T_ENCLOSURE;
else
softc->pd_type = T_DIRECT;
periph->softc = softc;
softc->periph = periph;
TAILQ_INIT(&softc->incoming_queue);
TAILQ_INIT(&softc->active_queue);
TAILQ_INIT(&softc->abandoned_queue);
TAILQ_INIT(&softc->done_queue);
snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d",
periph->periph_name, periph->unit_number);
snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO",
periph->periph_name, periph->unit_number);
softc->io_zone_size = MAXPHYS;
knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph));
xpt_path_inq(&cpi, periph->path);
if (cpi.maxio == 0)
softc->maxio = DFLTPHYS; /* traditional default */
else if (cpi.maxio > MAXPHYS)
softc->maxio = MAXPHYS; /* for safety */
else
softc->maxio = cpi.maxio; /* real value */
if (cpi.hba_misc & PIM_UNMAPPED)
softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE;
/*
* We pass in 0 for a blocksize, since we don't
* know what the blocksize of this device is, if
* it even has a blocksize.
*/
cam_periph_unlock(periph);
no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0;
softc->device_stats = devstat_new_entry("pass",
periph->unit_number, 0,
DEVSTAT_NO_BLOCKSIZE
| (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0),
softc->pd_type |
XPORT_DEVSTAT_TYPE(cpi.transport) |
DEVSTAT_TYPE_PASS,
DEVSTAT_PRIORITY_PASS);
/*
* Initialize the taskqueue handler for shutting down kqueue.
*/
TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0,
pass_shutdown_kqueue, periph);
/*
* Acquire a reference to the periph that we can release once we've
* cleaned up the kqueue.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Acquire a reference to the periph before we create the devfs
* instance for it. We'll release this reference once the devfs
* instance has been freed.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
/* Register the device */
make_dev_args_init(&args);
args.mda_devsw = &pass_cdevsw;
args.mda_unit = periph->unit_number;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0600;
args.mda_si_drv1 = periph;
error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name,
periph->unit_number);
if (error != 0) {
cam_periph_lock(periph);
cam_periph_release_locked(periph);
return (CAM_REQ_CMP_ERR);
}
/*
* Hold a reference to the periph before we create the physical
* path alias so it can't go away.
*/
if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
xpt_print(periph->path, "%s: lost periph during "
"registration!\n", __func__);
cam_periph_lock(periph);
return (CAM_REQ_CMP_ERR);
}
cam_periph_lock(periph);
TASK_INIT(&softc->add_physpath_task, /*priority*/0,
pass_add_physpath, periph);
/*
* See if physical path information is already available.
*/
taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task);
/*
* Add an async callback so that we get notified if
* this device goes away or its physical path
* (stored in the advanced info data of the EDT) has
* changed.
*/
xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
passasync, periph, periph->path);
if (bootverbose)
xpt_announce_periph(periph, NULL);
return(CAM_REQ_CMP);
}
static int
passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
periph = (struct cam_periph *)dev->si_drv1;
if (cam_periph_acquire(periph) != CAM_REQ_CMP)
return (ENXIO);
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
if (softc->flags & PASS_FLAG_INVALID) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(ENXIO);
}
/*
* Don't allow access when we're running at a high securelevel.
*/
error = securelevel_gt(td->td_ucred, 1);
if (error) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(error);
}
/*
* Only allow read-write access.
*/
if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) {
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EPERM);
}
/*
* We don't allow nonblocking access.
*/
if ((flags & O_NONBLOCK) != 0) {
xpt_print(periph->path, "can't do nonblocking access\n");
cam_periph_release_locked(periph);
cam_periph_unlock(periph);
return(EINVAL);
}
softc->open_count++;
cam_periph_unlock(periph);
return (error);
}
static int
passclose(struct cdev *dev, int flag, int fmt, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
struct mtx *mtx;
periph = (struct cam_periph *)dev->si_drv1;
mtx = cam_periph_mtx(periph);
mtx_lock(mtx);
softc = periph->softc;
softc->open_count--;
if (softc->open_count == 0) {
struct pass_io_req *io_req, *io_req2;
TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) {
TAILQ_REMOVE(&softc->done_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
}
/*
* If there are any active I/Os, we need to forcibly acquire a
* reference to the peripheral so that we don't go away
* before they complete. We'll release the reference when
* the abandoned queue is empty.
*/
io_req = TAILQ_FIRST(&softc->active_queue);
if ((io_req != NULL)
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) {
cam_periph_doacquire(periph);
softc->flags |= PASS_FLAG_ABANDONED_REF_SET;
}
/*
* Since the I/O in the active queue is not under our
* control, just set a flag so that we can clean it up when
* it completes and put it on the abandoned queue. This
* will prevent our sending spurious completions in the
* event that the device is opened again before these I/Os
* complete.
*/
TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links,
io_req2) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
io_req->flags |= PASS_IO_ABANDONED;
TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req,
links);
}
}
cam_periph_release_locked(periph);
/*
* We reference the lock directly here, instead of using
* cam_periph_unlock(). The reason is that the call to
* cam_periph_release_locked() above could result in the periph
* getting freed. If that is the case, dereferencing the periph
* with a cam_periph_unlock() call would cause a page fault.
*
* cam_periph_release() avoids this problem using the same method,
* but we're manually acquiring and dropping the lock here to
* protect the open count and avoid another lock acquisition and
* release.
*/
mtx_unlock(mtx);
return (0);
}
static void
passstart(struct cam_periph *periph, union ccb *start_ccb)
{
struct pass_softc *softc;
softc = (struct pass_softc *)periph->softc;
switch (softc->state) {
case PASS_STATE_NORMAL: {
struct pass_io_req *io_req;
/*
* Check for any queued I/O requests that require an
* allocated slot.
*/
io_req = TAILQ_FIRST(&softc->incoming_queue);
if (io_req == NULL) {
xpt_release_ccb(start_ccb);
break;
}
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
/*
* Merge the user's CCB into the allocated CCB.
*/
xpt_merge_ccb(start_ccb, &io_req->ccb);
start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO;
start_ccb->ccb_h.ccb_ioreq = io_req;
start_ccb->ccb_h.cbfcnp = passdone;
io_req->alloced_ccb = start_ccb;
binuptime(&io_req->start_time);
devstat_start_transaction(softc->device_stats,
&io_req->start_time);
xpt_action(start_ccb);
/*
* If we have any more I/O waiting, schedule ourselves again.
*/
if (!TAILQ_EMPTY(&softc->incoming_queue))
xpt_schedule(periph, CAM_PRIORITY_NORMAL);
break;
}
default:
break;
}
}
static void
passdone(struct cam_periph *periph, union ccb *done_ccb)
{
struct pass_softc *softc;
struct ccb_scsiio *csio;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
csio = &done_ccb->csio;
switch (csio->ccb_h.ccb_type) {
case PASS_CCB_QUEUED_IO: {
struct pass_io_req *io_req;
io_req = done_ccb->ccb_h.ccb_ioreq;
#if 0
xpt_print(periph->path, "%s: called for user CCB %p\n",
__func__, io_req->user_ccb_ptr);
#endif
if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
&& (done_ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER)
&& ((io_req->flags & PASS_IO_ABANDONED) == 0)) {
int error;
error = passerror(done_ccb, CAM_RETRY_SELTO,
SF_RETRY_UA | SF_NO_PRINT);
if (error == ERESTART) {
/*
* A retry was scheduled, so
* just return.
*/
return;
}
}
/*
* Copy the allocated CCB contents back to the malloced CCB
* so we can give status back to the user when he requests it.
*/
bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb));
/*
* Log data/transaction completion with devstat(9).
*/
switch (done_ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->csio.dxfer_len - done_ccb->csio.resid,
done_ccb->csio.tag_action & 0x3,
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_ATA_IO:
devstat_end_transaction(softc->device_stats,
done_ccb->ataio.dxfer_len - done_ccb->ataio.resid,
0, /* Not used in ATA */
((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
(done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
DEVSTAT_WRITE : DEVSTAT_READ, NULL,
&io_req->start_time);
break;
case XPT_SMP_IO:
/*
* XXX KDM this isn't quite right, but there isn't
* currently an easy way to represent a bidirectional
* transfer in devstat. The only way to do it
* and have the byte counts come out right would
* mean that we would have to record two
* transactions, one for the request and one for the
* response. For now, so that we report something,
* just treat the entire thing as a read.
*/
devstat_end_transaction(softc->device_stats,
done_ccb->smpio.smp_request_len +
done_ccb->smpio.smp_response_len,
DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL,
&io_req->start_time);
break;
default:
devstat_end_transaction(softc->device_stats, 0,
DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL,
&io_req->start_time);
break;
}
/*
* In the normal case, take the completed I/O off of the
* active queue and put it on the done queue. Notify the
* user that we have a completed I/O.
*/
if ((io_req->flags & PASS_IO_ABANDONED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
selwakeuppri(&softc->read_select, PRIBIO);
KNOTE_LOCKED(&softc->read_select.si_note, 0);
} else {
/*
* In the case of an abandoned I/O (final close
* without fetching the I/O), take it off of the
* abandoned queue and free it.
*/
TAILQ_REMOVE(&softc->abandoned_queue, io_req, links);
passiocleanup(softc, io_req);
uma_zfree(softc->pass_zone, io_req);
/*
* Release the done_ccb here, since we may wind up
* freeing the peripheral when we decrement the
* reference count below.
*/
xpt_release_ccb(done_ccb);
/*
* If the abandoned queue is empty, we can release
* our reference to the periph since we won't have
* any more completions coming.
*/
if ((TAILQ_EMPTY(&softc->abandoned_queue))
&& (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) {
softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET;
cam_periph_release_locked(periph);
}
/*
* We have already released the CCB, so we can
* return.
*/
return;
}
break;
}
}
xpt_release_ccb(done_ccb);
}
static int
passcreatezone(struct cam_periph *periph)
{
struct pass_softc *softc;
int error;
error = 0;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0),
("%s called when the pass(4) zone is valid!\n", __func__));
KASSERT((softc->pass_zone == NULL),
("%s called when the pass(4) zone is allocated!\n", __func__));
if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) {
/*
* We're the first context through, so we need to create
* the pass(4) UMA zone for I/O requests.
*/
softc->flags |= PASS_FLAG_ZONE_INPROG;
/*
* uma_zcreate() does a blocking (M_WAITOK) allocation,
* so we cannot hold a mutex while we call it.
*/
cam_periph_unlock(periph);
softc->pass_zone = uma_zcreate(softc->zone_name,
sizeof(struct pass_io_req), NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
softc->pass_io_zone = uma_zcreate(softc->io_zone_name,
softc->io_zone_size, NULL, NULL, NULL, NULL,
/*align*/ 0, /*flags*/ 0);
cam_periph_lock(periph);
if ((softc->pass_zone == NULL)
|| (softc->pass_io_zone == NULL)) {
if (softc->pass_zone == NULL)
xpt_print(periph->path, "unable to allocate "
"IO Req UMA zone\n");
else
xpt_print(periph->path, "unable to allocate "
"IO UMA zone\n");
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
goto bailout;
}
/*
* Set the flags appropriately and notify any other waiters.
*/
softc->flags &= ~PASS_FLAG_ZONE_INPROG;
softc->flags |= PASS_FLAG_ZONE_VALID;
wakeup(&softc->pass_zone);
} else {
/*
* In this case, the UMA zone has not yet been created, but
* another context is in the process of creating it. We
* need to sleep until the creation is either done or has
* failed.
*/
while ((softc->flags & PASS_FLAG_ZONE_INPROG)
&& ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) {
error = msleep(&softc->pass_zone,
cam_periph_mtx(periph), PRIBIO,
"paszon", 0);
if (error != 0)
goto bailout;
}
/*
* If the zone creation failed, no luck for the user.
*/
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = ENOMEM;
goto bailout;
}
}
bailout:
return (error);
}
static void
passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req)
{
union ccb *ccb;
u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
int i, numbufs;
ccb = &io_req->ccb;
switch (ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
numbufs = min(io_req->num_bufs, 2);
if (numbufs == 1) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
}
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
data_ptrs[0] = &ccb->csio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_ATA_IO:
data_ptrs[0] = &ccb->ataio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
case XPT_SMP_IO:
numbufs = min(io_req->num_bufs, 2);
data_ptrs[0] = &ccb->smpio.smp_request;
data_ptrs[1] = &ccb->smpio.smp_response;
break;
case XPT_DEV_ADVINFO:
numbufs = min(io_req->num_bufs, 1);
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
break;
case XPT_NVME_IO:
case XPT_NVME_ADMIN:
data_ptrs[0] = &ccb->nvmeio.data_ptr;
numbufs = min(io_req->num_bufs, 1);
break;
default:
/* allow ourselves to be swapped once again */
return;
break; /* NOTREACHED */
}
if (io_req->flags & PASS_IO_USER_SEG_MALLOC) {
free(io_req->user_segptr, M_SCSIPASS);
io_req->user_segptr = NULL;
}
/*
* We only want to free memory we malloced.
*/
if (io_req->data_flags == CAM_DATA_VADDR) {
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->kern_bufs[i] == NULL)
continue;
free(io_req->kern_bufs[i], M_SCSIPASS);
io_req->kern_bufs[i] = NULL;
}
} else if (io_req->data_flags == CAM_DATA_SG) {
for (i = 0; i < io_req->num_kern_segs; i++) {
if ((uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr == NULL)
continue;
uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t)
io_req->kern_segptr[i].ds_addr);
io_req->kern_segptr[i].ds_addr = 0;
}
}
if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) {
free(io_req->kern_segptr, M_SCSIPASS);
io_req->kern_segptr = NULL;
}
if (io_req->data_flags != CAM_DATA_PADDR) {
for (i = 0; i < numbufs; i++) {
/*
* Restore the user's buffer pointers to their
* previous values.
*/
if (io_req->user_bufs[i] != NULL)
*data_ptrs[i] = io_req->user_bufs[i];
}
}
}
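/*
 * Copy data between the user's S/G list and the kernel's S/G list
 * for a CAM_DATA_SG request.  The two lists are walked in parallel:
 * each iteration copies min(bytes remaining in the current user
 * segment, bytes remaining in the current kernel segment), with the
 * watermarks tracking the offset into each current segment, and
 * whichever segment is exhausted is advanced.  The direction
 * argument selects copyin() versus copyout().
 */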
static int
passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req,
ccb_flags direction)
{
bus_size_t kern_watermark, user_watermark, len_copied, len_to_copy;
bus_dma_segment_t *user_sglist, *kern_sglist;
int i, j, error;
error = 0;
kern_watermark = 0;
user_watermark = 0;
len_to_copy = 0;
len_copied = 0;
user_sglist = io_req->user_segptr;
kern_sglist = io_req->kern_segptr;
for (i = 0, j = 0; i < io_req->num_user_segs &&
j < io_req->num_kern_segs;) {
uint8_t *user_ptr, *kern_ptr;
len_to_copy = min(user_sglist[i].ds_len - user_watermark,
kern_sglist[j].ds_len - kern_watermark);
user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr;
user_ptr = user_ptr + user_watermark;
kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr;
kern_ptr = kern_ptr + kern_watermark;
user_watermark += len_to_copy;
kern_watermark += len_to_copy;
if (!useracc(user_ptr, len_to_copy,
(direction == CAM_DIR_IN) ? VM_PROT_WRITE : VM_PROT_READ)) {
xpt_print(periph->path, "%s: unable to access user "
"S/G list element %p len %zu\n", __func__,
user_ptr, len_to_copy);
error = EFAULT;
goto bailout;
}
if (direction == CAM_DIR_IN) {
error = copyout(kern_ptr, user_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyout of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
kern_ptr, user_ptr, error);
goto bailout;
}
} else {
error = copyin(user_ptr, kern_ptr, len_to_copy);
if (error != 0) {
xpt_print(periph->path, "%s: copyin of %u "
"bytes from %p to %p failed with "
"error %d\n", __func__, len_to_copy,
user_ptr, kern_ptr, error);
goto bailout;
}
}
len_copied += len_to_copy;
if (user_sglist[i].ds_len == user_watermark) {
i++;
user_watermark = 0;
}
if (kern_sglist[j].ds_len == kern_watermark) {
j++;
kern_watermark = 0;
}
}
bailout:
return (error);
}
static int
passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req)
{
union ccb *ccb;
struct pass_softc *softc;
int numbufs, i;
uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
uint32_t lengths[CAM_PERIPH_MAXMAPS];
uint32_t dirs[CAM_PERIPH_MAXMAPS];
uint32_t num_segs;
uint16_t *seg_cnt_ptr;
size_t maxmap;
int error;
cam_periph_assert(periph, MA_NOTOWNED);
softc = periph->softc;
error = 0;
ccb = &io_req->ccb;
maxmap = 0;
num_segs = 0;
seg_cnt_ptr = NULL;
switch(ccb->ccb_h.func_code) {
case XPT_DEV_MATCH:
if (ccb->cdm.match_buf_len == 0) {
printf("%s: invalid match buffer length 0\n", __func__);
return(EINVAL);
}
if (ccb->cdm.pattern_buf_len > 0) {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
lengths[0] = ccb->cdm.pattern_buf_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
lengths[1] = ccb->cdm.match_buf_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
} else {
data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
lengths[0] = ccb->cdm.match_buf_len;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
}
io_req->data_flags = CAM_DATA_VADDR;
break;
case XPT_SCSI_IO:
case XPT_CONT_TARGET_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* The user shouldn't be able to supply a bio.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
return (EINVAL);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->csio.data_ptr;
lengths[0] = ccb->csio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->csio.sglist_cnt;
seg_cnt_ptr = &ccb->csio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_ATA_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return(0);
/*
* We only support a single virtual address for ATA I/O.
*/
if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
return (EINVAL);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->ataio.data_ptr;
lengths[0] = ccb->ataio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
numbufs = 1;
maxmap = softc->maxio;
break;
case XPT_SMP_IO:
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = &ccb->smpio.smp_request;
lengths[0] = ccb->smpio.smp_request_len;
dirs[0] = CAM_DIR_OUT;
data_ptrs[1] = &ccb->smpio.smp_response;
lengths[1] = ccb->smpio.smp_response_len;
dirs[1] = CAM_DIR_IN;
numbufs = 2;
maxmap = softc->maxio;
break;
case XPT_DEV_ADVINFO:
if (ccb->cdai.bufsiz == 0)
return (0);
io_req->data_flags = CAM_DATA_VADDR;
data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
lengths[0] = ccb->cdai.bufsiz;
dirs[0] = CAM_DIR_IN;
numbufs = 1;
break;
case XPT_NVME_ADMIN:
case XPT_NVME_IO:
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
return (0);
io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK;
data_ptrs[0] = &ccb->nvmeio.data_ptr;
lengths[0] = ccb->nvmeio.dxfer_len;
dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
num_segs = ccb->nvmeio.sglist_cnt;
seg_cnt_ptr = &ccb->nvmeio.sglist_cnt;
numbufs = 1;
maxmap = softc->maxio;
break;
default:
return(EINVAL);
break; /* NOTREACHED */
}
io_req->num_bufs = numbufs;
/*
* If there is a maximum, check to make sure that the user's
* request fits within the limit. In general, we should only have
* a maximum length for requests that go to hardware. Otherwise it
* is whatever we're able to malloc.
*/
for (i = 0; i < numbufs; i++) {
io_req->user_bufs[i] = *data_ptrs[i];
io_req->dirs[i] = dirs[i];
io_req->lengths[i] = lengths[i];
if (maxmap == 0)
continue;
if (lengths[i] <= maxmap)
continue;
xpt_print(periph->path, "%s: data length %u > max allowed %u "
"bytes\n", __func__, lengths[i], maxmap);
error = EINVAL;
goto bailout;
}
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/* Map or copy the buffer into kernel address space */
for (i = 0; i < numbufs; i++) {
uint8_t *tmp_buf;
/*
* If for some reason no length is specified, we
* don't need to allocate anything.
*/
if (io_req->lengths[i] == 0)
continue;
/*
* Make sure that the user's buffer is accessible
* to that process.
*/
if (!useracc(io_req->user_bufs[i], io_req->lengths[i],
(io_req->dirs[i] == CAM_DIR_IN) ? VM_PROT_WRITE :
VM_PROT_READ)) {
xpt_print(periph->path, "%s: user address %p "
"length %u is not accessible\n", __func__,
io_req->user_bufs[i], io_req->lengths[i]);
error = EFAULT;
goto bailout;
}
tmp_buf = malloc(lengths[i], M_SCSIPASS,
M_WAITOK | M_ZERO);
io_req->kern_bufs[i] = tmp_buf;
*data_ptrs[i] = tmp_buf;
#if 0
xpt_print(periph->path, "%s: malloced %p len %u, user "
"buffer %p, operation: %s\n", __func__,
tmp_buf, lengths[i], io_req->user_bufs[i],
(dirs[i] == CAM_DIR_IN) ? "read" : "write");
#endif
/*
* We only need to copy in if the user is writing.
*/
if (dirs[i] != CAM_DIR_OUT)
continue;
error = copyin(io_req->user_bufs[i],
io_req->kern_bufs[i], lengths[i]);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user "
"buffer from %p to %p failed with "
"error %d\n", __func__,
io_req->user_bufs[i],
io_req->kern_bufs[i], error);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Pass down the pointer as-is */
break;
case CAM_DATA_SG: {
size_t sg_length, size_to_go, alloc_size;
uint32_t num_segs_needed;
/*
* Copy the user S/G list in, and then copy in the
* individual segments.
*/
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
xpt_print(periph->path, "%s: cannot currently handle "
"more than one S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG flag set, "
"but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We allocate buffers in io_zone_size increments for an
* S/G list. This will generally be MAXPHYS.
*/
if (lengths[0] <= softc->io_zone_size)
num_segs_needed = 1;
else {
num_segs_needed = lengths[0] / softc->io_zone_size;
if ((lengths[0] % softc->io_zone_size) != 0)
num_segs_needed++;
}
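/*
 * For example, assuming the default MAXPHYS of 128 KiB for
 * io_zone_size, a 300 KiB transfer would yield 300 / 128 = 2 full
 * buffers plus a remainder, so num_segs_needed would be 3.
 */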
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = num_segs_needed;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
/*
 * If the user's S/G list is too large for the preallocated
 * array of segments, allocate a larger one; otherwise use the
 * preallocated user_segs array.
 */
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
if (!useracc(*data_ptrs[0], sg_length, VM_PROT_READ)) {
xpt_print(periph->path, "%s: unable to access user "
"S/G list at %p\n", __func__, *data_ptrs[0]);
error = EFAULT;
goto bailout;
}
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
if (num_segs_needed > PASS_MAX_SEGS) {
io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_KERN_SEG_MALLOC;
} else {
io_req->kern_segptr = io_req->kern_segs;
}
/*
* Allocate the kernel S/G list.
*/
for (size_to_go = lengths[0], i = 0;
size_to_go > 0 && i < num_segs_needed;
i++, size_to_go -= alloc_size) {
uint8_t *kern_ptr;
alloc_size = min(size_to_go, softc->io_zone_size);
kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK);
io_req->kern_segptr[i].ds_addr =
(bus_addr_t)(uintptr_t)kern_ptr;
io_req->kern_segptr[i].ds_len = alloc_size;
}
if (size_to_go > 0) {
printf("%s: size_to_go = %zu, software error!\n",
__func__, size_to_go);
error = EINVAL;
goto bailout;
}
*data_ptrs[0] = (uint8_t *)io_req->kern_segptr;
*seg_cnt_ptr = io_req->num_kern_segs;
/*
* We only need to copy data here if the user is writing.
*/
if (dirs[0] == CAM_DIR_OUT)
error = passcopysglist(periph, io_req, dirs[0]);
break;
}
case CAM_DATA_SG_PADDR: {
size_t sg_length;
/*
* We shouldn't see this, but check just in case.
*/
if (numbufs != 1) {
printf("%s: cannot currently handle more than one "
"S/G list per CCB\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* We have to have at least one segment.
*/
if (num_segs == 0) {
xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag "
"set, but sglist_cnt=0!\n", __func__);
error = EINVAL;
goto bailout;
}
/*
* Make sure the user specified the total length and didn't
* just leave it to us to decode the S/G list.
*/
if (lengths[0] == 0) {
xpt_print(periph->path, "%s: no dxfer_len specified, "
"but CAM_DATA_SG flag is set!\n", __func__);
error = EINVAL;
goto bailout;
}
/* Figure out the size of the S/G list */
sg_length = num_segs * sizeof(bus_dma_segment_t);
io_req->num_user_segs = num_segs;
io_req->num_kern_segs = io_req->num_user_segs;
/* Save the user's S/G list pointer for later restoration */
io_req->user_bufs[0] = *data_ptrs[0];
if (num_segs > PASS_MAX_SEGS) {
io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) *
num_segs, M_SCSIPASS, M_WAITOK | M_ZERO);
io_req->flags |= PASS_IO_USER_SEG_MALLOC;
} else
io_req->user_segptr = io_req->user_segs;
io_req->kern_segptr = io_req->user_segptr;
error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length);
if (error != 0) {
xpt_print(periph->path, "%s: copy of user S/G list "
"from %p to %p failed with error %d\n",
__func__, *data_ptrs[0], io_req->user_segptr,
error);
goto bailout;
}
break;
}
default:
case CAM_DATA_BIO:
/*
* A user shouldn't be attaching a bio to the CCB. It
* isn't a user-accessible structure.
*/
error = EINVAL;
break;
}
bailout:
if (error != 0)
passiocleanup(softc, io_req);
return (error);
}
static int
passmemdone(struct cam_periph *periph, struct pass_io_req *io_req)
{
struct pass_softc *softc;
- union ccb *ccb;
int error;
int i;
error = 0;
softc = (struct pass_softc *)periph->softc;
- ccb = &io_req->ccb;
switch (io_req->data_flags) {
case CAM_DATA_VADDR:
/*
* Copy back to the user buffer if this was a read.
*/
for (i = 0; i < io_req->num_bufs; i++) {
if (io_req->dirs[i] != CAM_DIR_IN)
continue;
error = copyout(io_req->kern_bufs[i],
io_req->user_bufs[i], io_req->lengths[i]);
if (error != 0) {
xpt_print(periph->path, "Unable to copy %u "
"bytes from %p to user address %p\n",
io_req->lengths[i],
io_req->kern_bufs[i],
io_req->user_bufs[i]);
goto bailout;
}
}
break;
case CAM_DATA_PADDR:
/* Do nothing. The pointer is a physical address already */
break;
case CAM_DATA_SG:
/*
* Copy back to the user buffer if this was a read.
* Restore the user's S/G list buffer pointer.
*/
if (io_req->dirs[0] == CAM_DIR_IN)
error = passcopysglist(periph, io_req, io_req->dirs[0]);
break;
case CAM_DATA_SG_PADDR:
/*
* Restore the user's S/G list buffer pointer. No need to
* copy.
*/
break;
default:
case CAM_DATA_BIO:
error = EINVAL;
break;
}
bailout:
/*
* Reset the user's pointers to their original values and free
* allocated memory.
*/
passiocleanup(softc, io_req);
return (error);
}
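/*
 * Illustrative userland sketch (not part of this file) of how the
 * asynchronous interface handled in passdoioctl() below might be
 * driven; the device path and error handling are placeholders:
 *
 *	union ccb *ccb;
 *	int fd = open("/dev/pass0", O_RDWR);
 *
 *	// allocate and fill in the CCB, then submit it:
 *	ioctl(fd, CAMIOQUEUE, &ccb);	// returns once it is queued
 *	// wait for completion (poll(2)/kevent(2)), then fetch it:
 *	ioctl(fd, CAMIOGET, &ccb);
 *
 * The synchronous CAMIOCOMMAND ioctl instead takes the CCB structure
 * itself and does not return until the CCB has been executed.
 */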
static int
passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
int error;
if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) {
error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl);
}
return (error);
}
static int
passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int error;
uint32_t priority;
periph = (struct cam_periph *)dev->si_drv1;
cam_periph_lock(periph);
softc = (struct pass_softc *)periph->softc;
error = 0;
switch (cmd) {
case CAMIOCOMMAND:
{
union ccb *inccb;
union ccb *ccb;
int ccb_malloced;
inccb = (union ccb *)addr;
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (inccb->ccb_h.func_code == XPT_SCSI_IO)
inccb->csio.bio = NULL;
#endif
if (inccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
break;
}
/*
* Some CCB types, like scan bus and scan lun, can only go
* through the transport layer device.
*/
if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
inccb->ccb_h.func_code);
error = ENODEV;
break;
}
/* Compatibility for RL/priority-unaware code. */
priority = inccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Non-immediate CCBs need a CCB from the per-device pool
* of CCBs, which is scheduled by the transport layer.
* Immediate CCBs and user-supplied CCBs should just be
* malloced.
*/
if ((inccb->ccb_h.func_code & XPT_FC_QUEUED)
&& ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) {
ccb = cam_periph_getccb(periph, priority);
ccb_malloced = 0;
} else {
ccb = xpt_alloc_ccb_nowait();
if (ccb != NULL)
xpt_setup_ccb(&ccb->ccb_h, periph->path,
priority);
ccb_malloced = 1;
}
if (ccb == NULL) {
xpt_print(periph->path, "unable to allocate CCB\n");
error = ENOMEM;
break;
}
error = passsendccb(periph, ccb, inccb);
if (ccb_malloced)
xpt_free_ccb(ccb);
else
xpt_release_ccb(ccb);
break;
}
case CAMIOQUEUE:
{
struct pass_io_req *io_req;
union ccb **user_ccb, *ccb;
xpt_opcode fc;
if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) {
error = passcreatezone(periph);
if (error != 0)
goto bailout;
}
/*
* We're going to do a blocking allocation for this I/O
* request, so we have to drop the lock.
*/
cam_periph_unlock(periph);
io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO);
ccb = &io_req->ccb;
user_ccb = (union ccb **)addr;
/*
* Unlike the CAMIOCOMMAND ioctl above, we only have a
* pointer to the user's CCB, so we have to copy the whole
* thing in to a buffer we have allocated (above) instead
* of allowing the ioctl code to malloc a buffer and copy
* it in.
*
* This is an advantage for this asynchronous interface,
* since we don't want the memory to get freed while the
* CCB is outstanding.
*/
#if 0
xpt_print(periph->path, "Copying user CCB %p to "
"kernel address %p\n", *user_ccb, ccb);
#endif
error = copyin(*user_ccb, ccb, sizeof(*ccb));
if (error != 0) {
xpt_print(periph->path, "Copy of user CCB %p to "
"kernel address %p failed with error %d\n",
*user_ccb, ccb, error);
goto camioqueue_error;
}
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
if (ccb->ccb_h.func_code == XPT_SCSI_IO)
ccb->csio.bio = NULL;
#endif
if (ccb->ccb_h.flags & CAM_UNLOCKED) {
error = EINVAL;
goto camioqueue_error;
}
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
if (ccb->csio.cdb_len > IOCDBLEN) {
error = EINVAL;
goto camioqueue_error;
}
error = copyin(ccb->csio.cdb_io.cdb_ptr,
ccb->csio.cdb_io.cdb_bytes, ccb->csio.cdb_len);
if (error != 0)
goto camioqueue_error;
ccb->ccb_h.flags &= ~CAM_CDB_POINTER;
}
/*
* Some CCB types, like scan bus and scan lun, can only go
* through the transport layer device.
*/
if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) {
xpt_print(periph->path, "CCB function code %#x is "
"restricted to the XPT device\n",
ccb->ccb_h.func_code);
error = ENODEV;
goto camioqueue_error;
}
/*
* Save the user's CCB pointer as well as his linked list
* pointers and peripheral private area so that we can
* restore these later.
*/
io_req->user_ccb_ptr = *user_ccb;
io_req->user_periph_links = ccb->ccb_h.periph_links;
io_req->user_periph_priv = ccb->ccb_h.periph_priv;
/*
* Now that we've saved the user's values, we can set our
* own peripheral private entry.
*/
ccb->ccb_h.ccb_ioreq = io_req;
/* Compatibility for RL/priority-unaware code. */
priority = ccb->ccb_h.pinfo.priority;
if (priority <= CAM_PRIORITY_OOB)
priority += CAM_PRIORITY_OOB + 1;
/*
* Set up fields in the CCB like the path and the priority.
* The path in particular cannot be set in userland, since
* it is a pointer to a kernel data structure.
*/
xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority,
ccb->ccb_h.flags);
/*
* Set up our done routine. There is no way for the user to
* have a valid pointer here.
*/
ccb->ccb_h.cbfcnp = passdone;
fc = ccb->ccb_h.func_code;
/*
* If this function code has memory that can be mapped in
* or out, we need to call passmemsetup().
*/
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO)
|| (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH)
|| (fc == XPT_DEV_ADVINFO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
error = passmemsetup(periph, io_req);
if (error != 0)
goto camioqueue_error;
} else
io_req->mapinfo.num_bufs_used = 0;
cam_periph_lock(periph);
/*
* Everything goes on the incoming queue initially.
*/
TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links);
/*
* If the CCB is queued, and is not a user CCB, then
* we need to allocate a slot for it. Call xpt_schedule()
* so that our start routine will get called when a CCB is
* available.
*/
if ((fc & XPT_FC_QUEUED)
&& ((fc & XPT_FC_USER_CCB) == 0)) {
xpt_schedule(periph, priority);
break;
}
/*
* At this point, the CCB in question is either an
* immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB
* and therefore should be malloced, not allocated via a slot.
* Remove the CCB from the incoming queue and add it to the
* active queue.
*/
TAILQ_REMOVE(&softc->incoming_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links);
xpt_action(ccb);
/*
* If this is not a queued CCB (i.e. it is an immediate CCB),
* then it is already done. We need to put it on the done
* queue for the user to fetch.
*/
if ((fc & XPT_FC_QUEUED) == 0) {
TAILQ_REMOVE(&softc->active_queue, io_req, links);
TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
}
break;
camioqueue_error:
uma_zfree(softc->pass_zone, io_req);
cam_periph_lock(periph);
break;
}
case CAMIOGET:
{
union ccb **user_ccb;
struct pass_io_req *io_req;
int old_error;
user_ccb = (union ccb **)addr;
old_error = 0;
io_req = TAILQ_FIRST(&softc->done_queue);
if (io_req == NULL) {
error = ENOENT;
break;
}
/*
* Remove the I/O from the done queue.
*/
TAILQ_REMOVE(&softc->done_queue, io_req, links);
/*
* We have to drop the lock during the copyout because the
* copyout can result in VM faults that require sleeping.
*/
cam_periph_unlock(periph);
/*
* Do any needed copies (e.g. for reads) and revert the
* pointers in the CCB back to the user's pointers.
*/
error = passmemdone(periph, io_req);
old_error = error;
io_req->ccb.ccb_h.periph_links = io_req->user_periph_links;
io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv;
#if 0
xpt_print(periph->path, "Copying to user CCB %p from "
"kernel address %p\n", *user_ccb, &io_req->ccb);
#endif
error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb));
if (error != 0) {
xpt_print(periph->path, "Copy to user CCB %p from "
"kernel address %p failed with error %d\n",
*user_ccb, &io_req->ccb, error);
}
/*
* Prefer the first error we got back, and make sure we
* don't overwrite bad status with good.
*/
if (old_error != 0)
error = old_error;
cam_periph_lock(periph);
/*
* At this point, if there was an error, we could potentially
* re-queue the I/O and try again. But why? The error
* would almost certainly happen again. We might as well
* not leak memory.
*/
uma_zfree(softc->pass_zone, io_req);
break;
}
default:
error = cam_periph_ioctl(periph, cmd, addr, passerror);
break;
}
bailout:
cam_periph_unlock(periph);
return(error);
}
static int
passpoll(struct cdev *dev, int poll_events, struct thread *td)
{
struct cam_periph *periph;
struct pass_softc *softc;
int revents;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
revents = poll_events & (POLLOUT | POLLWRNORM);
if ((poll_events & (POLLIN | POLLRDNORM)) != 0) {
cam_periph_lock(periph);
if (!TAILQ_EMPTY(&softc->done_queue)) {
revents |= poll_events & (POLLIN | POLLRDNORM);
}
cam_periph_unlock(periph);
if (revents == 0)
selrecord(td, &softc->read_select);
}
return (revents);
}
static int
passkqfilter(struct cdev *dev, struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)dev->si_drv1;
softc = (struct pass_softc *)periph->softc;
kn->kn_hook = (caddr_t)periph;
kn->kn_fop = &passread_filtops;
knlist_add(&softc->read_select.si_note, kn, 0);
return (0);
}
static void
passreadfiltdetach(struct knote *kn)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
knlist_remove(&softc->read_select.si_note, kn, 0);
}
static int
passreadfilt(struct knote *kn, long hint)
{
struct cam_periph *periph;
struct pass_softc *softc;
int retval;
periph = (struct cam_periph *)kn->kn_hook;
softc = (struct pass_softc *)periph->softc;
cam_periph_assert(periph, MA_OWNED);
if (TAILQ_EMPTY(&softc->done_queue))
retval = 0;
else
retval = 1;
return (retval);
}
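/*
 * Hypothetical userland example (not from this file): the poll and
 * kqueue handlers above report the descriptor readable whenever the
 * done queue is non-empty, so a consumer can block until a queued
 * CCB completes with something like
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, INFTIM);
 *	ioctl(fd, CAMIOGET, &ccb);
 */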
/*
* Generally, "ccb" should be the CCB supplied by the kernel. "inccb"
* should be the CCB that is copied in from the user.
*/
static int
passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb)
{
struct pass_softc *softc;
struct cam_periph_map_info mapinfo;
uint8_t *cmd;
xpt_opcode fc;
int error;
softc = (struct pass_softc *)periph->softc;
/*
* There are some fields in the CCB header that need to be
* preserved, the rest we get from the user.
*/
xpt_merge_ccb(ccb, inccb);
if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
cmd = __builtin_alloca(ccb->csio.cdb_len);
error = copyin(ccb->csio.cdb_io.cdb_ptr, cmd, ccb->csio.cdb_len);
if (error)
return (error);
ccb->csio.cdb_io.cdb_ptr = cmd;
}
/*
 * Set our own completion routine; the user cannot supply a
 * valid kernel function pointer here.
 */
ccb->ccb_h.cbfcnp = passdone;
/*
* Let cam_periph_mapmem do a sanity check on the data pointer format.
* Even if no data transfer is needed, it's a cheap check and it
* simplifies the code.
*/
fc = ccb->ccb_h.func_code;
if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO)
|| (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_MMC_IO)
|| (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) {
bzero(&mapinfo, sizeof(mapinfo));
/*
* cam_periph_mapmem calls into proc and vm functions that can
* sleep as well as trigger I/O, so we can't hold the lock.
* Dropping it here is reasonably safe.
*/
cam_periph_unlock(periph);
error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio);
cam_periph_lock(periph);
/*
* cam_periph_mapmem returned an error, we can't continue.
* Return the error to the user.
*/
if (error)
return(error);
} else
/* Ensure that the unmap call later on is a no-op. */
mapinfo.num_bufs_used = 0;
/*
* If the user wants us to perform any error recovery, then honor
* that request. Otherwise, it's up to the user to perform any
* error recovery.
*/
cam_periph_runccb(ccb, (ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) ?
passerror : NULL, /* cam_flags */ CAM_RETRY_SELTO,
/* sense_flags */ SF_RETRY_UA | SF_NO_PRINT,
softc->device_stats);
cam_periph_unmapmem(ccb, &mapinfo);
ccb->ccb_h.cbfcnp = NULL;
ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv;
bcopy(ccb, inccb, sizeof(union ccb));
return(0);
}
static int
passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
struct cam_periph *periph;
struct pass_softc *softc;
periph = xpt_path_periph(ccb->ccb_h.path);
softc = (struct pass_softc *)periph->softc;
return(cam_periph_error(ccb, cam_flags, sense_flags));
}
Index: head/sys/dev/al_eth/al_eth.c
===================================================================
--- head/sys/dev/al_eth/al_eth.c (revision 327172)
+++ head/sys/dev/al_eth/al_eth.c (revision 327173)
@@ -1,3584 +1,3581 @@
/*-
* Copyright (c) 2015,2016 Annapurna Labs Ltd. and affiliates
* All rights reserved.
*
* Developed by Semihalf.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <machine/atomic.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <net/if_vlan_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#endif
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <sys/sockio.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <al_hal_common.h>
#include <al_hal_plat_services.h>
#include <al_hal_udma_config.h>
#include <al_hal_udma_iofic.h>
#include <al_hal_udma_debug.h>
#include <al_hal_eth.h>
#include "al_eth.h"
#include "al_init_eth_lm.h"
#include "arm/annapurna/alpine/alpine_serdes.h"
#include "miibus_if.h"
#define device_printf_dbg(fmt, ...) do { \
if (AL_DBG_LEVEL >= AL_DBG_LEVEL_DBG) { AL_DBG_LOCK(); \
device_printf(fmt, __VA_ARGS__); AL_DBG_UNLOCK();} \
} while (0)
MALLOC_DEFINE(M_IFAL, "if_al_malloc", "All allocated data for AL ETH driver");
/* move out to some pci header file */
#define PCI_VENDOR_ID_ANNAPURNA_LABS 0x1c36
#define PCI_DEVICE_ID_AL_ETH 0x0001
#define PCI_DEVICE_ID_AL_ETH_ADVANCED 0x0002
#define PCI_DEVICE_ID_AL_ETH_NIC 0x0003
#define PCI_DEVICE_ID_AL_ETH_FPGA_NIC 0x0030
#define PCI_DEVICE_ID_AL_CRYPTO 0x0011
#define PCI_DEVICE_ID_AL_CRYPTO_VF 0x8011
#define PCI_DEVICE_ID_AL_RAID_DMA 0x0021
#define PCI_DEVICE_ID_AL_RAID_DMA_VF 0x8021
#define PCI_DEVICE_ID_AL_USB 0x0041
#define MAC_ADDR_STR "%02x:%02x:%02x:%02x:%02x:%02x"
#define MAC_ADDR(addr) addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]
#define AL_ETH_MAC_TABLE_UNICAST_IDX_BASE 0
#define AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT 4
#define AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX (AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + \
AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT)
#define AL_ETH_MAC_TABLE_DROP_IDX (AL_ETH_FWD_MAC_NUM - 1)
#define AL_ETH_MAC_TABLE_BROADCAST_IDX (AL_ETH_MAC_TABLE_DROP_IDX - 1)
#define AL_ETH_THASH_UDMA_SHIFT 0
#define AL_ETH_THASH_UDMA_MASK (0xF << AL_ETH_THASH_UDMA_SHIFT)
#define AL_ETH_THASH_Q_SHIFT 4
#define AL_ETH_THASH_Q_MASK (0x3 << AL_ETH_THASH_Q_SHIFT)
/* the following defines should be moved to hal */
#define AL_ETH_FSM_ENTRY_IPV4_TCP 0
#define AL_ETH_FSM_ENTRY_IPV4_UDP 1
#define AL_ETH_FSM_ENTRY_IPV6_TCP 2
#define AL_ETH_FSM_ENTRY_IPV6_UDP 3
#define AL_ETH_FSM_ENTRY_IPV6_NO_UDP_TCP 4
#define AL_ETH_FSM_ENTRY_IPV4_NO_UDP_TCP 5
/* FSM DATA format */
#define AL_ETH_FSM_DATA_OUTER_2_TUPLE 0
#define AL_ETH_FSM_DATA_OUTER_4_TUPLE 1
#define AL_ETH_FSM_DATA_INNER_2_TUPLE 2
#define AL_ETH_FSM_DATA_INNER_4_TUPLE 3
#define AL_ETH_FSM_DATA_HASH_SEL (1 << 2)
#define AL_ETH_FSM_DATA_DEFAULT_Q 0
#define AL_ETH_FSM_DATA_DEFAULT_UDMA 0
#define AL_BR_SIZE 512
#define AL_TSO_SIZE 65500
#define AL_DEFAULT_MTU 1500
#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP|CSUM_SCTP)
#define AL_IP_ALIGNMENT_OFFSET 2
#define SFP_I2C_ADDR 0x50
#define AL_MASK_GROUP_A_INT 0x7
#define AL_MASK_GROUP_B_INT 0xF
#define AL_MASK_GROUP_C_INT 0xF
#define AL_MASK_GROUP_D_INT 0xFFFFFFFF
#define AL_REG_OFFSET_FORWARD_INTR (0x1800000 + 0x1210)
#define AL_EN_FORWARD_INTR 0x1FFFF
#define AL_DIS_FORWARD_INTR 0
#define AL_M2S_MASK_INIT 0x480
#define AL_S2M_MASK_INIT 0x1E0
#define AL_M2S_S2M_MASK_NOT_INT (0x3f << 25)
#define AL_10BASE_T_SPEED 10
#define AL_100BASE_TX_SPEED 100
#define AL_1000BASE_T_SPEED 1000
static devclass_t al_devclass;
#define AL_RX_LOCK_INIT(_sc) mtx_init(&((_sc)->if_rx_lock), "ALRXL", "ALRXL", MTX_DEF)
#define AL_RX_LOCK(_sc) mtx_lock(&((_sc)->if_rx_lock))
#define AL_RX_UNLOCK(_sc) mtx_unlock(&((_sc)->if_rx_lock))
/* helper functions */
static int al_is_device_supported(device_t);
static void al_eth_init_rings(struct al_eth_adapter *);
static void al_eth_flow_ctrl_disable(struct al_eth_adapter *);
int al_eth_fpga_read_pci_config(void *, int, uint32_t *);
int al_eth_fpga_write_pci_config(void *, int, uint32_t);
int al_eth_read_pci_config(void *, int, uint32_t *);
int al_eth_write_pci_config(void *, int, uint32_t);
void al_eth_irq_config(uint32_t *, uint32_t);
void al_eth_forward_int_config(uint32_t *, uint32_t);
static void al_eth_start_xmit(void *, int);
static void al_eth_rx_recv_work(void *, int);
static int al_eth_up(struct al_eth_adapter *);
static void al_eth_down(struct al_eth_adapter *);
static void al_eth_interrupts_unmask(struct al_eth_adapter *);
static void al_eth_interrupts_mask(struct al_eth_adapter *);
static int al_eth_check_mtu(struct al_eth_adapter *, int);
static uint64_t al_get_counter(struct ifnet *, ift_counter);
static void al_eth_req_rx_buff_size(struct al_eth_adapter *, int);
static int al_eth_board_params_init(struct al_eth_adapter *);
static int al_media_update(struct ifnet *);
static void al_media_status(struct ifnet *, struct ifmediareq *);
static int al_eth_function_reset(struct al_eth_adapter *);
static int al_eth_hw_init_adapter(struct al_eth_adapter *);
static void al_eth_serdes_init(struct al_eth_adapter *);
static void al_eth_lm_config(struct al_eth_adapter *);
static int al_eth_hw_init(struct al_eth_adapter *);
static void al_tick_stats(void *);
/* ifnet entry points */
static void al_init(void *);
static int al_mq_start(struct ifnet *, struct mbuf *);
static void al_qflush(struct ifnet *);
static int al_ioctl(struct ifnet * ifp, u_long, caddr_t);
/* bus entry points */
static int al_probe(device_t);
static int al_attach(device_t);
static int al_detach(device_t);
static int al_shutdown(device_t);
/* mii bus support routines */
static int al_miibus_readreg(device_t, int, int);
static int al_miibus_writereg(device_t, int, int, int);
static void al_miibus_statchg(device_t);
static void al_miibus_linkchg(device_t);
struct al_eth_adapter* g_adapters[16];
uint32_t g_adapters_count;
/* flag for napi-like mbuf processing, controlled from sysctl */
static int napi = 0;
static device_method_t al_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, al_probe),
DEVMETHOD(device_attach, al_attach),
DEVMETHOD(device_detach, al_detach),
DEVMETHOD(device_shutdown, al_shutdown),
DEVMETHOD(miibus_readreg, al_miibus_readreg),
DEVMETHOD(miibus_writereg, al_miibus_writereg),
DEVMETHOD(miibus_statchg, al_miibus_statchg),
DEVMETHOD(miibus_linkchg, al_miibus_linkchg),
{ 0, 0 }
};
static driver_t al_driver = {
"al",
al_methods,
sizeof(struct al_eth_adapter),
};
DRIVER_MODULE(al, pci, al_driver, al_devclass, 0, 0);
DRIVER_MODULE(miibus, al, miibus_driver, miibus_devclass, 0, 0);
static int
al_probe(device_t dev)
{
if ((al_is_device_supported(dev)) != 0) {
device_set_desc(dev, "al");
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
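/*
 * al_attach - map the UDMA, MAC and EC register BARs, create and configure
 * the ifnet, read the board parameters, and bring up the HAL adapter, the
 * rings and (for RGMII) the PHY via miibus.
 */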
static int
al_attach(device_t dev)
{
- struct al_eth_lm_context *lm_context;
struct al_eth_adapter *adapter;
struct sysctl_oid_list *child;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
struct ifnet *ifp;
uint32_t dev_id;
uint32_t rev_id;
int bar_udma;
int bar_mac;
int bar_ec;
int err;
err = 0;
ifp = NULL;
dev_id = rev_id = 0;
ctx = device_get_sysctl_ctx(dev);
tree = SYSCTL_PARENT(device_get_sysctl_tree(dev));
child = SYSCTL_CHILDREN(tree);
if (g_adapters_count == 0) {
SYSCTL_ADD_INT(ctx, child, OID_AUTO, "napi",
CTLFLAG_RW, &napi, 0, "Use pseudo-napi mechanism");
}
adapter = device_get_softc(dev);
adapter->dev = dev;
adapter->board_type = ALPINE_INTEGRATED;
snprintf(adapter->name, AL_ETH_NAME_MAX_LEN, "%s",
device_get_nameunit(dev));
AL_RX_LOCK_INIT(adapter);
g_adapters[g_adapters_count] = adapter;
-
- lm_context = &adapter->lm_context;
bar_udma = PCIR_BAR(AL_ETH_UDMA_BAR);
adapter->udma_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&bar_udma, RF_ACTIVE);
if (adapter->udma_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for DMA.\n");
err = ENOMEM;
goto err_res_dma;
}
adapter->udma_base = al_bus_dma_to_va(rman_get_bustag(adapter->udma_res),
rman_get_bushandle(adapter->udma_res));
bar_mac = PCIR_BAR(AL_ETH_MAC_BAR);
adapter->mac_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&bar_mac, RF_ACTIVE);
if (adapter->mac_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for MAC.\n");
err = ENOMEM;
goto err_res_mac;
}
adapter->mac_base = al_bus_dma_to_va(rman_get_bustag(adapter->mac_res),
rman_get_bushandle(adapter->mac_res));
bar_ec = PCIR_BAR(AL_ETH_EC_BAR);
adapter->ec_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar_ec,
RF_ACTIVE);
if (adapter->ec_res == NULL) {
device_printf(adapter->dev,
"could not allocate memory resources for EC.\n");
err = ENOMEM;
goto err_res_ec;
}
adapter->ec_base = al_bus_dma_to_va(rman_get_bustag(adapter->ec_res),
rman_get_bushandle(adapter->ec_res));
adapter->netdev = ifp = if_alloc(IFT_ETHER);
adapter->netdev->if_link_state = LINK_STATE_DOWN;
ifp->if_softc = adapter;
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
ifp->if_flags = ifp->if_drv_flags;
ifp->if_flags |= IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_ALLMULTI;
ifp->if_transmit = al_mq_start;
ifp->if_qflush = al_qflush;
ifp->if_ioctl = al_ioctl;
ifp->if_init = al_init;
ifp->if_get_counter = al_get_counter;
ifp->if_mtu = AL_DEFAULT_MTU;
adapter->if_flags = ifp->if_flags;
ifp->if_capabilities = ifp->if_capenable = 0;
ifp->if_capabilities |= IFCAP_HWCSUM |
IFCAP_HWCSUM_IPV6 | IFCAP_TSO |
IFCAP_LRO | IFCAP_JUMBO_MTU;
ifp->if_capenable = ifp->if_capabilities;
adapter->id_number = g_adapters_count;
if (adapter->board_type == ALPINE_INTEGRATED) {
dev_id = pci_get_device(adapter->dev);
rev_id = pci_get_revid(adapter->dev);
} else {
al_eth_fpga_read_pci_config(adapter->internal_pcie_base,
PCIR_DEVICE, &dev_id);
al_eth_fpga_read_pci_config(adapter->internal_pcie_base,
PCIR_REVID, &rev_id);
}
adapter->dev_id = dev_id;
adapter->rev_id = rev_id;
/* set default ring sizes */
adapter->tx_ring_count = AL_ETH_DEFAULT_TX_SW_DESCS;
adapter->tx_descs_count = AL_ETH_DEFAULT_TX_HW_DESCS;
adapter->rx_ring_count = AL_ETH_DEFAULT_RX_DESCS;
adapter->rx_descs_count = AL_ETH_DEFAULT_RX_DESCS;
adapter->num_tx_queues = AL_ETH_NUM_QUEUES;
adapter->num_rx_queues = AL_ETH_NUM_QUEUES;
adapter->small_copy_len = AL_ETH_DEFAULT_SMALL_PACKET_LEN;
adapter->link_poll_interval = AL_ETH_DEFAULT_LINK_POLL_INTERVAL;
adapter->max_rx_buff_alloc_size = AL_ETH_DEFAULT_MAX_RX_BUFF_ALLOC_SIZE;
al_eth_req_rx_buff_size(adapter, adapter->netdev->if_mtu);
adapter->link_config.force_1000_base_x = AL_ETH_DEFAULT_FORCE_1000_BASEX;
err = al_eth_board_params_init(adapter);
if (err != 0)
goto err;
if (adapter->mac_mode == AL_ETH_MAC_MODE_10GbE_Serial) {
ifmedia_init(&adapter->media, IFM_IMASK,
al_media_update, al_media_status);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_LX, 0, NULL);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_LR, 0, NULL);
ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL);
ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO);
}
al_eth_function_reset(adapter);
err = al_eth_hw_init_adapter(adapter);
if (err != 0)
goto err;
al_eth_init_rings(adapter);
g_adapters_count++;
al_eth_lm_config(adapter);
mtx_init(&adapter->stats_mtx, "AlStatsMtx", NULL, MTX_DEF);
mtx_init(&adapter->wd_mtx, "AlWdMtx", NULL, MTX_DEF);
callout_init_mtx(&adapter->stats_callout, &adapter->stats_mtx, 0);
callout_init_mtx(&adapter->wd_callout, &adapter->wd_mtx, 0);
ether_ifattach(ifp, adapter->mac_addr);
ifp->if_mtu = AL_DEFAULT_MTU;
if (adapter->mac_mode == AL_ETH_MAC_MODE_RGMII) {
al_eth_hw_init(adapter);
/* Attach PHY(s) */
err = mii_attach(adapter->dev, &adapter->miibus, adapter->netdev,
al_media_update, al_media_status, BMSR_DEFCAPMASK, 0,
MII_OFFSET_ANY, 0);
if (err != 0) {
device_printf(adapter->dev, "attaching PHYs failed\n");
return (err);
}
adapter->mii = device_get_softc(adapter->miibus);
}
return (err);
err:
bus_release_resource(dev, SYS_RES_MEMORY, bar_ec, adapter->ec_res);
err_res_ec:
bus_release_resource(dev, SYS_RES_MEMORY, bar_mac, adapter->mac_res);
err_res_mac:
bus_release_resource(dev, SYS_RES_MEMORY, bar_udma, adapter->udma_res);
err_res_dma:
return (err);
}
static int
al_detach(device_t dev)
{
struct al_eth_adapter *adapter;
adapter = device_get_softc(dev);
ether_ifdetach(adapter->netdev);
mtx_destroy(&adapter->stats_mtx);
mtx_destroy(&adapter->wd_mtx);
al_eth_down(adapter);
bus_release_resource(dev, SYS_RES_IRQ, 0, adapter->irq_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->ec_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->mac_res);
bus_release_resource(dev, SYS_RES_MEMORY, 0, adapter->udma_res);
return (0);
}
int
al_eth_fpga_read_pci_config(void *handle, int where, uint32_t *val)
{
/* handle is the base address of the adapter */
*val = al_reg_read32((void*)((u_long)handle + where));
return (0);
}
int
al_eth_fpga_write_pci_config(void *handle, int where, uint32_t val)
{
/* handle is the base address of the adapter */
al_reg_write32((void*)((u_long)handle + where), val);
return (0);
}
int
al_eth_read_pci_config(void *handle, int where, uint32_t *val)
{
/* handle is a device_t */
*val = pci_read_config((device_t)handle, where, sizeof(*val));
return (0);
}
int
al_eth_write_pci_config(void *handle, int where, uint32_t val)
{
/* handle is a device_t */
pci_write_config((device_t)handle, where, val, sizeof(val));
return (0);
}
void
al_eth_irq_config(uint32_t *offset, uint32_t value)
{
al_reg_write32_relaxed(offset, value);
}
void
al_eth_forward_int_config(uint32_t *offset, uint32_t value)
{
al_reg_write32(offset, value);
}
static void
al_eth_serdes_init(struct al_eth_adapter *adapter)
{
void __iomem *serdes_base;
adapter->serdes_init = false;
serdes_base = alpine_serdes_resource_get(adapter->serdes_grp);
if (serdes_base == NULL) {
device_printf(adapter->dev, "serdes_base get failed!\n");
return;
}
serdes_base = al_bus_dma_to_va(serdes_tag, serdes_base);
al_serdes_handle_grp_init(serdes_base, adapter->serdes_grp,
&adapter->serdes_obj);
adapter->serdes_init = true;
}
static void
al_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
bus_addr_t *paddr;
paddr = arg;
*paddr = segs->ds_addr;
}
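/*
 * al_dma_alloc_coherent - allocate a coherent DMA buffer of at least `size'
 * bytes (the tag is sized up to a multiple of PAGE_SIZE) and return the bus
 * tag, map, bus address and kernel virtual address to the caller.
 */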
static int
al_dma_alloc_coherent(struct device *dev, bus_dma_tag_t *tag, bus_dmamap_t *map,
bus_addr_t *baddr, void **vaddr, uint32_t size)
{
int ret;
uint32_t maxsize = ((size - 1)/PAGE_SIZE + 1) * PAGE_SIZE;
ret = bus_dma_tag_create(bus_get_dma_tag(dev), 8, 0,
BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
maxsize, 1, maxsize, BUS_DMA_COHERENT, NULL, NULL, tag);
if (ret != 0) {
device_printf(dev,
"failed to create bus tag, ret = %d\n", ret);
return (ret);
}
ret = bus_dmamem_alloc(*tag, vaddr,
BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
if (ret != 0) {
device_printf(dev,
"failed to allocate dmamem, ret = %d\n", ret);
return (ret);
}
ret = bus_dmamap_load(*tag, *map, *vaddr,
size, al_dma_map_addr, baddr, 0);
if (ret != 0) {
device_printf(dev,
"failed to allocate bus_dmamap_load, ret = %d\n", ret);
return (ret);
}
return (0);
}
static void
al_dma_free_coherent(bus_dma_tag_t tag, bus_dmamap_t map, void *vaddr)
{
bus_dmamap_unload(tag, map);
bus_dmamem_free(tag, vaddr, map);
bus_dma_tag_destroy(tag);
}
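/*
 * al_eth_mac_table_unicast_add - program the adapter's own MAC address,
 * fully masked, into the RX forwarding MAC table at index idx.
 */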
static void
al_eth_mac_table_unicast_add(struct al_eth_adapter *adapter,
uint8_t idx, uint8_t *addr, uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memcpy(entry.addr, adapter->mac_addr, sizeof(adapter->mac_addr));
memset(entry.mask, 0xff, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_all_multicast_add(struct al_eth_adapter *adapter, uint8_t idx,
uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0x00, sizeof(entry.addr));
memset(entry.mask, 0x00, sizeof(entry.mask));
entry.mask[0] |= 1;
entry.addr[0] |= 1;
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_broadcast_add(struct al_eth_adapter *adapter,
uint8_t idx, uint8_t udma_mask)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0xff, sizeof(entry.addr));
memset(entry.mask, 0xff, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = udma_mask;
entry.filter = false;
device_printf_dbg(adapter->dev,
"%s: [%d]: addr "MAC_ADDR_STR" mask "MAC_ADDR_STR"\n",
__func__, idx, MAC_ADDR(entry.addr), MAC_ADDR(entry.mask));
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
static void
al_eth_mac_table_promiscuous_set(struct al_eth_adapter *adapter,
boolean_t promiscuous)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
memset(entry.addr, 0x00, sizeof(entry.addr));
memset(entry.mask, 0x00, sizeof(entry.mask));
entry.rx_valid = true;
entry.tx_valid = false;
entry.udma_mask = (promiscuous) ? 1 : 0;
entry.filter = (promiscuous) ? false : true;
device_printf_dbg(adapter->dev, "%s: %s promiscuous mode\n",
__func__, (promiscuous) ? "enter" : "exit");
al_eth_fwd_mac_table_set(&adapter->hal_adapter,
AL_ETH_MAC_TABLE_DROP_IDX, &entry);
}
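/*
 * al_eth_set_thash_table_entry - set a single RSS indirection (thash) table
 * entry; only UDMA 0 and queue numbers below AL_ETH_NUM_QUEUES are accepted.
 */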
static void
al_eth_set_thash_table_entry(struct al_eth_adapter *adapter, uint8_t idx,
uint8_t udma, uint32_t queue)
{
if (udma != 0)
panic("only UDMA0 is supporter");
if (queue >= AL_ETH_NUM_QUEUES)
panic("invalid queue number");
al_eth_thash_table_set(&adapter->hal_adapter, idx, udma, queue);
}
/* init FSM, no tunneling supported yet, if packet is tcp/udp over ipv4/ipv6, use 4 tuple hash */
static void
al_eth_fsm_table_init(struct al_eth_adapter *adapter)
{
uint32_t val;
int i;
for (i = 0; i < AL_ETH_RX_FSM_TABLE_SIZE; i++) {
uint8_t outer_type = AL_ETH_FSM_ENTRY_OUTER(i);
switch (outer_type) {
case AL_ETH_FSM_ENTRY_IPV4_TCP:
case AL_ETH_FSM_ENTRY_IPV4_UDP:
case AL_ETH_FSM_ENTRY_IPV6_TCP:
case AL_ETH_FSM_ENTRY_IPV6_UDP:
val = AL_ETH_FSM_DATA_OUTER_4_TUPLE |
AL_ETH_FSM_DATA_HASH_SEL;
break;
case AL_ETH_FSM_ENTRY_IPV6_NO_UDP_TCP:
case AL_ETH_FSM_ENTRY_IPV4_NO_UDP_TCP:
val = AL_ETH_FSM_DATA_OUTER_2_TUPLE |
AL_ETH_FSM_DATA_HASH_SEL;
break;
default:
val = AL_ETH_FSM_DATA_DEFAULT_Q |
AL_ETH_FSM_DATA_DEFAULT_UDMA;
}
al_eth_fsm_table_set(&adapter->hal_adapter, i, val);
}
}
static void
al_eth_mac_table_entry_clear(struct al_eth_adapter *adapter,
uint8_t idx)
{
struct al_eth_fwd_mac_table_entry entry = { { 0 } };
device_printf_dbg(adapter->dev, "%s: clear entry %d\n", __func__, idx);
al_eth_fwd_mac_table_set(&adapter->hal_adapter, idx, &entry);
}
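/*
 * al_eth_hw_init_adapter - fill the HAL adapter parameters from the softc
 * and initialize the HAL adapter; in the PCIe NIC modes all UDMA queues are
 * also forced to use the PCIE0 target-id.
 */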
static int
al_eth_hw_init_adapter(struct al_eth_adapter *adapter)
{
struct al_eth_adapter_params *params = &adapter->eth_hal_params;
int rc;
/* params->dev_id = adapter->dev_id; */
params->rev_id = adapter->rev_id;
params->udma_id = 0;
params->enable_rx_parser = 1; /* enable rx epe parser*/
params->udma_regs_base = adapter->udma_base; /* UDMA register base address */
params->ec_regs_base = adapter->ec_base; /* Ethernet controller registers base address */
params->mac_regs_base = adapter->mac_base; /* Ethernet MAC registers base address */
params->name = adapter->name;
params->serdes_lane = adapter->serdes_lane;
rc = al_eth_adapter_init(&adapter->hal_adapter, params);
if (rc != 0)
device_printf(adapter->dev, "%s failed at hal init!\n",
__func__);
if ((adapter->board_type == ALPINE_NIC) ||
(adapter->board_type == ALPINE_FPGA_NIC)) {
/* in pcie NIC mode, force eth UDMA to access PCIE0 using the vmid */
struct al_udma_gen_tgtid_conf conf;
int i;
for (i = 0; i < DMA_MAX_Q; i++) {
conf.tx_q_conf[i].queue_en = AL_TRUE;
conf.tx_q_conf[i].desc_en = AL_FALSE;
conf.tx_q_conf[i].tgtid = 0x100; /* for access from PCIE0 */
conf.rx_q_conf[i].queue_en = AL_TRUE;
conf.rx_q_conf[i].desc_en = AL_FALSE;
conf.rx_q_conf[i].tgtid = 0x100; /* for access from PCIE0 */
}
al_udma_gen_tgtid_conf_set(adapter->udma_base, &conf);
}
return (rc);
}
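/*
 * al_eth_lm_config - build the link management parameters (SFP detection,
 * default link mode, retimer settings) and initialize the LM context.
 */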
static void
al_eth_lm_config(struct al_eth_adapter *adapter)
{
struct al_eth_lm_init_params params = {0};
params.adapter = &adapter->hal_adapter;
params.serdes_obj = &adapter->serdes_obj;
params.lane = adapter->serdes_lane;
params.sfp_detection = adapter->sfp_detection_needed;
if (adapter->sfp_detection_needed == true) {
params.sfp_bus_id = adapter->i2c_adapter_id;
params.sfp_i2c_addr = SFP_I2C_ADDR;
}
if (adapter->sfp_detection_needed == false) {
switch (adapter->mac_mode) {
case AL_ETH_MAC_MODE_10GbE_Serial:
if ((adapter->lt_en != 0) && (adapter->an_en != 0))
params.default_mode = AL_ETH_LM_MODE_10G_DA;
else
params.default_mode = AL_ETH_LM_MODE_10G_OPTIC;
break;
case AL_ETH_MAC_MODE_SGMII:
params.default_mode = AL_ETH_LM_MODE_1G;
break;
default:
params.default_mode = AL_ETH_LM_MODE_10G_DA;
}
} else
params.default_mode = AL_ETH_LM_MODE_10G_DA;
params.link_training = adapter->lt_en;
params.rx_equal = true;
params.static_values = !adapter->dont_override_serdes;
params.i2c_context = adapter;
params.kr_fec_enable = false;
params.retimer_exist = adapter->retimer.exist;
params.retimer_bus_id = adapter->retimer.bus_id;
params.retimer_i2c_addr = adapter->retimer.i2c_addr;
params.retimer_channel = adapter->retimer.channel;
al_eth_lm_init(&adapter->lm_context, &params);
}
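/*
 * al_eth_board_params_init - derive the MAC mode, PHY/SFP presence, link
 * speed and MDIO frequency either from the fixed NIC/FPGA defaults or from
 * the board parameters kept in the controller, then read the MAC address.
 */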
static int
al_eth_board_params_init(struct al_eth_adapter *adapter)
{
if (adapter->board_type == ALPINE_NIC) {
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
adapter->sfp_detection_needed = false;
adapter->phy_exist = false;
adapter->an_en = false;
adapter->lt_en = false;
adapter->ref_clk_freq = AL_ETH_REF_FREQ_375_MHZ;
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
} else if (adapter->board_type == ALPINE_FPGA_NIC) {
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
adapter->sfp_detection_needed = false;
adapter->phy_exist = false;
adapter->an_en = false;
adapter->lt_en = false;
adapter->ref_clk_freq = AL_ETH_REF_FREQ_375_MHZ;
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
} else {
struct al_eth_board_params params;
int rc;
adapter->auto_speed = false;
rc = al_eth_board_params_get(adapter->mac_base, &params);
if (rc != 0) {
device_printf(adapter->dev,
"board info not available\n");
return (-1);
}
adapter->phy_exist = params.phy_exist == TRUE;
adapter->phy_addr = params.phy_mdio_addr;
adapter->an_en = params.autoneg_enable;
adapter->lt_en = params.kr_lt_enable;
adapter->serdes_grp = params.serdes_grp;
adapter->serdes_lane = params.serdes_lane;
adapter->sfp_detection_needed = params.sfp_plus_module_exist;
adapter->i2c_adapter_id = params.i2c_adapter_id;
adapter->ref_clk_freq = params.ref_clk_freq;
adapter->dont_override_serdes = params.dont_override_serdes;
adapter->link_config.active_duplex = !params.half_duplex;
adapter->link_config.autoneg = !params.an_disable;
adapter->link_config.force_1000_base_x = params.force_1000_base_x;
adapter->retimer.exist = params.retimer_exist;
adapter->retimer.bus_id = params.retimer_bus_id;
adapter->retimer.i2c_addr = params.retimer_i2c_addr;
adapter->retimer.channel = params.retimer_channel;
switch (params.speed) {
default:
device_printf(adapter->dev,
"%s: invalid speed (%d)\n", __func__, params.speed);
case AL_ETH_BOARD_1G_SPEED_1000M:
adapter->link_config.active_speed = 1000;
break;
case AL_ETH_BOARD_1G_SPEED_100M:
adapter->link_config.active_speed = 100;
break;
case AL_ETH_BOARD_1G_SPEED_10M:
adapter->link_config.active_speed = 10;
break;
}
switch (params.mdio_freq) {
default:
device_printf(adapter->dev,
"%s: invalid mdio freq (%d)\n", __func__,
params.mdio_freq);
case AL_ETH_BOARD_MDIO_FREQ_2_5_MHZ:
adapter->mdio_freq = AL_ETH_DEFAULT_MDIO_FREQ_KHZ;
break;
case AL_ETH_BOARD_MDIO_FREQ_1_MHZ:
adapter->mdio_freq = AL_ETH_MDIO_FREQ_1000_KHZ;
break;
}
switch (params.media_type) {
case AL_ETH_BOARD_MEDIA_TYPE_RGMII:
if (params.sfp_plus_module_exist == TRUE)
/* Backward compatibility */
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
else
adapter->mac_mode = AL_ETH_MAC_MODE_RGMII;
adapter->use_lm = false;
break;
case AL_ETH_BOARD_MEDIA_TYPE_SGMII:
adapter->mac_mode = AL_ETH_MAC_MODE_SGMII;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_10GBASE_SR:
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_AUTO_DETECT:
adapter->sfp_detection_needed = TRUE;
adapter->auto_speed = false;
adapter->use_lm = true;
break;
case AL_ETH_BOARD_MEDIA_TYPE_AUTO_DETECT_AUTO_SPEED:
adapter->sfp_detection_needed = TRUE;
adapter->auto_speed = true;
adapter->mac_mode_set = false;
adapter->use_lm = true;
adapter->mac_mode = AL_ETH_MAC_MODE_10GbE_Serial;
break;
default:
device_printf(adapter->dev,
"%s: unsupported media type %d\n",
__func__, params.media_type);
return (-1);
}
device_printf(adapter->dev,
"Board info: phy exist %s. phy addr %d. mdio freq %u Khz. "
"SFP connected %s. media %d\n",
params.phy_exist == TRUE ? "Yes" : "No",
params.phy_mdio_addr, adapter->mdio_freq,
params.sfp_plus_module_exist == TRUE ? "Yes" : "No",
params.media_type);
}
al_eth_mac_addr_read(adapter->ec_base, 0, adapter->mac_addr);
return (0);
}
static int
al_eth_function_reset(struct al_eth_adapter *adapter)
{
struct al_eth_board_params params;
int rc;
/* save the board params so we can restore them after reset */
al_eth_board_params_get(adapter->mac_base, &params);
al_eth_mac_addr_read(adapter->ec_base, 0, adapter->mac_addr);
if (adapter->board_type == ALPINE_INTEGRATED)
rc = al_eth_flr_rmn(&al_eth_read_pci_config,
&al_eth_write_pci_config,
adapter->dev, adapter->mac_base);
else
rc = al_eth_flr_rmn(&al_eth_fpga_read_pci_config,
&al_eth_fpga_write_pci_config,
adapter->internal_pcie_base, adapter->mac_base);
/* restore params */
al_eth_board_params_set(adapter->mac_base, &params);
al_eth_mac_addr_store(adapter->ec_base, 0, adapter->mac_addr);
return (rc);
}
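/*
 * al_eth_init_rings - initialize the per-queue TX and RX ring bookkeeping:
 * UDMA queue handles, descriptor counts and interrupt unmask offsets/values.
 */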
static void
al_eth_init_rings(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++) {
struct al_eth_ring *ring = &adapter->tx_ring[i];
ring->ring_id = i;
ring->dev = adapter->dev;
ring->adapter = adapter;
ring->netdev = adapter->netdev;
al_udma_q_handle_get(&adapter->hal_adapter.tx_udma, i,
&ring->dma_q);
ring->sw_count = adapter->tx_ring_count;
ring->hw_count = adapter->tx_descs_count;
ring->unmask_reg_offset = al_udma_iofic_unmask_offset_get((struct unit_regs *)adapter->udma_base, AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_C);
ring->unmask_val = ~(1 << i);
}
for (i = 0; i < adapter->num_rx_queues; i++) {
struct al_eth_ring *ring = &adapter->rx_ring[i];
ring->ring_id = i;
ring->dev = adapter->dev;
ring->adapter = adapter;
ring->netdev = adapter->netdev;
al_udma_q_handle_get(&adapter->hal_adapter.rx_udma, i, &ring->dma_q);
ring->sw_count = adapter->rx_ring_count;
ring->hw_count = adapter->rx_descs_count;
ring->unmask_reg_offset = al_udma_iofic_unmask_offset_get(
(struct unit_regs *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_B);
ring->unmask_val = ~(1 << i);
}
}
static void
al_init_locked(void *arg)
{
struct al_eth_adapter *adapter = arg;
if_t ifp = adapter->netdev;
int rc = 0;
al_eth_down(adapter);
rc = al_eth_up(adapter);
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
if (rc == 0)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
static void
al_init(void *arg)
{
struct al_eth_adapter *adapter = arg;
al_init_locked(adapter);
}
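/*
 * al_eth_alloc_rx_buf - allocate an mbuf cluster for one RX slot, load it
 * for DMA and record the bus address, offset by AL_IP_ALIGNMENT_OFFSET so
 * that the IP header ends up aligned.
 */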
static inline int
al_eth_alloc_rx_buf(struct al_eth_adapter *adapter,
struct al_eth_ring *rx_ring,
struct al_eth_rx_buffer *rx_info)
{
struct al_buf *al_buf;
bus_dma_segment_t segs[2];
int error;
int nsegs;
if (rx_info->m != NULL)
return (0);
rx_info->data_size = adapter->rx_mbuf_sz;
AL_RX_LOCK(adapter);
/* Get mbuf using UMA allocator */
rx_info->m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
rx_info->data_size);
AL_RX_UNLOCK(adapter);
if (rx_info->m == NULL)
return (ENOMEM);
rx_info->m->m_pkthdr.len = rx_info->m->m_len = adapter->rx_mbuf_sz;
/* Map packets for DMA */
error = bus_dmamap_load_mbuf_sg(rx_ring->dma_buf_tag, rx_info->dma_map,
rx_info->m, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(error)) {
device_printf(rx_ring->dev, "failed to map mbuf, error = %d\n",
error);
m_freem(rx_info->m);
rx_info->m = NULL;
return (EFAULT);
}
al_buf = &rx_info->al_buf;
al_buf->addr = segs[0].ds_addr + AL_IP_ALIGNMENT_OFFSET;
al_buf->len = rx_info->data_size - AL_IP_ALIGNMENT_OFFSET;
return (0);
}
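/*
 * al_eth_refill_rx_bufs - post up to `num' fresh RX buffers to the given
 * queue and return the number of buffers actually added.
 */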
static int
al_eth_refill_rx_bufs(struct al_eth_adapter *adapter, unsigned int qid,
unsigned int num)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
uint16_t next_to_use;
unsigned int i;
next_to_use = rx_ring->next_to_use;
for (i = 0; i < num; i++) {
int rc;
struct al_eth_rx_buffer *rx_info =
&rx_ring->rx_buffer_info[next_to_use];
if (__predict_false(al_eth_alloc_rx_buf(adapter,
rx_ring, rx_info) < 0)) {
device_printf(adapter->dev,
"failed to alloc buffer for rx queue %d\n", qid);
break;
}
rc = al_eth_rx_buffer_add(rx_ring->dma_q,
&rx_info->al_buf, AL_ETH_RX_FLAGS_INT, NULL);
if (__predict_false(rc)) {
device_printf(adapter->dev,
"failed to add buffer for rx queue %d\n", qid);
break;
}
next_to_use = AL_ETH_RX_RING_IDX_NEXT(rx_ring, next_to_use);
}
if (__predict_false(i < num))
device_printf(adapter->dev,
"refilled rx queue %d with %d pages only - available %d\n",
qid, i, al_udma_available_get(rx_ring->dma_q));
if (__predict_true(i))
al_eth_rx_buffer_action(rx_ring->dma_q, i);
rx_ring->next_to_use = next_to_use;
return (i);
}
/*
* al_eth_refill_all_rx_bufs - allocate Rx buffers for all queues
* @adapter: board private structure
*/
static void
al_eth_refill_all_rx_bufs(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_rx_queues; i++)
al_eth_refill_rx_bufs(adapter, i, AL_ETH_DEFAULT_RX_DESCS - 1);
}
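/*
 * al_eth_tx_do_cleanup - reclaim completed TX descriptors: unload the DMA
 * maps, free the transmitted mbufs and advance next_to_clean.
 */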
static void
al_eth_tx_do_cleanup(struct al_eth_ring *tx_ring)
{
unsigned int total_done;
uint16_t next_to_clean;
int qid = tx_ring->ring_id;
total_done = al_eth_comp_tx_get(tx_ring->dma_q);
device_printf_dbg(tx_ring->dev,
"tx_poll: q %d total completed descs %x\n", qid, total_done);
next_to_clean = tx_ring->next_to_clean;
while (total_done != 0) {
struct al_eth_tx_buffer *tx_info;
struct mbuf *mbuf;
tx_info = &tx_ring->tx_buffer_info[next_to_clean];
/* stop if not all descriptors of the packet are completed */
if (tx_info->tx_descs > total_done)
break;
mbuf = tx_info->m;
tx_info->m = NULL;
device_printf_dbg(tx_ring->dev,
"tx_poll: q %d mbuf %p completed\n", qid, mbuf);
/* map is no longer required */
bus_dmamap_unload(tx_ring->dma_buf_tag, tx_info->dma_map);
m_freem(mbuf);
total_done -= tx_info->tx_descs;
next_to_clean = AL_ETH_TX_RING_IDX_NEXT(tx_ring, next_to_clean);
}
tx_ring->next_to_clean = next_to_clean;
device_printf_dbg(tx_ring->dev, "tx_poll: q %d done next to clean %x\n",
qid, next_to_clean);
/*
* Make the ring's circular update visible to al_eth_start_xmit()
* before it checks the queue stall flag.
*/
al_smp_data_memory_barrier();
}
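/*
 * al_eth_tx_csum - translate the mbuf checksum/TSO offload requests into HAL
 * packet flags and meta data (L3/L4 protocol indices, header lengths, MSS).
 */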
static void
al_eth_tx_csum(struct al_eth_ring *tx_ring, struct al_eth_tx_buffer *tx_info,
struct al_eth_pkt *hal_pkt, struct mbuf *m)
{
uint32_t mss = m->m_pkthdr.tso_segsz;
struct ether_vlan_header *eh;
uint16_t etype;
struct ip *ip;
struct ip6_hdr *ip6;
struct tcphdr *th = NULL;
int ehdrlen, ip_hlen = 0;
uint8_t ipproto = 0;
uint32_t offload = 0;
if (mss != 0)
offload = 1;
if ((m->m_pkthdr.csum_flags & CSUM_TSO) != 0)
offload = 1;
if ((m->m_pkthdr.csum_flags & CSUM_OFFLOAD) != 0)
offload = 1;
if (offload != 0) {
struct al_eth_meta_data *meta = &tx_ring->hal_meta;
if (mss != 0)
hal_pkt->flags |= (AL_ETH_TX_FLAGS_TSO |
AL_ETH_TX_FLAGS_L4_CSUM);
else
hal_pkt->flags |= (AL_ETH_TX_FLAGS_L4_CSUM |
AL_ETH_TX_FLAGS_L4_PARTIAL_CSUM);
/*
* Determine where frame payload starts.
* Jump over vlan headers if already present,
* helpful for QinQ too.
*/
eh = mtod(m, struct ether_vlan_header *);
if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
etype = ntohs(eh->evl_proto);
ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
} else {
etype = ntohs(eh->evl_encap_proto);
ehdrlen = ETHER_HDR_LEN;
}
switch (etype) {
case ETHERTYPE_IP:
ip = (struct ip *)(m->m_data + ehdrlen);
ip_hlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
hal_pkt->l3_proto_idx = AL_ETH_PROTO_ID_IPv4;
th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
if (mss != 0)
hal_pkt->flags |= AL_ETH_TX_FLAGS_IPV4_L3_CSUM;
if (ipproto == IPPROTO_TCP)
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_TCP;
else
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_UDP;
break;
case ETHERTYPE_IPV6:
ip6 = (struct ip6_hdr *)(m->m_data + ehdrlen);
hal_pkt->l3_proto_idx = AL_ETH_PROTO_ID_IPv6;
ip_hlen = sizeof(struct ip6_hdr);
th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
ipproto = ip6->ip6_nxt;
if (ipproto == IPPROTO_TCP)
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_TCP;
else
hal_pkt->l4_proto_idx = AL_ETH_PROTO_ID_UDP;
break;
default:
break;
}
meta->words_valid = 4;
meta->l3_header_len = ip_hlen;
meta->l3_header_offset = ehdrlen;
if (th != NULL)
meta->l4_header_len = th->th_off; /* this param needed only for TSO */
meta->mss_idx_sel = 0; /* check how to select MSS */
meta->mss_val = mss;
hal_pkt->meta = meta;
} else
hal_pkt->meta = NULL;
}
#define XMIT_QUEUE_TIMEOUT 100
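/*
 * al_eth_xmit_mbuf - map an mbuf chain for DMA (defragmenting once on
 * EFBIG), fill the HAL packet and hand it to the TX UDMA queue; the ring is
 * stalled when descriptor space runs low.
 */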
static void
al_eth_xmit_mbuf(struct al_eth_ring *tx_ring, struct mbuf *m)
{
struct al_eth_tx_buffer *tx_info;
int error;
int nsegs, a;
uint16_t next_to_use;
bus_dma_segment_t segs[AL_ETH_PKT_MAX_BUFS + 1];
struct al_eth_pkt *hal_pkt;
struct al_buf *al_buf;
boolean_t remap;
/* Check if queue is ready */
if (unlikely(tx_ring->stall) != 0) {
for (a = 0; a < XMIT_QUEUE_TIMEOUT; a++) {
if (al_udma_available_get(tx_ring->dma_q) >=
(AL_ETH_DEFAULT_TX_HW_DESCS -
AL_ETH_TX_WAKEUP_THRESH)) {
tx_ring->stall = 0;
break;
}
pause("stall", 1);
}
if (a == XMIT_QUEUE_TIMEOUT) {
device_printf(tx_ring->dev,
"timeout waiting for queue %d ready!\n",
tx_ring->ring_id);
return;
} else {
device_printf_dbg(tx_ring->dev,
"queue %d is ready!\n", tx_ring->ring_id);
}
}
next_to_use = tx_ring->next_to_use;
tx_info = &tx_ring->tx_buffer_info[next_to_use];
tx_info->m = m;
hal_pkt = &tx_info->hal_pkt;
if (m == NULL) {
device_printf(tx_ring->dev, "mbuf is NULL\n");
return;
}
remap = TRUE;
/* Map packets for DMA */
retry:
error = bus_dmamap_load_mbuf_sg(tx_ring->dma_buf_tag, tx_info->dma_map,
m, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(error)) {
struct mbuf *m_new;
if (error == EFBIG) {
/* Defragment the mbuf and retry once */
if (remap == TRUE) {
remap = FALSE;
m_new = m_defrag(m, M_NOWAIT);
if (m_new == NULL) {
device_printf(tx_ring->dev,
"failed to defrag mbuf\n");
goto exit;
}
m = m_new;
goto retry;
} else {
device_printf(tx_ring->dev,
"failed to map mbuf, error %d\n", error);
goto exit;
}
} else {
device_printf(tx_ring->dev,
"failed to map mbuf, error %d\n", error);
goto exit;
}
}
/* set flags and meta data */
hal_pkt->flags = AL_ETH_TX_FLAGS_INT;
al_eth_tx_csum(tx_ring, tx_info, hal_pkt, m);
al_buf = hal_pkt->bufs;
for (a = 0; a < nsegs; a++) {
al_buf->addr = segs[a].ds_addr;
al_buf->len = segs[a].ds_len;
al_buf++;
}
hal_pkt->num_of_bufs = nsegs;
/* prepare the packet's descriptors for the dma engine */
tx_info->tx_descs = al_eth_tx_pkt_prepare(tx_ring->dma_q, hal_pkt);
if (tx_info->tx_descs == 0)
goto exit;
/*
* stop the queue when no more space available, the packet can have up
* to AL_ETH_PKT_MAX_BUFS + 1 buffers and a meta descriptor
*/
if (unlikely(al_udma_available_get(tx_ring->dma_q) <
(AL_ETH_PKT_MAX_BUFS + 2))) {
tx_ring->stall = 1;
device_printf_dbg(tx_ring->dev, "stall, stopping queue %d...\n",
tx_ring->ring_id);
al_data_memory_barrier();
}
tx_ring->next_to_use = AL_ETH_TX_RING_IDX_NEXT(tx_ring, next_to_use);
/* trigger the dma engine */
al_eth_tx_dma_action(tx_ring->dma_q, tx_info->tx_descs);
return;
exit:
m_freem(m);
}
static void
al_eth_tx_cmpl_work(void *arg, int pending)
{
struct al_eth_ring *tx_ring = arg;
if (napi != 0) {
tx_ring->cmpl_is_running = 1;
al_data_memory_barrier();
}
al_eth_tx_do_cleanup(tx_ring);
if (napi != 0) {
tx_ring->cmpl_is_running = 0;
al_data_memory_barrier();
}
/* all work done, enable IRQs */
al_eth_irq_config(tx_ring->unmask_reg_offset, tx_ring->unmask_val);
}
static int
al_eth_tx_cmlp_irq_filter(void *arg)
{
struct al_eth_ring *tx_ring = arg;
/* Interrupt should be auto-masked upon arrival */
device_printf_dbg(tx_ring->dev, "%s for ring ID = %d\n", __func__,
tx_ring->ring_id);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || (napi && tx_ring->cmpl_is_running == 0))
taskqueue_enqueue(tx_ring->cmpl_tq, &tx_ring->cmpl_task);
/* Do not run bottom half */
return (FILTER_HANDLED);
}
static int
al_eth_rx_recv_irq_filter(void *arg)
{
struct al_eth_ring *rx_ring = arg;
/* Interrupt should be auto-masked upon arrival */
device_printf_dbg(rx_ring->dev, "%s for ring ID = %d\n", __func__,
rx_ring->ring_id);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || (napi && rx_ring->enqueue_is_running == 0))
taskqueue_enqueue(rx_ring->enqueue_tq, &rx_ring->enqueue_task);
/* Do not run bottom half */
return (FILTER_HANDLED);
}
/*
* al_eth_rx_checksum - indicate in mbuf if hw indicated a good cksum
* @adapter: structure containing adapter specific data
* @hal_pkt: HAL structure for the packet
* @mbuf: mbuf currently being received and modified
*/
static inline void
al_eth_rx_checksum(struct al_eth_adapter *adapter,
struct al_eth_pkt *hal_pkt, struct mbuf *mbuf)
{
/* if IPv4 and error */
if (unlikely((adapter->netdev->if_capenable & IFCAP_RXCSUM) &&
(hal_pkt->l3_proto_idx == AL_ETH_PROTO_ID_IPv4) &&
(hal_pkt->flags & AL_ETH_RX_FLAGS_L3_CSUM_ERR))) {
device_printf(adapter->dev,"rx ipv4 header checksum error\n");
return;
}
/* if IPv6 and error */
if (unlikely((adapter->netdev->if_capenable & IFCAP_RXCSUM_IPV6) &&
(hal_pkt->l3_proto_idx == AL_ETH_PROTO_ID_IPv6) &&
(hal_pkt->flags & AL_ETH_RX_FLAGS_L3_CSUM_ERR))) {
device_printf(adapter->dev,"rx ipv6 header checksum error\n");
return;
}
/* if TCP/UDP */
if (likely((hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_TCP) ||
(hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_UDP))) {
if (unlikely(hal_pkt->flags & AL_ETH_RX_FLAGS_L4_CSUM_ERR)) {
device_printf_dbg(adapter->dev, "rx L4 checksum error\n");
/* TCP/UDP checksum error */
mbuf->m_pkthdr.csum_flags = 0;
} else {
device_printf_dbg(adapter->dev, "rx checksum correct\n");
/* IP Checksum Good */
mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID;
}
}
}
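/*
 * al_eth_rx_mbuf - turn a received HAL packet into an mbuf; packets up to
 * small_copy_len are copied into a fresh header mbuf so that the original
 * cluster stays in the ring and can be reused.
 */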
static struct mbuf*
al_eth_rx_mbuf(struct al_eth_adapter *adapter,
struct al_eth_ring *rx_ring, struct al_eth_pkt *hal_pkt,
unsigned int descs, uint16_t *next_to_clean)
{
struct mbuf *mbuf;
struct al_eth_rx_buffer *rx_info =
&rx_ring->rx_buffer_info[*next_to_clean];
unsigned int len;
len = hal_pkt->bufs[0].len;
device_printf_dbg(adapter->dev, "rx_info %p data %p\n", rx_info,
rx_info->m);
if (rx_info->m == NULL) {
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring,
*next_to_clean);
return (NULL);
}
mbuf = rx_info->m;
mbuf->m_pkthdr.len = len;
mbuf->m_len = len;
mbuf->m_pkthdr.rcvif = rx_ring->netdev;
mbuf->m_flags |= M_PKTHDR;
if (len <= adapter->small_copy_len) {
struct mbuf *smbuf;
device_printf_dbg(adapter->dev, "rx small packet. len %d\n", len);
AL_RX_LOCK(adapter);
smbuf = m_gethdr(M_NOWAIT, MT_DATA);
AL_RX_UNLOCK(adapter);
if (__predict_false(smbuf == NULL)) {
device_printf(adapter->dev, "smbuf is NULL\n");
return (NULL);
}
smbuf->m_data = smbuf->m_data + AL_IP_ALIGNMENT_OFFSET;
memcpy(smbuf->m_data, mbuf->m_data + AL_IP_ALIGNMENT_OFFSET, len);
smbuf->m_len = len;
smbuf->m_pkthdr.rcvif = rx_ring->netdev;
/* first desc of a non-ps chain */
smbuf->m_flags |= M_PKTHDR;
smbuf->m_pkthdr.len = smbuf->m_len;
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring,
*next_to_clean);
return (smbuf);
}
mbuf->m_data = mbuf->m_data + AL_IP_ALIGNMENT_OFFSET;
/* Unmap the buffer */
bus_dmamap_unload(rx_ring->dma_buf_tag, rx_info->dma_map);
rx_info->m = NULL;
*next_to_clean = AL_ETH_RX_RING_IDX_NEXT(rx_ring, *next_to_clean);
return (mbuf);
}
static void
al_eth_rx_recv_work(void *arg, int pending)
{
struct al_eth_ring *rx_ring = arg;
struct mbuf *mbuf;
struct lro_entry *queued;
unsigned int qid = rx_ring->ring_id;
struct al_eth_pkt *hal_pkt = &rx_ring->hal_pkt;
uint16_t next_to_clean = rx_ring->next_to_clean;
uint32_t refill_required;
uint32_t refill_actual;
uint32_t do_if_input;
if (napi != 0) {
rx_ring->enqueue_is_running = 1;
al_data_memory_barrier();
}
do {
unsigned int descs;
descs = al_eth_pkt_rx(rx_ring->dma_q, hal_pkt);
if (unlikely(descs == 0))
break;
device_printf_dbg(rx_ring->dev, "rx_poll: q %d got packet "
"from hal. descs %d\n", qid, descs);
device_printf_dbg(rx_ring->dev, "rx_poll: q %d flags %x. "
"l3 proto %d l4 proto %d\n", qid, hal_pkt->flags,
hal_pkt->l3_proto_idx, hal_pkt->l4_proto_idx);
/* skip the packet if dma or eth controller errors were detected */
if ((hal_pkt->flags & (AL_ETH_RX_ERROR |
AL_UDMA_CDESC_ERROR)) != 0) {
device_printf(rx_ring->dev, "receive packet with error. "
"flags = 0x%x\n", hal_pkt->flags);
next_to_clean = AL_ETH_RX_RING_IDX_ADD(rx_ring,
next_to_clean, descs);
continue;
}
/* allocate mbuf and fill it */
mbuf = al_eth_rx_mbuf(rx_ring->adapter, rx_ring, hal_pkt, descs,
&next_to_clean);
/* exit if we failed to retrieve a buffer */
if (unlikely(mbuf == NULL)) {
next_to_clean = AL_ETH_RX_RING_IDX_ADD(rx_ring,
next_to_clean, descs);
break;
}
if (__predict_true(rx_ring->netdev->if_capenable & IFCAP_RXCSUM ||
rx_ring->netdev->if_capenable & IFCAP_RXCSUM_IPV6)) {
al_eth_rx_checksum(rx_ring->adapter, hal_pkt, mbuf);
}
#if __FreeBSD_version >= 800000
mbuf->m_pkthdr.flowid = qid;
M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE);
#endif
/*
* LRO is done only for IP/TCP packets whose checksum has been
* verified by the hardware.
*/
do_if_input = 1;
if ((rx_ring->lro_enabled != 0) &&
((mbuf->m_pkthdr.csum_flags & CSUM_IP_VALID) != 0) &&
hal_pkt->l4_proto_idx == AL_ETH_PROTO_ID_TCP) {
/*
* Send to the stack if:
* - LRO not enabled, or
* - no LRO resources, or
* - lro enqueue fails
*/
if (rx_ring->lro.lro_cnt != 0) {
if (tcp_lro_rx(&rx_ring->lro, mbuf, 0) == 0)
do_if_input = 0;
}
}
if (do_if_input)
(*rx_ring->netdev->if_input)(rx_ring->netdev, mbuf);
} while (1);
rx_ring->next_to_clean = next_to_clean;
refill_required = al_udma_available_get(rx_ring->dma_q);
refill_actual = al_eth_refill_rx_bufs(rx_ring->adapter, qid,
refill_required);
if (unlikely(refill_actual < refill_required)) {
device_printf_dbg(rx_ring->dev,
"%s: not filling rx queue %d\n", __func__, qid);
}
while (((queued = LIST_FIRST(&rx_ring->lro.lro_active)) != NULL)) {
LIST_REMOVE(queued, next);
tcp_lro_flush(&rx_ring->lro, queued);
}
if (napi != 0) {
rx_ring->enqueue_is_running = 0;
al_data_memory_barrier();
}
/* unmask irq */
al_eth_irq_config(rx_ring->unmask_reg_offset, rx_ring->unmask_val);
}
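/*
 * al_eth_start_xmit - enqueue taskqueue handler: drain the queue's buf_ring
 * and transmit every dequeued mbuf; with napi enabled the ring is drained
 * once more after clearing enqueue_is_running.
 */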
static void
al_eth_start_xmit(void *arg, int pending)
{
struct al_eth_ring *tx_ring = arg;
struct mbuf *mbuf;
if (napi != 0) {
tx_ring->enqueue_is_running = 1;
al_data_memory_barrier();
}
while (1) {
mtx_lock(&tx_ring->br_mtx);
mbuf = drbr_dequeue(NULL, tx_ring->br);
mtx_unlock(&tx_ring->br_mtx);
if (mbuf == NULL)
break;
al_eth_xmit_mbuf(tx_ring, mbuf);
}
if (napi != 0) {
tx_ring->enqueue_is_running = 0;
al_data_memory_barrier();
while (1) {
mtx_lock(&tx_ring->br_mtx);
mbuf = drbr_dequeue(NULL, tx_ring->br);
mtx_unlock(&tx_ring->br_mtx);
if (mbuf == NULL)
break;
al_eth_xmit_mbuf(tx_ring, mbuf);
}
}
}
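/*
 * al_mq_start - if_transmit method: pick a TX queue from the mbuf flow id
 * (or the current CPU), enqueue the mbuf on that queue's buf_ring and kick
 * the enqueue task.
 */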
static int
al_mq_start(struct ifnet *ifp, struct mbuf *m)
{
struct al_eth_adapter *adapter = ifp->if_softc;
struct al_eth_ring *tx_ring;
int i;
int ret;
/* Which queue to use */
if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
i = m->m_pkthdr.flowid % adapter->num_tx_queues;
else
i = curcpu % adapter->num_tx_queues;
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) {
return (EFAULT);
}
tx_ring = &adapter->tx_ring[i];
device_printf_dbg(adapter->dev, "dgb start() - assuming link is active, "
"sending packet to queue %d\n", i);
ret = drbr_enqueue(ifp, tx_ring->br, m);
/*
* With napi, schedule the work only if it is not already running.
* Always schedule for ordinary (non-napi) packet handling.
*/
if ((napi == 0) || ((napi != 0) && (tx_ring->enqueue_is_running == 0)))
taskqueue_enqueue(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
return (ret);
}
static void
al_qflush(struct ifnet * ifp)
{
/* unused */
}
static inline void
al_eth_flow_ctrl_init(struct al_eth_adapter *adapter)
{
uint8_t default_flow_ctrl;
default_flow_ctrl = AL_ETH_FLOW_CTRL_TX_PAUSE;
default_flow_ctrl |= AL_ETH_FLOW_CTRL_RX_PAUSE;
adapter->link_config.flow_ctrl_supported = default_flow_ctrl;
}
static int
al_eth_flow_ctrl_config(struct al_eth_adapter *adapter)
{
struct al_eth_flow_control_params *flow_ctrl_params;
uint8_t active = adapter->link_config.flow_ctrl_active;
int i;
flow_ctrl_params = &adapter->flow_ctrl_params;
flow_ctrl_params->type = AL_ETH_FLOW_CONTROL_TYPE_LINK_PAUSE;
flow_ctrl_params->obay_enable =
((active & AL_ETH_FLOW_CTRL_RX_PAUSE) != 0);
flow_ctrl_params->gen_enable =
((active & AL_ETH_FLOW_CTRL_TX_PAUSE) != 0);
flow_ctrl_params->rx_fifo_th_high = AL_ETH_FLOW_CTRL_RX_FIFO_TH_HIGH;
flow_ctrl_params->rx_fifo_th_low = AL_ETH_FLOW_CTRL_RX_FIFO_TH_LOW;
flow_ctrl_params->quanta = AL_ETH_FLOW_CTRL_QUANTA;
flow_ctrl_params->quanta_th = AL_ETH_FLOW_CTRL_QUANTA_TH;
/* map priority to queue index, queue id = priority/2 */
for (i = 0; i < AL_ETH_FWD_PRIO_TABLE_NUM; i++)
flow_ctrl_params->prio_q_map[0][i] = 1 << (i >> 1);
al_eth_flow_control_config(&adapter->hal_adapter, flow_ctrl_params);
return (0);
}
static void
al_eth_flow_ctrl_enable(struct al_eth_adapter *adapter)
{
/*
* Set the active flow control configuration to the default (or the
* user-forced value) and apply it.
*/
adapter->link_config.flow_ctrl_active =
adapter->link_config.flow_ctrl_supported;
al_eth_flow_ctrl_config(adapter);
}
static void
al_eth_flow_ctrl_disable(struct al_eth_adapter *adapter)
{
adapter->link_config.flow_ctrl_active = 0;
al_eth_flow_ctrl_config(adapter);
}
static int
al_eth_hw_init(struct al_eth_adapter *adapter)
{
int rc;
rc = al_eth_hw_init_adapter(adapter);
if (rc != 0)
return (rc);
rc = al_eth_mac_config(&adapter->hal_adapter, adapter->mac_mode);
if (rc < 0) {
device_printf(adapter->dev, "%s failed to configure mac!\n",
__func__);
return (rc);
}
if ((adapter->mac_mode == AL_ETH_MAC_MODE_SGMII) ||
(adapter->mac_mode == AL_ETH_MAC_MODE_RGMII &&
adapter->phy_exist == FALSE)) {
rc = al_eth_mac_link_config(&adapter->hal_adapter,
adapter->link_config.force_1000_base_x,
adapter->link_config.autoneg,
adapter->link_config.active_speed,
adapter->link_config.active_duplex);
if (rc != 0) {
device_printf(adapter->dev,
"%s failed to configure link parameters!\n",
__func__);
return (rc);
}
}
rc = al_eth_mdio_config(&adapter->hal_adapter,
AL_ETH_MDIO_TYPE_CLAUSE_22, TRUE /* shared_mdio_if */,
adapter->ref_clk_freq, adapter->mdio_freq);
if (rc != 0) {
device_printf(adapter->dev, "%s failed at mdio config!\n",
__func__);
return (rc);
}
al_eth_flow_ctrl_init(adapter);
return (rc);
}
static int
al_eth_hw_stop(struct al_eth_adapter *adapter)
{
al_eth_mac_stop(&adapter->hal_adapter);
/*
* Wait until pending rx packets are written and the UDMA becomes
* idle; the MAC has a ~10KB fifo, so 10us should be enough time for
* the UDMA to write them to memory.
*/
DELAY(10);
al_eth_adapter_stop(&adapter->hal_adapter);
adapter->flags |= AL_ETH_FLAG_RESET_REQUESTED;
/* disable flow ctrl to avoid pause packets */
al_eth_flow_ctrl_disable(adapter);
return (0);
}
/*
* al_eth_intr_intx_all - Legacy Interrupt Handler for all interrupts
* @irq: interrupt number
* @data: pointer to a network interface device structure
*/
static int
al_eth_intr_intx_all(void *data)
{
struct al_eth_adapter *adapter = data;
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
uint32_t reg;
reg = al_udma_iofic_read_cause(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A);
if (likely(reg))
device_printf_dbg(adapter->dev, "%s group A cause %x\n",
__func__, reg);
if (unlikely(reg & AL_INT_GROUP_A_GROUP_D_SUM)) {
struct al_iofic_grp_ctrl __iomem *sec_ints_base;
uint32_t cause_d = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_D);
sec_ints_base =
&regs_base->gen.interrupt_regs.secondary_iofic_ctrl[0];
if (cause_d != 0) {
device_printf_dbg(adapter->dev,
"got interrupt from group D. cause %x\n", cause_d);
cause_d = al_iofic_read_cause(sec_ints_base,
AL_INT_GROUP_A);
device_printf(adapter->dev,
"secondary A cause %x\n", cause_d);
cause_d = al_iofic_read_cause(sec_ints_base,
AL_INT_GROUP_B);
device_printf_dbg(adapter->dev,
"secondary B cause %x\n", cause_d);
}
}
if ((reg & AL_INT_GROUP_A_GROUP_B_SUM) != 0 ) {
uint32_t cause_b = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_B);
int qid;
device_printf_dbg(adapter->dev, "secondary B cause %x\n",
cause_b);
for (qid = 0; qid < adapter->num_rx_queues; qid++) {
if (cause_b & (1 << qid)) {
/* mask */
al_udma_iofic_mask(
(struct unit_regs __iomem *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, 1 << qid);
}
}
}
if ((reg & AL_INT_GROUP_A_GROUP_C_SUM) != 0) {
uint32_t cause_c = al_udma_iofic_read_cause(regs_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY, AL_INT_GROUP_C);
int qid;
device_printf_dbg(adapter->dev, "secondary C cause %x\n", cause_c);
for (qid = 0; qid < adapter->num_tx_queues; qid++) {
if ((cause_c & (1 << qid)) != 0) {
al_udma_iofic_mask(
(struct unit_regs __iomem *)adapter->udma_base,
AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, 1 << qid);
}
}
}
al_eth_tx_cmlp_irq_filter(adapter->tx_ring);
return (0);
}
static int
al_eth_intr_msix_all(void *data)
{
struct al_eth_adapter *adapter = data;
device_printf_dbg(adapter->dev, "%s\n", __func__);
return (0);
}
static int
al_eth_intr_msix_mgmt(void *data)
{
struct al_eth_adapter *adapter = data;
device_printf_dbg(adapter->dev, "%s\n", __func__);
return (0);
}
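/*
 * al_eth_enable_msix - allocate MSI-X vectors: one management vector plus
 * one per RX queue and one per TX queue, using MSI-X table entries starting
 * at 2.
 */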
static int
al_eth_enable_msix(struct al_eth_adapter *adapter)
{
int i, msix_vecs, rc, count;
device_printf_dbg(adapter->dev, "%s\n", __func__);
msix_vecs = 1 + adapter->num_rx_queues + adapter->num_tx_queues;
device_printf_dbg(adapter->dev,
"Try to enable MSIX, vector numbers = %d\n", msix_vecs);
adapter->msix_entries = malloc(msix_vecs*sizeof(*adapter->msix_entries),
M_IFAL, M_ZERO | M_WAITOK);
if (adapter->msix_entries == NULL) {
device_printf_dbg(adapter->dev, "failed to allocate"
" msix_entries %d\n", msix_vecs);
rc = ENOMEM;
goto exit;
}
/* management vector (GROUP_A) @2 */
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].entry = 2;
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector = 0;
/* rx queues start @3 */
for (i = 0; i < adapter->num_rx_queues; i++) {
int irq_idx = AL_ETH_RXQ_IRQ_IDX(adapter, i);
adapter->msix_entries[irq_idx].entry = 3 + i;
adapter->msix_entries[irq_idx].vector = 0;
}
/* tx queues start @7 */
for (i = 0; i < adapter->num_tx_queues; i++) {
int irq_idx = AL_ETH_TXQ_IRQ_IDX(adapter, i);
adapter->msix_entries[irq_idx].entry = 3 +
AL_ETH_MAX_HW_QUEUES + i;
adapter->msix_entries[irq_idx].vector = 0;
}
count = msix_vecs + 2; /* entries start from 2 */
rc = pci_alloc_msix(adapter->dev, &count);
if (rc != 0) {
device_printf_dbg(adapter->dev, "failed to allocate MSIX "
"vectors %d\n", msix_vecs+2);
device_printf_dbg(adapter->dev, "ret = %d\n", rc);
goto msix_entries_exit;
}
if (count != msix_vecs + 2) {
device_printf_dbg(adapter->dev, "failed to allocate all MSIX "
"vectors %d, allocated %d\n", msix_vecs+2, count);
rc = ENOSPC;
goto msix_entries_exit;
}
for (i = 0; i < msix_vecs; i++)
adapter->msix_entries[i].vector = 2 + 1 + i;
device_printf_dbg(adapter->dev, "successfully enabled MSIX,"
" vectors %d\n", msix_vecs);
adapter->msix_vecs = msix_vecs;
adapter->flags |= AL_ETH_FLAG_MSIX_ENABLED;
goto exit;
msix_entries_exit:
adapter->msix_vecs = 0;
free(adapter->msix_entries, M_IFAL);
adapter->msix_entries = NULL;
exit:
return (rc);
}
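/*
 * al_eth_setup_int_mode - fill the irq table according to how many MSI-X
 * vectors were obtained: per-queue MSI-X, a single shared MSI-X vector, or
 * the legacy INTx fallback.
 */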
static int
al_eth_setup_int_mode(struct al_eth_adapter *adapter)
{
int i, rc;
rc = al_eth_enable_msix(adapter);
if (rc != 0) {
device_printf(adapter->dev, "Failed to enable MSIX mode.\n");
return (rc);
}
adapter->irq_vecs = max(1, adapter->msix_vecs);
/* single INTX mode */
if (adapter->msix_vecs == 0) {
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name,
AL_ETH_IRQNAME_SIZE, "al-eth-intx-all@pci:%s",
device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler =
al_eth_intr_intx_all;
/* IRQ vector will be resolved from device resources */
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector = 0;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
device_printf(adapter->dev, "%s and vector %d \n", __func__,
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector);
return (0);
}
/* single MSI-X mode */
if (adapter->msix_vecs == 1) {
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name,
AL_ETH_IRQNAME_SIZE, "al-eth-msix-all@pci:%s",
device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler =
al_eth_intr_msix_all;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector =
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
return (0);
}
/* MSI-X per queue */
snprintf(adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].name, AL_ETH_IRQNAME_SIZE,
"al-eth-msix-mgmt@pci:%s", device_get_name(adapter->dev));
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].handler = al_eth_intr_msix_mgmt;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].data = adapter;
adapter->irq_tbl[AL_ETH_MGMT_IRQ_IDX].vector =
adapter->msix_entries[AL_ETH_MGMT_IRQ_IDX].vector;
for (i = 0; i < adapter->num_rx_queues; i++) {
int irq_idx = AL_ETH_RXQ_IRQ_IDX(adapter, i);
snprintf(adapter->irq_tbl[irq_idx].name, AL_ETH_IRQNAME_SIZE,
"al-eth-rx-comp-%d@pci:%s", i,
device_get_name(adapter->dev));
adapter->irq_tbl[irq_idx].handler = al_eth_rx_recv_irq_filter;
adapter->irq_tbl[irq_idx].data = &adapter->rx_ring[i];
adapter->irq_tbl[irq_idx].vector =
adapter->msix_entries[irq_idx].vector;
}
for (i = 0; i < adapter->num_tx_queues; i++) {
int irq_idx = AL_ETH_TXQ_IRQ_IDX(adapter, i);
snprintf(adapter->irq_tbl[irq_idx].name,
AL_ETH_IRQNAME_SIZE, "al-eth-tx-comp-%d@pci:%s", i,
device_get_name(adapter->dev));
adapter->irq_tbl[irq_idx].handler = al_eth_tx_cmlp_irq_filter;
adapter->irq_tbl[irq_idx].data = &adapter->tx_ring[i];
adapter->irq_tbl[irq_idx].vector =
adapter->msix_entries[irq_idx].vector;
}
return (0);
}
static void
__al_eth_free_irq(struct al_eth_adapter *adapter)
{
struct al_eth_irq *irq;
int i, rc;
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->requested != 0) {
device_printf_dbg(adapter->dev, "tear down irq: %d\n",
irq->vector);
rc = bus_teardown_intr(adapter->dev, irq->res,
irq->cookie);
if (rc != 0)
device_printf(adapter->dev, "failed to tear "
"down irq: %d\n", irq->vector);
}
irq->requested = 0;
}
}
static void
al_eth_free_irq(struct al_eth_adapter *adapter)
{
struct al_eth_irq *irq;
int i, rc;
#ifdef CONFIG_RFS_ACCEL
if (adapter->msix_vecs >= 1) {
free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
adapter->netdev->rx_cpu_rmap = NULL;
}
#endif
__al_eth_free_irq(adapter);
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->res == NULL)
continue;
device_printf_dbg(adapter->dev, "release resource irq: %d\n",
irq->vector);
rc = bus_release_resource(adapter->dev, SYS_RES_IRQ, irq->vector,
irq->res);
irq->res = NULL;
if (rc != 0)
device_printf(adapter->dev, "dev has no parent while "
"releasing res for irq: %d\n", irq->vector);
}
pci_release_msi(adapter->dev);
adapter->flags &= ~AL_ETH_FLAG_MSIX_ENABLED;
adapter->msix_vecs = 0;
free(adapter->msix_entries, M_IFAL);
adapter->msix_entries = NULL;
}
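/*
 * al_eth_request_irq - allocate and set up every interrupt described in the
 * irq table, rolling back already-registered handlers and resources on
 * failure.
 */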
static int
al_eth_request_irq(struct al_eth_adapter *adapter)
{
unsigned long flags;
struct al_eth_irq *irq;
int rc = 0, i, v;
if ((adapter->flags & AL_ETH_FLAG_MSIX_ENABLED) != 0)
flags = RF_ACTIVE;
else
flags = RF_ACTIVE | RF_SHAREABLE;
for (i = 0; i < adapter->irq_vecs; i++) {
irq = &adapter->irq_tbl[i];
if (irq->requested != 0)
continue;
irq->res = bus_alloc_resource_any(adapter->dev, SYS_RES_IRQ,
&irq->vector, flags);
if (irq->res == NULL) {
device_printf(adapter->dev, "could not allocate "
"irq vector=%d\n", irq->vector);
rc = ENXIO;
goto exit_res;
}
if ((rc = bus_setup_intr(adapter->dev, irq->res,
INTR_TYPE_NET | INTR_MPSAFE, irq->handler,
NULL, irq->data, &irq->cookie)) != 0) {
device_printf(adapter->dev, "failed to register "
"interrupt handler for irq %ju: %d\n",
(uintmax_t)rman_get_start(irq->res), rc);
goto exit_intr;
}
irq->requested = 1;
}
goto exit;
exit_intr:
/* start from i - 1 because we omit the operation that failed */
for (v = i - 1; v >= 0; v--) {
int bti;
irq = &adapter->irq_tbl[v];
bti = bus_teardown_intr(adapter->dev, irq->res, irq->cookie);
if (bti != 0) {
device_printf(adapter->dev, "failed to tear "
"down irq: %d\n", irq->vector);
}
irq->requested = 0;
device_printf_dbg(adapter->dev, "exit_intr: releasing irq %d\n",
irq->vector);
}
exit_res:
/* start from i - 1 because we omit the operation that failed */
for (v = i - 1; v >= 0; v--) {
int brr;
irq = &adapter->irq_tbl[v];
device_printf_dbg(adapter->dev, "exit_res: releasing resource"
" for irq %d\n", irq->vector);
brr = bus_release_resource(adapter->dev, SYS_RES_IRQ,
irq->vector, irq->res);
if (brr != 0)
device_printf(adapter->dev, "dev has no parent while "
"releasing res for irq: %d\n", irq->vector);
irq->res = NULL;
}
exit:
return (rc);
}
/**
* al_eth_setup_tx_resources - allocate Tx resources (Descriptors)
* @adapter: network interface device structure
* @qid: queue index
*
* Return 0 on success, or an errno on failure
**/
static int
al_eth_setup_tx_resources(struct al_eth_adapter *adapter, int qid)
{
struct al_eth_ring *tx_ring = &adapter->tx_ring[qid];
struct device *dev = tx_ring->dev;
struct al_udma_q_params *q_params = &tx_ring->q_params;
int size;
int ret;
if (adapter->up)
return (0);
size = sizeof(struct al_eth_tx_buffer) * tx_ring->sw_count;
tx_ring->tx_buffer_info = malloc(size, M_IFAL, M_ZERO | M_WAITOK);
if (tx_ring->tx_buffer_info == NULL)
return (ENOMEM);
tx_ring->descs_size = tx_ring->hw_count * sizeof(union al_udma_desc);
q_params->size = tx_ring->hw_count;
ret = al_dma_alloc_coherent(dev, &q_params->desc_phy_base_tag,
(bus_dmamap_t *)&q_params->desc_phy_base_map,
(bus_addr_t *)&q_params->desc_phy_base,
(void**)&q_params->desc_base, tx_ring->descs_size);
if (ret != 0) {
device_printf(dev, "failed to al_dma_alloc_coherent,"
" ret = %d\n", ret);
return (ENOMEM);
}
if (q_params->desc_base == NULL)
return (ENOMEM);
device_printf_dbg(dev, "Initializing ring queues %d\n", qid);
/* Allocate Ring Queue */
mtx_init(&tx_ring->br_mtx, "AlRingMtx", NULL, MTX_DEF);
tx_ring->br = buf_ring_alloc(AL_BR_SIZE, M_DEVBUF, M_WAITOK,
&tx_ring->br_mtx);
if (tx_ring->br == NULL) {
device_printf(dev, "Critical Failure setting up buf ring\n");
return (ENOMEM);
}
/* Allocate taskqueues */
TASK_INIT(&tx_ring->enqueue_task, 0, al_eth_start_xmit, tx_ring);
tx_ring->enqueue_tq = taskqueue_create_fast("al_tx_enque", M_NOWAIT,
taskqueue_thread_enqueue, &tx_ring->enqueue_tq);
taskqueue_start_threads(&tx_ring->enqueue_tq, 1, PI_NET, "%s txeq",
device_get_nameunit(adapter->dev));
TASK_INIT(&tx_ring->cmpl_task, 0, al_eth_tx_cmpl_work, tx_ring);
tx_ring->cmpl_tq = taskqueue_create_fast("al_tx_cmpl", M_NOWAIT,
taskqueue_thread_enqueue, &tx_ring->cmpl_tq);
taskqueue_start_threads(&tx_ring->cmpl_tq, 1, PI_REALTIME, "%s txcq",
device_get_nameunit(adapter->dev));
/* Setup DMA descriptor areas. */
ret = bus_dma_tag_create(bus_get_dma_tag(dev),
1, 0, /* alignment, bounds */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
AL_TSO_SIZE, /* maxsize */
AL_ETH_PKT_MAX_BUFS, /* nsegments */
PAGE_SIZE, /* maxsegsize */
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
&tx_ring->dma_buf_tag);
if (ret != 0) {
device_printf(dev,"Unable to allocate dma_buf_tag, ret = %d\n",
ret);
return (ret);
}
for (size = 0; size < tx_ring->sw_count; size++) {
ret = bus_dmamap_create(tx_ring->dma_buf_tag, 0,
&tx_ring->tx_buffer_info[size].dma_map);
if (ret != 0) {
device_printf(dev, "Unable to map DMA TX "
"buffer memory [iter=%d]\n", size);
return (ret);
}
}
/* completion queue not used for tx */
q_params->cdesc_base = NULL;
/* size in bytes of the udma completion ring descriptor */
q_params->cdesc_size = 8;
tx_ring->next_to_use = 0;
tx_ring->next_to_clean = 0;
return (0);
}
/*
* al_eth_free_tx_resources - Free Tx Resources per Queue
* @adapter: network interface device structure
* @qid: queue index
*
* Free all transmit software resources
*/
static void
al_eth_free_tx_resources(struct al_eth_adapter *adapter, int qid)
{
struct al_eth_ring *tx_ring = &adapter->tx_ring[qid];
struct al_udma_q_params *q_params = &tx_ring->q_params;
int size;
/* At this point the interrupt handlers must already be deactivated */
while (taskqueue_cancel(tx_ring->cmpl_tq, &tx_ring->cmpl_task, NULL))
taskqueue_drain(tx_ring->cmpl_tq, &tx_ring->cmpl_task);
taskqueue_free(tx_ring->cmpl_tq);
while (taskqueue_cancel(tx_ring->enqueue_tq,
&tx_ring->enqueue_task, NULL)) {
taskqueue_drain(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
}
taskqueue_free(tx_ring->enqueue_tq);
if (tx_ring->br != NULL) {
drbr_flush(adapter->netdev, tx_ring->br);
buf_ring_free(tx_ring->br, M_DEVBUF);
}
for (size = 0; size < tx_ring->sw_count; size++) {
m_freem(tx_ring->tx_buffer_info[size].m);
tx_ring->tx_buffer_info[size].m = NULL;
bus_dmamap_unload(tx_ring->dma_buf_tag,
tx_ring->tx_buffer_info[size].dma_map);
bus_dmamap_destroy(tx_ring->dma_buf_tag,
tx_ring->tx_buffer_info[size].dma_map);
}
bus_dma_tag_destroy(tx_ring->dma_buf_tag);
free(tx_ring->tx_buffer_info, M_IFAL);
tx_ring->tx_buffer_info = NULL;
mtx_destroy(&tx_ring->br_mtx);
/* if not set, then don't free */
if (q_params->desc_base == NULL)
return;
al_dma_free_coherent(q_params->desc_phy_base_tag,
q_params->desc_phy_base_map, q_params->desc_base);
q_params->desc_base = NULL;
}
/*
* al_eth_free_all_tx_resources - Free Tx Resources for All Queues
* @adapter: board private structure
*
* Free all transmit software resources
*/
static void
al_eth_free_all_tx_resources(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++)
if (adapter->tx_ring[i].q_params.desc_base)
al_eth_free_tx_resources(adapter, i);
}
/*
* al_eth_setup_rx_resources - allocate Rx resources (Descriptors)
* @adapter: network interface device structure
* @qid: queue index
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_rx_resources(struct al_eth_adapter *adapter, unsigned int qid)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
struct device *dev = rx_ring->dev;
struct al_udma_q_params *q_params = &rx_ring->q_params;
int size;
int ret;
size = sizeof(struct al_eth_rx_buffer) * rx_ring->sw_count;
/* alloc extra element so in rx path we can always prefetch rx_info + 1 */
size += 1;
rx_ring->rx_buffer_info = malloc(size, M_IFAL, M_ZERO | M_WAITOK);
if (rx_ring->rx_buffer_info == NULL)
return (ENOMEM);
rx_ring->descs_size = rx_ring->hw_count * sizeof(union al_udma_desc);
q_params->size = rx_ring->hw_count;
ret = al_dma_alloc_coherent(dev, &q_params->desc_phy_base_tag,
&q_params->desc_phy_base_map,
(bus_addr_t *)&q_params->desc_phy_base,
(void**)&q_params->desc_base, rx_ring->descs_size);
if ((q_params->desc_base == NULL) || (ret != 0))
return (ENOMEM);
/* size in bytes of the udma completion ring descriptor */
q_params->cdesc_size = 16;
rx_ring->cdescs_size = rx_ring->hw_count * q_params->cdesc_size;
ret = al_dma_alloc_coherent(dev, &q_params->cdesc_phy_base_tag,
&q_params->cdesc_phy_base_map,
(bus_addr_t *)&q_params->cdesc_phy_base,
(void**)&q_params->cdesc_base, rx_ring->cdescs_size);
if ((q_params->cdesc_base == NULL) || (ret != 0))
return (ENOMEM);
/* Allocate taskqueues */
TASK_INIT(&rx_ring->enqueue_task, 0, al_eth_rx_recv_work, rx_ring);
rx_ring->enqueue_tq = taskqueue_create_fast("al_rx_enque", M_NOWAIT,
taskqueue_thread_enqueue, &rx_ring->enqueue_tq);
taskqueue_start_threads(&rx_ring->enqueue_tq, 1, PI_NET, "%s rxeq",
device_get_nameunit(adapter->dev));
/* Setup DMA descriptor areas. */
ret = bus_dma_tag_create(bus_get_dma_tag(dev),
1, 0, /* alignment, bounds */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
AL_TSO_SIZE, /* maxsize */
1, /* nsegments */
AL_TSO_SIZE, /* maxsegsize */
0, /* flags */
NULL, /* lockfunc */
NULL, /* lockfuncarg */
&rx_ring->dma_buf_tag);
if (ret != 0) {
device_printf(dev,"Unable to allocate RX dma_buf_tag\n");
return (ret);
}
for (size = 0; size < rx_ring->sw_count; size++) {
ret = bus_dmamap_create(rx_ring->dma_buf_tag, 0,
&rx_ring->rx_buffer_info[size].dma_map);
if (ret != 0) {
device_printf(dev,"Unable to map DMA RX buffer memory\n");
return (ret);
}
}
/* Zero out the descriptor ring */
memset(q_params->cdesc_base, 0, rx_ring->cdescs_size);
/* Create LRO for the ring */
if ((adapter->netdev->if_capenable & IFCAP_LRO) != 0) {
int err = tcp_lro_init(&rx_ring->lro);
if (err != 0) {
device_printf(adapter->dev,
"LRO[%d] Initialization failed!\n", qid);
} else {
device_printf_dbg(adapter->dev,
"RX Soft LRO[%d] Initialized\n", qid);
rx_ring->lro_enabled = TRUE;
rx_ring->lro.ifp = adapter->netdev;
}
}
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
return (0);
}
/*
* al_eth_free_rx_resources - Free Rx Resources
* @adapter: network interface device structure
* @qid: queue index
*
* Free all receive software resources
*/
static void
al_eth_free_rx_resources(struct al_eth_adapter *adapter, unsigned int qid)
{
struct al_eth_ring *rx_ring = &adapter->rx_ring[qid];
struct al_udma_q_params *q_params = &rx_ring->q_params;
int size;
/* At this point interrupt handlers must be deactivated */
while (taskqueue_cancel(rx_ring->enqueue_tq,
&rx_ring->enqueue_task, NULL)) {
taskqueue_drain(rx_ring->enqueue_tq, &rx_ring->enqueue_task);
}
taskqueue_free(rx_ring->enqueue_tq);
for (size = 0; size < rx_ring->sw_count; size++) {
m_freem(rx_ring->rx_buffer_info[size].m);
rx_ring->rx_buffer_info[size].m = NULL;
bus_dmamap_unload(rx_ring->dma_buf_tag,
rx_ring->rx_buffer_info[size].dma_map);
bus_dmamap_destroy(rx_ring->dma_buf_tag,
rx_ring->rx_buffer_info[size].dma_map);
}
bus_dma_tag_destroy(rx_ring->dma_buf_tag);
free(rx_ring->rx_buffer_info, M_IFAL);
rx_ring->rx_buffer_info = NULL;
/* if not set, then don't free */
if (q_params->desc_base == NULL)
return;
al_dma_free_coherent(q_params->desc_phy_base_tag,
q_params->desc_phy_base_map, q_params->desc_base);
q_params->desc_base = NULL;
/* if not set, then don't free */
if (q_params->cdesc_base == NULL)
return;
al_dma_free_coherent(q_params->cdesc_phy_base_tag,
q_params->cdesc_phy_base_map, q_params->cdesc_base);
q_params->cdesc_phy_base = 0;
/* Free LRO resources */
tcp_lro_free(&rx_ring->lro);
}
/*
* al_eth_free_all_rx_resources - Free Rx Resources for All Queues
* @adapter: board private structure
*
* Free all receive software resources
*/
static void
al_eth_free_all_rx_resources(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_rx_queues; i++)
if (adapter->rx_ring[i].q_params.desc_base != 0)
al_eth_free_rx_resources(adapter, i);
}
/*
* al_eth_setup_all_rx_resources - allocate all queues Rx resources
* @adapter: board private structure
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_all_rx_resources(struct al_eth_adapter *adapter)
{
int i, rc = 0;
for (i = 0; i < adapter->num_rx_queues; i++) {
rc = al_eth_setup_rx_resources(adapter, i);
if (rc == 0)
continue;
device_printf(adapter->dev, "Allocation for Rx Queue %u failed\n", i);
goto err_setup_rx;
}
return (0);
err_setup_rx:
/* rewind the index freeing the rings as we go */
while (i--)
al_eth_free_rx_resources(adapter, i);
return (rc);
}
/*
* al_eth_setup_all_tx_resources - allocate all queues Tx resources
* @adapter: private structure
*
* Returns 0 on success, errno on failure
*/
static int
al_eth_setup_all_tx_resources(struct al_eth_adapter *adapter)
{
int i, rc = 0;
for (i = 0; i < adapter->num_tx_queues; i++) {
rc = al_eth_setup_tx_resources(adapter, i);
if (rc == 0)
continue;
device_printf(adapter->dev,
"Allocation for Tx Queue %u failed\n", i);
goto err_setup_tx;
}
return (0);
err_setup_tx:
/* rewind the index freeing the rings as we go */
while (i--)
al_eth_free_tx_resources(adapter, i);
return (rc);
}
static void
al_eth_disable_int_sync(struct al_eth_adapter *adapter)
{
/* disable forwarding interrupts from eth through pci end point */
if ((adapter->board_type == ALPINE_FPGA_NIC) ||
(adapter->board_type == ALPINE_NIC)) {
al_eth_forward_int_config((uint32_t*)adapter->internal_pcie_base +
AL_REG_OFFSET_FORWARD_INTR, AL_DIS_FORWARD_INTR);
}
/* mask hw interrupts */
al_eth_interrupts_mask(adapter);
}
static void
al_eth_interrupts_unmask(struct al_eth_adapter *adapter)
{
uint32_t group_a_mask = AL_INT_GROUP_A_GROUP_D_SUM; /* enable group D summary */
uint32_t group_b_mask = (1 << adapter->num_rx_queues) - 1; /* bit per Rx q */
uint32_t group_c_mask = (1 << adapter->num_tx_queues) - 1; /* bit per Tx q */
uint32_t group_d_mask = 3 << 8;
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
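/* In legacy (INTx) mode the group B/C/D interrupts are summarized into group A. */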
if (adapter->int_mode == AL_IOFIC_MODE_LEGACY)
group_a_mask |= AL_INT_GROUP_A_GROUP_B_SUM |
AL_INT_GROUP_A_GROUP_C_SUM |
AL_INT_GROUP_A_GROUP_D_SUM;
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A, group_a_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, group_b_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, group_c_mask);
al_udma_iofic_unmask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_D, group_d_mask);
}
static void
al_eth_interrupts_mask(struct al_eth_adapter *adapter)
{
struct unit_regs __iomem *regs_base =
(struct unit_regs __iomem *)adapter->udma_base;
/* mask all interrupts */
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_A, AL_MASK_GROUP_A_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_B, AL_MASK_GROUP_B_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_C, AL_MASK_GROUP_C_INT);
al_udma_iofic_mask(regs_base, AL_UDMA_IOFIC_LEVEL_PRIMARY,
AL_INT_GROUP_D, AL_MASK_GROUP_D_INT);
}
static int
al_eth_configure_int_mode(struct al_eth_adapter *adapter)
{
enum al_iofic_mode int_mode;
uint32_t m2s_errors_disable = AL_M2S_MASK_INIT;
uint32_t m2s_aborts_disable = AL_M2S_MASK_INIT;
uint32_t s2m_errors_disable = AL_S2M_MASK_INIT;
uint32_t s2m_aborts_disable = AL_S2M_MASK_INIT;
/* single INTX mode */
if (adapter->msix_vecs == 0)
int_mode = AL_IOFIC_MODE_LEGACY;
else if (adapter->msix_vecs > 1)
int_mode = AL_IOFIC_MODE_MSIX_PER_Q;
else {
device_printf(adapter->dev,
"udma doesn't support single MSI-X mode yet.\n");
return (EIO);
}
if (adapter->board_type != ALPINE_INTEGRATED) {
m2s_errors_disable |= AL_M2S_S2M_MASK_NOT_INT;
m2s_aborts_disable |= AL_M2S_S2M_MASK_NOT_INT;
s2m_errors_disable |= AL_M2S_S2M_MASK_NOT_INT;
s2m_aborts_disable |= AL_M2S_S2M_MASK_NOT_INT;
}
if (al_udma_iofic_config((struct unit_regs __iomem *)adapter->udma_base,
int_mode, m2s_errors_disable, m2s_aborts_disable,
s2m_errors_disable, s2m_aborts_disable)) {
device_printf(adapter->dev,
"al_udma_unit_int_config failed!.\n");
return (EIO);
}
adapter->int_mode = int_mode;
device_printf_dbg(adapter->dev, "using %s interrupt mode\n",
int_mode == AL_IOFIC_MODE_LEGACY ? "INTx" :
int_mode == AL_IOFIC_MODE_MSIX_PER_Q ? "MSI-X per Queue" : "Unknown");
/* set interrupt moderation resolution to 15us */
al_iofic_moder_res_config(&((struct unit_regs *)(adapter->udma_base))->gen.interrupt_regs.main_iofic, AL_INT_GROUP_B, 15);
al_iofic_moder_res_config(&((struct unit_regs *)(adapter->udma_base))->gen.interrupt_regs.main_iofic, AL_INT_GROUP_C, 15);
/* by default interrupt coalescing is disabled */
adapter->tx_usecs = 0;
adapter->rx_usecs = 0;
return (0);
}
/*
* ethtool_rxfh_indir_default - get default value for RX flow hash indirection
* @index: Index in RX flow hash indirection table
* @n_rx_rings: Number of RX rings to use
*
* This function provides the default policy for RX flow hash indirection.
*/
static inline uint32_t
ethtool_rxfh_indir_default(uint32_t index, uint32_t n_rx_rings)
{
return (index % n_rx_rings);
}
static void*
al_eth_update_stats(struct al_eth_adapter *adapter)
{
struct al_eth_mac_stats *mac_stats = &adapter->mac_stats;
if (adapter->up == 0)
return (NULL);
al_eth_mac_stats_get(&adapter->hal_adapter, mac_stats);
return (NULL);
}
static uint64_t
al_get_counter(struct ifnet *ifp, ift_counter cnt)
{
struct al_eth_adapter *adapter;
struct al_eth_mac_stats *mac_stats;
uint64_t rv;
adapter = if_getsoftc(ifp);
mac_stats = &adapter->mac_stats;
switch (cnt) {
case IFCOUNTER_IPACKETS:
return (mac_stats->aFramesReceivedOK); /* including pause frames */
case IFCOUNTER_OPACKETS:
return (mac_stats->aFramesTransmittedOK);
case IFCOUNTER_IBYTES:
return (mac_stats->aOctetsReceivedOK);
case IFCOUNTER_OBYTES:
return (mac_stats->aOctetsTransmittedOK);
case IFCOUNTER_IMCASTS:
return (mac_stats->ifInMulticastPkts);
case IFCOUNTER_OMCASTS:
return (mac_stats->ifOutMulticastPkts);
case IFCOUNTER_COLLISIONS:
return (0);
case IFCOUNTER_IQDROPS:
return (mac_stats->etherStatsDropEvents);
case IFCOUNTER_IERRORS:
rv = mac_stats->ifInErrors +
mac_stats->etherStatsUndersizePkts + /* good but short */
mac_stats->etherStatsFragments + /* short and bad*/
mac_stats->etherStatsJabbers + /* with crc errors */
mac_stats->etherStatsOversizePkts +
mac_stats->aFrameCheckSequenceErrors +
mac_stats->aAlignmentErrors;
return (rv);
case IFCOUNTER_OERRORS:
return (mac_stats->ifOutErrors);
default:
return (if_get_counter_default(ifp, cnt));
}
}
/*
* Unicast, Multicast and Promiscuous mode set
*
* The set_rx_mode entry point is called whenever the unicast or multicast
* address lists or the network interface flags are updated. This routine is
* responsible for configuring the hardware for proper unicast, multicast,
* promiscuous mode, and all-multi behavior.
*/
#define MAX_NUM_MULTICAST_ADDRESSES 32
#define MAX_NUM_ADDRESSES 32
static void
al_eth_set_rx_mode(struct al_eth_adapter *adapter)
{
struct ifnet *ifp = adapter->netdev;
struct ifmultiaddr *ifma; /* multicast addresses configured */
struct ifaddr *ifua; /* unicast address */
int mc = 0;
int uc = 0;
uint8_t i;
unsigned char *mac;
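/* Count the configured link-level multicast and unicast addresses. */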
if_maddr_rlock(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_LINK)
continue;
if (mc == MAX_NUM_MULTICAST_ADDRESSES)
break;
mac = LLADDR((struct sockaddr_dl *) ifma->ifma_addr);
/* default mc address inside mac address */
if (mac[3] != 0 && mac[4] != 0 && mac[5] != 1)
mc++;
}
if_maddr_runlock(ifp);
if_addr_rlock(ifp);
TAILQ_FOREACH(ifua, &ifp->if_addrhead, ifa_link) {
if (ifua->ifa_addr->sa_family != AF_LINK)
continue;
if (uc == MAX_NUM_ADDRESSES)
break;
uc++;
}
if_addr_runlock(ifp);
if ((ifp->if_flags & IFF_PROMISC) != 0) {
al_eth_mac_table_promiscuous_set(adapter, true);
} else {
if ((ifp->if_flags & IFF_ALLMULTI) != 0) {
/* This interface is in all-multicasts mode (used by multicast routers). */
al_eth_mac_table_all_multicast_add(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX, 1);
} else {
if (mc == 0) {
al_eth_mac_table_entry_clear(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX);
} else {
al_eth_mac_table_all_multicast_add(adapter,
AL_ETH_MAC_TABLE_ALL_MULTICAST_IDX, 1);
}
}
if (uc != 0) {
i = AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + 1;
if (uc > AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT) {
/*
* There are more addresses than entries in the
* MAC table - fall back to promiscuous mode.
*/
al_eth_mac_table_promiscuous_set(adapter, true);
return;
}
/* clear the last configuration */
while (i < (AL_ETH_MAC_TABLE_UNICAST_IDX_BASE +
AL_ETH_MAC_TABLE_UNICAST_MAX_COUNT)) {
al_eth_mac_table_entry_clear(adapter, i);
i++;
}
/* set new addresses */
i = AL_ETH_MAC_TABLE_UNICAST_IDX_BASE + 1;
if_addr_rlock(ifp);
TAILQ_FOREACH(ifua, &ifp->if_addrhead, ifa_link) {
if (ifua->ifa_addr->sa_family != AF_LINK) {
continue;
}
al_eth_mac_table_unicast_add(adapter, i,
(unsigned char *)ifua->ifa_addr, 1);
i++;
}
if_addr_runlock(ifp);
}
al_eth_mac_table_promiscuous_set(adapter, false);
}
}
static void
al_eth_config_rx_fwd(struct al_eth_adapter *adapter)
{
struct al_eth_fwd_ctrl_table_entry entry;
int i;
/* let priority be equal to pbits */
for (i = 0; i < AL_ETH_FWD_PBITS_TABLE_NUM; i++)
al_eth_fwd_pbits_table_set(&adapter->hal_adapter, i, i);
/* map priority to queue index, queue id = priority/2 */
for (i = 0; i < AL_ETH_FWD_PRIO_TABLE_NUM; i++)
al_eth_fwd_priority_table_set(&adapter->hal_adapter, i, i >> 1);
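/* Default control table entry: queue selected by the Toeplitz hash table, UDMA selected by the MAC table. */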
entry.prio_sel = AL_ETH_CTRL_TABLE_PRIO_SEL_VAL_0;
entry.queue_sel_1 = AL_ETH_CTRL_TABLE_QUEUE_SEL_1_THASH_TABLE;
entry.queue_sel_2 = AL_ETH_CTRL_TABLE_QUEUE_SEL_2_NO_PRIO;
entry.udma_sel = AL_ETH_CTRL_TABLE_UDMA_SEL_MAC_TABLE;
entry.filter = FALSE;
al_eth_ctrl_table_def_set(&adapter->hal_adapter, FALSE, &entry);
/*
* By default, set the MAC table to forward all unicast packets destined
* to our MAC address and all broadcast packets. Everything else is
* dropped.
*/
al_eth_mac_table_unicast_add(adapter, AL_ETH_MAC_TABLE_UNICAST_IDX_BASE,
adapter->mac_addr, 1);
al_eth_mac_table_broadcast_add(adapter, AL_ETH_MAC_TABLE_BROADCAST_IDX, 1);
al_eth_mac_table_promiscuous_set(adapter, false);
/* set toeplitz hash keys */
for (i = 0; i < sizeof(adapter->toeplitz_hash_key); i++)
*((uint8_t*)adapter->toeplitz_hash_key + i) = (uint8_t)random();
for (i = 0; i < AL_ETH_RX_HASH_KEY_NUM; i++)
al_eth_hash_key_set(&adapter->hal_adapter, i,
htonl(adapter->toeplitz_hash_key[i]));
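/* Fill the RSS indirection table with the default round-robin queue mapping. */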
for (i = 0; i < AL_ETH_RX_RSS_TABLE_SIZE; i++) {
adapter->rss_ind_tbl[i] = ethtool_rxfh_indir_default(i,
AL_ETH_NUM_QUEUES);
al_eth_set_thash_table_entry(adapter, i, 0,
adapter->rss_ind_tbl[i]);
}
al_eth_fsm_table_init(adapter);
}
static void
al_eth_req_rx_buff_size(struct al_eth_adapter *adapter, int size)
{
/*
* Determine the correct mbuf pool for the requested frame size
* (including jumbo frames), trying from the smallest size up to
* the maximum supported.
*/
adapter->rx_mbuf_sz = MCLBYTES;
if (size > 2048) {
if (adapter->max_rx_buff_alloc_size > 2048)
adapter->rx_mbuf_sz = MJUMPAGESIZE;
else
return;
}
if (size > 4096) {
if (adapter->max_rx_buff_alloc_size > 4096)
adapter->rx_mbuf_sz = MJUM9BYTES;
else
return;
}
if (size > 9216) {
if (adapter->max_rx_buff_alloc_size > 9216)
adapter->rx_mbuf_sz = MJUM16BYTES;
else
return;
}
}
static int
al_eth_change_mtu(struct al_eth_adapter *adapter, int new_mtu)
{
int max_frame = new_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN +
ETHER_VLAN_ENCAP_LEN;
al_eth_req_rx_buff_size(adapter, new_mtu);
device_printf_dbg(adapter->dev, "set MTU to %d\n", new_mtu);
al_eth_rx_pkt_limit_config(&adapter->hal_adapter,
AL_ETH_MIN_FRAME_LEN, max_frame);
al_eth_tso_mss_config(&adapter->hal_adapter, 0, new_mtu - 100);
return (0);
}
static int
al_eth_check_mtu(struct al_eth_adapter *adapter, int new_mtu)
{
int max_frame = new_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN;
if ((new_mtu < AL_ETH_MIN_FRAME_LEN) ||
(max_frame > AL_ETH_MAX_FRAME_LEN)) {
return (EINVAL);
}
return (0);
}
static int
al_eth_udma_queue_enable(struct al_eth_adapter *adapter, enum al_udma_type type,
int qid)
{
int rc = 0;
char *name = (type == UDMA_TX) ? "Tx" : "Rx";
struct al_udma_q_params *q_params;
if (type == UDMA_TX)
q_params = &adapter->tx_ring[qid].q_params;
else
q_params = &adapter->rx_ring[qid].q_params;
rc = al_eth_queue_config(&adapter->hal_adapter, type, qid, q_params);
if (rc < 0) {
device_printf(adapter->dev, "config %s queue %u failed\n", name,
qid);
return (rc);
}
return (rc);
}
static int
al_eth_udma_queues_enable_all(struct al_eth_adapter *adapter)
{
int i;
for (i = 0; i < adapter->num_tx_queues; i++)
al_eth_udma_queue_enable(adapter, UDMA_TX, i);
for (i = 0; i < adapter->num_rx_queues; i++)
al_eth_udma_queue_enable(adapter, UDMA_RX, i);
return (0);
}
static void
al_eth_up_complete(struct al_eth_adapter *adapter)
{
al_eth_configure_int_mode(adapter);
al_eth_config_rx_fwd(adapter);
al_eth_change_mtu(adapter, adapter->netdev->if_mtu);
al_eth_udma_queues_enable_all(adapter);
al_eth_refill_all_rx_bufs(adapter);
al_eth_interrupts_unmask(adapter);
/* enable forwarding interrupts from eth through pci end point */
if ((adapter->board_type == ALPINE_FPGA_NIC) ||
(adapter->board_type == ALPINE_NIC)) {
al_eth_forward_int_config((uint32_t*)adapter->internal_pcie_base +
AL_REG_OFFSET_FORWARD_INTR, AL_EN_FORWARD_INTR);
}
al_eth_flow_ctrl_enable(adapter);
mtx_lock(&adapter->stats_mtx);
callout_reset(&adapter->stats_callout, hz, al_tick_stats, (void*)adapter);
mtx_unlock(&adapter->stats_mtx);
al_eth_mac_start(&adapter->hal_adapter);
}
static int
al_media_update(struct ifnet *ifp)
{
struct al_eth_adapter *adapter = ifp->if_softc;
if ((ifp->if_flags & IFF_UP) != 0)
mii_mediachg(adapter->mii);
return (0);
}
static void
al_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
struct al_eth_adapter *sc = ifp->if_softc;
struct mii_data *mii;
if (sc->mii == NULL) {
ifmr->ifm_active = IFM_ETHER | IFM_NONE;
ifmr->ifm_status = 0;
return;
}
mii = sc->mii;
mii_pollstat(mii);
ifmr->ifm_active = mii->mii_media_active;
ifmr->ifm_status = mii->mii_media_status;
}
static void
al_tick(void *arg)
{
struct al_eth_adapter *adapter = arg;
mii_tick(adapter->mii);
/* Schedule another timeout one second from now */
callout_schedule(&adapter->wd_callout, hz);
}
static void
al_tick_stats(void *arg)
{
struct al_eth_adapter *adapter = arg;
al_eth_update_stats(adapter);
callout_schedule(&adapter->stats_callout, hz);
}
static int
al_eth_up(struct al_eth_adapter *adapter)
{
struct ifnet *ifp = adapter->netdev;
int rc;
if (adapter->up)
return (0);
if ((adapter->flags & AL_ETH_FLAG_RESET_REQUESTED) != 0) {
al_eth_function_reset(adapter);
adapter->flags &= ~AL_ETH_FLAG_RESET_REQUESTED;
}
ifp->if_hwassist = 0;
if ((ifp->if_capenable & IFCAP_TSO) != 0)
ifp->if_hwassist |= CSUM_TSO;
if ((ifp->if_capenable & IFCAP_TXCSUM) != 0)
ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
if ((ifp->if_capenable & IFCAP_TXCSUM_IPV6) != 0)
ifp->if_hwassist |= (CSUM_TCP_IPV6 | CSUM_UDP_IPV6);
al_eth_serdes_init(adapter);
rc = al_eth_hw_init(adapter);
if (rc != 0)
goto err_hw_init_open;
rc = al_eth_setup_int_mode(adapter);
if (rc != 0) {
device_printf(adapter->dev,
"%s failed at setup interrupt mode!\n", __func__);
goto err_setup_int;
}
/* allocate transmit descriptors */
rc = al_eth_setup_all_tx_resources(adapter);
if (rc != 0)
goto err_setup_tx;
/* allocate receive descriptors */
rc = al_eth_setup_all_rx_resources(adapter);
if (rc != 0)
goto err_setup_rx;
rc = al_eth_request_irq(adapter);
if (rc != 0)
goto err_req_irq;
al_eth_up_complete(adapter);
adapter->up = true;
if (adapter->mac_mode == AL_ETH_MAC_MODE_10GbE_Serial)
adapter->netdev->if_link_state = LINK_STATE_UP;
if (adapter->mac_mode == AL_ETH_MAC_MODE_RGMII) {
mii_mediachg(adapter->mii);
/* Schedule watchdog timeout */
mtx_lock(&adapter->wd_mtx);
callout_reset(&adapter->wd_callout, hz, al_tick, adapter);
mtx_unlock(&adapter->wd_mtx);
mii_pollstat(adapter->mii);
}
return (rc);
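/* Error unwind: release resources in the reverse order of allocation. */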
err_req_irq:
al_eth_free_all_rx_resources(adapter);
err_setup_rx:
al_eth_free_all_tx_resources(adapter);
err_setup_tx:
al_eth_free_irq(adapter);
err_setup_int:
al_eth_hw_stop(adapter);
err_hw_init_open:
al_eth_function_reset(adapter);
return (rc);
}
static int
al_shutdown(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
al_eth_down(adapter);
return (0);
}
static void
al_eth_down(struct al_eth_adapter *adapter)
{
device_printf_dbg(adapter->dev, "al_eth_down: begin\n");
adapter->up = false;
mtx_lock(&adapter->wd_mtx);
callout_stop(&adapter->wd_callout);
mtx_unlock(&adapter->wd_mtx);
al_eth_disable_int_sync(adapter);
mtx_lock(&adapter->stats_mtx);
callout_stop(&adapter->stats_callout);
mtx_unlock(&adapter->stats_mtx);
al_eth_free_irq(adapter);
al_eth_hw_stop(adapter);
al_eth_free_all_tx_resources(adapter);
al_eth_free_all_rx_resources(adapter);
}
static int
al_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct al_eth_adapter *adapter = ifp->if_softc;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (command) {
case SIOCSIFMTU:
{
error = al_eth_check_mtu(adapter, ifr->ifr_mtu);
if (error != 0) {
device_printf(adapter->dev, "ioctl wrong mtu %u\n",
adapter->netdev->if_mtu);
break;
}
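/* Mark the interface as not running, apply the new MTU and reinitialize. */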
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
adapter->netdev->if_mtu = ifr->ifr_mtu;
al_init(adapter);
break;
}
case SIOCSIFFLAGS:
if ((ifp->if_flags & IFF_UP) != 0) {
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
if (((ifp->if_flags ^ adapter->if_flags) &
(IFF_PROMISC | IFF_ALLMULTI)) != 0) {
device_printf_dbg(adapter->dev,
"ioctl promisc/allmulti\n");
al_eth_set_rx_mode(adapter);
}
} else {
error = al_eth_up(adapter);
if (error == 0)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
} else {
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
al_eth_down(adapter);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
}
adapter->if_flags = ifp->if_flags;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
device_printf_dbg(adapter->dev,
"ioctl add/del multi before\n");
al_eth_set_rx_mode(adapter);
#ifdef DEVICE_POLLING
if ((ifp->if_capenable & IFCAP_POLLING) == 0)
#endif
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
if (adapter->mii != NULL)
error = ifmedia_ioctl(ifp, ifr,
&adapter->mii->mii_media, command);
else
error = ifmedia_ioctl(ifp, ifr,
&adapter->media, command);
break;
case SIOCSIFCAP:
{
int mask, reinit;
reinit = 0;
mask = ifr->ifr_reqcap ^ ifp->if_capenable;
#ifdef DEVICE_POLLING
if ((mask & IFCAP_POLLING) != 0) {
if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0) {
if (error != 0)
return (error);
ifp->if_capenable |= IFCAP_POLLING;
} else {
error = ether_poll_deregister(ifp);
/* Enable interrupt even in error case */
ifp->if_capenable &= ~IFCAP_POLLING;
}
}
#endif
if ((mask & IFCAP_HWCSUM) != 0) {
/* apply to both rx and tx */
ifp->if_capenable ^= IFCAP_HWCSUM;
reinit = 1;
}
if ((mask & IFCAP_HWCSUM_IPV6) != 0) {
ifp->if_capenable ^= IFCAP_HWCSUM_IPV6;
reinit = 1;
}
if ((mask & IFCAP_TSO) != 0) {
ifp->if_capenable ^= IFCAP_TSO;
reinit = 1;
}
if ((mask & IFCAP_LRO) != 0) {
ifp->if_capenable ^= IFCAP_LRO;
}
if ((mask & IFCAP_VLAN_HWTAGGING) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
reinit = 1;
}
if ((mask & IFCAP_VLAN_HWFILTER) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
reinit = 1;
}
if ((mask & IFCAP_VLAN_HWTSO) != 0) {
ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
reinit = 1;
}
if ((reinit != 0) &&
((ifp->if_drv_flags & IFF_DRV_RUNNING)) != 0)
{
al_init(adapter);
}
break;
}
default:
error = ether_ioctl(ifp, command, data);
break;
}
return (error);
}
static int
al_is_device_supported(device_t dev)
{
uint16_t pci_vendor_id = pci_get_vendor(dev);
uint16_t pci_device_id = pci_get_device(dev);
return (pci_vendor_id == PCI_VENDOR_ID_ANNAPURNA_LABS &&
(pci_device_id == PCI_DEVICE_ID_AL_ETH ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_ADVANCED ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_NIC ||
pci_device_id == PCI_DEVICE_ID_AL_ETH_FPGA_NIC));
}
/* Time in msec to keep retrying MDIO reads/writes in case of error */
#define MDIO_TIMEOUT_MSEC 100
#define MDIO_PAUSE_MSEC 10
static int
al_miibus_readreg(device_t dev, int phy, int reg)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
uint16_t value = 0;
int rc;
int timeout = MDIO_TIMEOUT_MSEC;
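/* Retry the MDIO read for up to MDIO_TIMEOUT_MSEC, pausing MDIO_PAUSE_MSEC between attempts. */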
while (timeout > 0) {
rc = al_eth_mdio_read(&adapter->hal_adapter, adapter->phy_addr,
-1, reg, &value);
if (rc == 0)
return (value);
device_printf_dbg(adapter->dev,
"mdio read failed. try again in 10 msec\n");
timeout -= MDIO_PAUSE_MSEC;
pause("readred pause", MDIO_PAUSE_MSEC);
}
if (rc != 0)
device_printf(adapter->dev, "MDIO read failed on timeout\n");
return (value);
}
static int
al_miibus_writereg(device_t dev, int phy, int reg, int value)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
int rc;
int timeout = MDIO_TIMEOUT_MSEC;
while (timeout > 0) {
rc = al_eth_mdio_write(&adapter->hal_adapter, adapter->phy_addr,
-1, reg, value);
if (rc == 0)
return (0);
device_printf(adapter->dev,
"mdio write failed. try again in 10 msec\n");
timeout -= MDIO_PAUSE_MSEC;
pause("miibus writereg", MDIO_PAUSE_MSEC);
}
if (rc != 0)
device_printf(adapter->dev, "MDIO write failed on timeout\n");
return (rc);
}
static void
al_miibus_statchg(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
device_printf_dbg(adapter->dev,
"al_miibus_statchg: state has changed!\n");
device_printf_dbg(adapter->dev,
"al_miibus_statchg: active = 0x%x status = 0x%x\n",
adapter->mii->mii_media_active, adapter->mii->mii_media_status);
if (adapter->up == 0)
return;
if ((adapter->mii->mii_media_status & IFM_AVALID) != 0) {
if (adapter->mii->mii_media_status & IFM_ACTIVE) {
device_printf(adapter->dev, "link is UP\n");
adapter->netdev->if_link_state = LINK_STATE_UP;
} else {
device_printf(adapter->dev, "link is DOWN\n");
adapter->netdev->if_link_state = LINK_STATE_DOWN;
}
}
}
static void
al_miibus_linkchg(device_t dev)
{
struct al_eth_adapter *adapter = device_get_softc(dev);
uint8_t duplex = 0;
uint8_t speed = 0;
if (adapter->mii == NULL)
return;
if ((adapter->netdev->if_flags & IFF_UP) == 0)
return;
/* Ignore link changes when link is not ready */
if ((adapter->mii->mii_media_status & (IFM_AVALID | IFM_ACTIVE)) !=
(IFM_AVALID | IFM_ACTIVE)) {
return;
}
if ((adapter->mii->mii_media_active & IFM_FDX) != 0)
duplex = 1;
speed = IFM_SUBTYPE(adapter->mii->mii_media_active);
if (speed == IFM_10_T) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_10BASE_T_SPEED, duplex);
return;
}
if (speed == IFM_100_TX) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_100BASE_TX_SPEED, duplex);
return;
}
if (speed == IFM_1000_T) {
al_eth_mac_link_config(&adapter->hal_adapter, 0, 1,
AL_1000BASE_T_SPEED, duplex);
return;
}
device_printf(adapter->dev, "ERROR: unknown MII media active 0x%08x\n",
adapter->mii->mii_media_active);
}
Index: head/sys/dev/axgbe/xgbe-drv.c
===================================================================
--- head/sys/dev/axgbe/xgbe-drv.c (revision 327172)
+++ head/sys/dev/axgbe/xgbe-drv.c (revision 327173)
@@ -1,1079 +1,1076 @@
/*
* AMD 10Gb Ethernet driver
*
* This file is available to you under your choice of the following two
* licenses:
*
* License 1: GPLv2
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
*
* This file is free software; you may copy, redistribute and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or (at
* your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*
*
* License 2: Modified BSD
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Advanced Micro Devices, Inc. nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include "xgbe.h"
#include "xgbe-common.h"
static int xgbe_one_poll(struct xgbe_channel *channel, int budget);
static int xgbe_all_poll(struct xgbe_prv_data *pdata, int budget);
static int xgbe_alloc_channels(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel_mem, *channel;
struct xgbe_ring *tx_ring, *rx_ring;
unsigned int count, i;
int ret = -ENOMEM;
count = max_t(unsigned int, pdata->tx_ring_count, pdata->rx_ring_count);
channel_mem = malloc(count * sizeof(struct xgbe_channel), M_AXGBE,
M_WAITOK | M_ZERO);
tx_ring = malloc(pdata->tx_ring_count * sizeof(struct xgbe_ring),
M_AXGBE, M_WAITOK | M_ZERO);
rx_ring = malloc(pdata->rx_ring_count * sizeof(struct xgbe_ring),
M_AXGBE, M_WAITOK | M_ZERO);
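/*
* Initialize each channel and hand the Tx/Rx rings out to the first
* tx_ring_count/rx_ring_count channels respectively.
*/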
for (i = 0, channel = channel_mem; i < count; i++, channel++) {
snprintf(channel->name, sizeof(channel->name), "channel-%d", i);
channel->pdata = pdata;
channel->queue_index = i;
channel->dma_tag = rman_get_bustag(pdata->xgmac_res);
bus_space_subregion(channel->dma_tag,
rman_get_bushandle(pdata->xgmac_res),
DMA_CH_BASE + (DMA_CH_INC * i), DMA_CH_INC,
&channel->dma_handle);
if (pdata->per_channel_irq) {
if (pdata->chan_irq_res[i] == NULL)
goto err_irq;
channel->dma_irq_res = pdata->chan_irq_res[i];
}
if (i < pdata->tx_ring_count) {
spin_lock_init(&tx_ring->lock);
channel->tx_ring = tx_ring++;
}
if (i < pdata->rx_ring_count) {
spin_lock_init(&rx_ring->lock);
channel->rx_ring = rx_ring++;
}
}
pdata->channel = channel_mem;
pdata->channel_count = count;
return 0;
err_irq:
free(rx_ring, M_AXGBE);
free(tx_ring, M_AXGBE);
free(channel_mem, M_AXGBE);
return ret;
}
static void xgbe_free_channels(struct xgbe_prv_data *pdata)
{
if (!pdata->channel)
return;
free(pdata->channel->rx_ring, M_AXGBE);
free(pdata->channel->tx_ring, M_AXGBE);
free(pdata->channel, M_AXGBE);
pdata->channel = NULL;
pdata->channel_count = 0;
}
static inline unsigned int xgbe_tx_avail_desc(struct xgbe_ring *ring)
{
return (ring->rdesc_count - (ring->cur - ring->dirty));
}
static inline unsigned int xgbe_rx_dirty_desc(struct xgbe_ring *ring)
{
return (ring->cur - ring->dirty);
}
static int xgbe_maybe_stop_tx_queue(struct xgbe_channel *channel,
struct xgbe_ring *ring, unsigned int count)
{
struct xgbe_prv_data *pdata = channel->pdata;
if (count > xgbe_tx_avail_desc(ring)) {
/* If we haven't notified the hardware because of xmit_more
* support, tell it now
*/
if (ring->tx.xmit_more)
pdata->hw_if.tx_start_xmit(channel, ring);
return EFBIG;
}
return 0;
}
static int xgbe_calc_rx_buf_size(struct ifnet *netdev, unsigned int mtu)
{
unsigned int rx_buf_size;
if (mtu > XGMAC_JUMBO_PACKET_MTU) {
return -EINVAL;
}
rx_buf_size = mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
/* Clamp the computed size between the minimum buffer size and PAGE_SIZE. */
rx_buf_size = MAX(rx_buf_size, XGBE_RX_MIN_BUF_SIZE);
rx_buf_size = MIN(rx_buf_size, PAGE_SIZE);
rx_buf_size = (rx_buf_size + XGBE_RX_BUF_ALIGN - 1) &
~(XGBE_RX_BUF_ALIGN - 1);
return rx_buf_size;
}
static void xgbe_enable_rx_tx_ints(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_channel *channel;
enum xgbe_int int_id;
unsigned int i;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
if (channel->tx_ring && channel->rx_ring)
int_id = XGMAC_INT_DMA_CH_SR_TI_RI;
else if (channel->tx_ring)
int_id = XGMAC_INT_DMA_CH_SR_TI;
else if (channel->rx_ring)
int_id = XGMAC_INT_DMA_CH_SR_RI;
else
continue;
hw_if->enable_int(channel, int_id);
}
}
static void xgbe_isr(void *data)
{
struct xgbe_prv_data *pdata = data;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_channel *channel;
unsigned int dma_isr, dma_ch_isr;
unsigned int mac_isr;
unsigned int i;
/* The DMA interrupt status register also reports MAC and MTL
* interrupts. So for polling mode, we just need to check for
* this register to be non-zero
*/
dma_isr = XGMAC_IOREAD(pdata, DMA_ISR);
if (!dma_isr)
return;
for (i = 0; i < pdata->channel_count; i++) {
if (!(dma_isr & (1 << i)))
continue;
channel = pdata->channel + i;
dma_ch_isr = XGMAC_DMA_IOREAD(channel, DMA_CH_SR);
/* The TI or RI interrupt bits may still be set even if using
* per channel DMA interrupts. Only poll all channels from here
* when per channel DMA interrupts are not in use.
*/
if (!pdata->per_channel_irq &&
(XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, TI) ||
XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, RI))) {
xgbe_all_poll(pdata, 16);
}
if (XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, RBU))
pdata->ext_stats.rx_buffer_unavailable++;
/* Restart the device on a Fatal Bus Error */
if (XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, FBE))
taskqueue_enqueue(taskqueue_thread,
&pdata->restart_work);
/* Clear all interrupt signals */
XGMAC_DMA_IOWRITE(channel, DMA_CH_SR, dma_ch_isr);
}
if (XGMAC_GET_BITS(dma_isr, DMA_ISR, MACIS)) {
mac_isr = XGMAC_IOREAD(pdata, MAC_ISR);
if (XGMAC_GET_BITS(mac_isr, MAC_ISR, MMCTXIS))
hw_if->tx_mmc_int(pdata);
if (XGMAC_GET_BITS(mac_isr, MAC_ISR, MMCRXIS))
hw_if->rx_mmc_int(pdata);
}
}
static void xgbe_dma_isr(void *data)
{
struct xgbe_channel *channel = data;
xgbe_one_poll(channel, 16);
}
static void xgbe_service(void *ctx, int pending)
{
struct xgbe_prv_data *pdata = ctx;
pdata->phy_if.phy_status(pdata);
}
static void xgbe_service_timer(void *data)
{
struct xgbe_prv_data *pdata = data;
DBGPR("--> xgbe_service_timer\n");
taskqueue_enqueue(pdata->dev_workqueue, &pdata->service_work);
callout_reset(&pdata->service_timer, hz, xgbe_service_timer, pdata);
DBGPR("<-- xgbe_service_timer\n");
}
static void xgbe_init_timers(struct xgbe_prv_data *pdata)
{
callout_init(&pdata->service_timer, 1);
}
static void xgbe_start_timers(struct xgbe_prv_data *pdata)
{
callout_reset(&pdata->service_timer, hz, xgbe_service_timer, pdata);
}
static void xgbe_stop_timers(struct xgbe_prv_data *pdata)
{
callout_drain(&pdata->service_timer);
}
void xgbe_get_all_hw_features(struct xgbe_prv_data *pdata)
{
unsigned int mac_hfr0, mac_hfr1, mac_hfr2;
struct xgbe_hw_features *hw_feat = &pdata->hw_feat;
DBGPR("-->xgbe_get_all_hw_features\n");
mac_hfr0 = XGMAC_IOREAD(pdata, MAC_HWF0R);
mac_hfr1 = XGMAC_IOREAD(pdata, MAC_HWF1R);
mac_hfr2 = XGMAC_IOREAD(pdata, MAC_HWF2R);
memset(hw_feat, 0, sizeof(*hw_feat));
hw_feat->version = XGMAC_IOREAD(pdata, MAC_VR);
/* Hardware feature register 0 */
hw_feat->gmii = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, GMIISEL);
hw_feat->vlhash = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, VLHASH);
hw_feat->sma = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, SMASEL);
hw_feat->rwk = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, RWKSEL);
hw_feat->mgk = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, MGKSEL);
hw_feat->mmc = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, MMCSEL);
hw_feat->aoe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, ARPOFFSEL);
hw_feat->ts = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TSSEL);
hw_feat->eee = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, EEESEL);
hw_feat->tx_coe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TXCOESEL);
hw_feat->rx_coe = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, RXCOESEL);
hw_feat->addn_mac = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R,
ADDMACADRSEL);
hw_feat->ts_src = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, TSSTSSEL);
hw_feat->sa_vlan_ins = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, SAVLANINS);
/* Hardware feature register 1 */
hw_feat->rx_fifo_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
RXFIFOSIZE);
hw_feat->tx_fifo_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
TXFIFOSIZE);
hw_feat->adv_ts_hi = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, ADVTHWORD);
hw_feat->dma_width = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, ADDR64);
hw_feat->dcb = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, DCBEN);
hw_feat->sph = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, SPHEN);
hw_feat->tso = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, TSOEN);
hw_feat->dma_debug = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, DBGMEMA);
hw_feat->rss = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, RSSEN);
hw_feat->tc_cnt = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, NUMTC);
hw_feat->hash_table_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
HASHTBLSZ);
hw_feat->l3l4_filter_num = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R,
L3L4FNUM);
/* Hardware feature register 2 */
hw_feat->rx_q_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, RXQCNT);
hw_feat->tx_q_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, TXQCNT);
hw_feat->rx_ch_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, RXCHCNT);
hw_feat->tx_ch_cnt = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, TXCHCNT);
hw_feat->pps_out_num = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, PPSOUTNUM);
hw_feat->aux_snap_num = XGMAC_GET_BITS(mac_hfr2, MAC_HWF2R, AUXSNAPNUM);
/* Translate the Hash Table size into actual number */
switch (hw_feat->hash_table_size) {
case 0:
break;
case 1:
hw_feat->hash_table_size = 64;
break;
case 2:
hw_feat->hash_table_size = 128;
break;
case 3:
hw_feat->hash_table_size = 256;
break;
}
/* Translate the address width setting into actual number */
switch (hw_feat->dma_width) {
case 0:
hw_feat->dma_width = 32;
break;
case 1:
hw_feat->dma_width = 40;
break;
case 2:
hw_feat->dma_width = 48;
break;
default:
hw_feat->dma_width = 32;
}
/* The Queue, Channel and TC counts are zero based so increment them
* to get the actual number
*/
hw_feat->rx_q_cnt++;
hw_feat->tx_q_cnt++;
hw_feat->rx_ch_cnt++;
hw_feat->tx_ch_cnt++;
hw_feat->tc_cnt++;
DBGPR("<--xgbe_get_all_hw_features\n");
}
static int xgbe_request_irqs(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel;
unsigned int i;
int ret;
ret = bus_setup_intr(pdata->dev, pdata->dev_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_isr, pdata,
&pdata->dev_irq_tag);
if (ret) {
return ret;
}
if (!pdata->per_channel_irq)
return 0;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ret = bus_setup_intr(pdata->dev, channel->dma_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_dma_isr, channel,
&channel->dma_irq_tag);
if (ret != 0) {
goto err_irq;
}
}
return 0;
err_irq:
/* Using an unsigned int, 'i' will go to UINT_MAX and exit */
for (i--, channel--; i < pdata->channel_count; i--, channel--)
bus_teardown_intr(pdata->dev, channel->dma_irq_res,
channel->dma_irq_tag);
bus_teardown_intr(pdata->dev, pdata->dev_irq_res, pdata->dev_irq_tag);
return -ret;
}
static void xgbe_free_irqs(struct xgbe_prv_data *pdata)
{
struct xgbe_channel *channel;
unsigned int i;
bus_teardown_intr(pdata->dev, pdata->dev_irq_res, pdata->dev_irq_tag);
if (!pdata->per_channel_irq)
return;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++)
bus_teardown_intr(pdata->dev, channel->dma_irq_res,
channel->dma_irq_tag);
}
void xgbe_init_tx_coalesce(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
DBGPR("-->xgbe_init_tx_coalesce\n");
pdata->tx_usecs = XGMAC_INIT_DMA_TX_USECS;
pdata->tx_frames = XGMAC_INIT_DMA_TX_FRAMES;
hw_if->config_tx_coalesce(pdata);
DBGPR("<--xgbe_init_tx_coalesce\n");
}
void xgbe_init_rx_coalesce(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
DBGPR("-->xgbe_init_rx_coalesce\n");
pdata->rx_riwt = hw_if->usec_to_riwt(pdata, XGMAC_INIT_DMA_RX_USECS);
pdata->rx_usecs = XGMAC_INIT_DMA_RX_USECS;
pdata->rx_frames = XGMAC_INIT_DMA_RX_FRAMES;
hw_if->config_rx_coalesce(pdata);
DBGPR("<--xgbe_init_rx_coalesce\n");
}
static void xgbe_free_tx_data(struct xgbe_prv_data *pdata)
{
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_ring_data *rdata;
unsigned int i, j;
DBGPR("-->xgbe_free_tx_data\n");
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ring = channel->tx_ring;
if (!ring)
break;
for (j = 0; j < ring->rdesc_count; j++) {
rdata = XGBE_GET_DESC_DATA(ring, j);
desc_if->unmap_rdata(pdata, rdata);
}
}
DBGPR("<--xgbe_free_tx_data\n");
}
static void xgbe_free_rx_data(struct xgbe_prv_data *pdata)
{
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_ring_data *rdata;
unsigned int i, j;
DBGPR("-->xgbe_free_rx_data\n");
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
ring = channel->rx_ring;
if (!ring)
break;
for (j = 0; j < ring->rdesc_count; j++) {
rdata = XGBE_GET_DESC_DATA(ring, j);
desc_if->unmap_rdata(pdata, rdata);
}
}
DBGPR("<--xgbe_free_rx_data\n");
}
static int xgbe_phy_init(struct xgbe_prv_data *pdata)
{
pdata->phy_link = -1;
pdata->phy_speed = SPEED_UNKNOWN;
return pdata->phy_if.phy_reset(pdata);
}
static int xgbe_start(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_phy_if *phy_if = &pdata->phy_if;
int ret;
DBGPR("-->xgbe_start\n");
hw_if->init(pdata);
ret = phy_if->phy_start(pdata);
if (ret)
goto err_phy;
ret = xgbe_request_irqs(pdata);
if (ret)
goto err_napi;
hw_if->enable_tx(pdata);
hw_if->enable_rx(pdata);
xgbe_enable_rx_tx_ints(pdata);
xgbe_start_timers(pdata);
taskqueue_enqueue(pdata->dev_workqueue, &pdata->service_work);
DBGPR("<--xgbe_start\n");
return 0;
err_napi:
phy_if->phy_stop(pdata);
err_phy:
hw_if->exit(pdata);
return ret;
}
static void xgbe_stop(struct xgbe_prv_data *pdata)
{
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_phy_if *phy_if = &pdata->phy_if;
DBGPR("-->xgbe_stop\n");
xgbe_stop_timers(pdata);
taskqueue_drain_all(pdata->dev_workqueue);
hw_if->disable_tx(pdata);
hw_if->disable_rx(pdata);
xgbe_free_irqs(pdata);
phy_if->phy_stop(pdata);
hw_if->exit(pdata);
DBGPR("<--xgbe_stop\n");
}
static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
{
DBGPR("-->xgbe_restart_dev\n");
/* If not running, "restart" will happen on open */
if ((pdata->netdev->if_drv_flags & IFF_DRV_RUNNING) == 0)
return;
xgbe_stop(pdata);
xgbe_free_tx_data(pdata);
xgbe_free_rx_data(pdata);
xgbe_start(pdata);
DBGPR("<--xgbe_restart_dev\n");
}
static void xgbe_restart(void *ctx, int pending)
{
struct xgbe_prv_data *pdata = ctx;
xgbe_restart_dev(pdata);
}
static void xgbe_packet_info(struct xgbe_prv_data *pdata,
struct xgbe_ring *ring, struct mbuf *m0,
struct xgbe_packet_data *packet)
{
struct mbuf *m;
unsigned int len;
packet->m = m0;
packet->rdesc_count = 0;
packet->tx_packets = 1;
packet->tx_bytes = m_length(m0, NULL);
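/* One Tx descriptor is needed per XGBE_TX_MAX_BUF_SIZE chunk of every mbuf in the chain. */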
for (m = m0; m != NULL; m = m->m_next) {
for (len = m->m_len; len != 0;) {
packet->rdesc_count++;
len -= MIN(len, XGBE_TX_MAX_BUF_SIZE);
}
}
}
int xgbe_open(struct ifnet *netdev)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
int ret;
DBGPR("-->xgbe_open\n");
/* Initialize the phy */
ret = xgbe_phy_init(pdata);
if (ret)
return ret;
/* Calculate the Rx buffer size before allocating rings */
ret = xgbe_calc_rx_buf_size(netdev, if_getmtu(netdev));
if (ret < 0) {
goto err_ptpclk;
}
pdata->rx_buf_size = ret;
/* Allocate the channel and ring structures */
ret = xgbe_alloc_channels(pdata);
if (ret) {
printf("xgbe_alloc_channels failed\n");
goto err_ptpclk;
}
/* Allocate the ring descriptors and buffers */
ret = desc_if->alloc_ring_resources(pdata);
if (ret) {
printf("desc_if->alloc_ring_resources failed\n");
goto err_channels;
}
TASK_INIT(&pdata->service_work, 0, xgbe_service, pdata);
TASK_INIT(&pdata->restart_work, 0, xgbe_restart, pdata);
xgbe_init_timers(pdata);
ret = xgbe_start(pdata);
if (ret)
goto err_rings;
clear_bit(XGBE_DOWN, &pdata->dev_state);
DBGPR("<--xgbe_open\n");
return 0;
err_rings:
desc_if->free_ring_resources(pdata);
err_channels:
xgbe_free_channels(pdata);
err_ptpclk:
return ret;
}
int xgbe_close(struct ifnet *netdev)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
DBGPR("-->xgbe_close\n");
/* Stop the device */
xgbe_stop(pdata);
/* Free the ring descriptors and buffers */
desc_if->free_ring_resources(pdata);
/* Free the channel and ring structures */
xgbe_free_channels(pdata);
set_bit(XGBE_DOWN, &pdata->dev_state);
DBGPR("<--xgbe_close\n");
return 0;
}
int xgbe_xmit(struct ifnet *ifp, struct mbuf *m)
{
struct xgbe_prv_data *pdata = ifp->if_softc;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_channel *channel;
struct xgbe_ring *ring;
struct xgbe_packet_data *packet;
int ret;
M_ASSERTPKTHDR(m);
MPASS(m->m_nextpkt == NULL);
if (__predict_false(test_bit(XGBE_DOWN, &pdata->dev_state) ||
!pdata->phy.link)) {
m_freem(m);
return (ENETDOWN);
}
channel = pdata->channel;
ring = channel->tx_ring;
packet = &ring->packet_data;
/* Calculate preliminary packet info */
memset(packet, 0, sizeof(*packet));
xgbe_packet_info(pdata, ring, m, packet);
/* Check that there are enough descriptors available */
ret = xgbe_maybe_stop_tx_queue(channel, ring, packet->rdesc_count);
if (ret)
goto tx_netdev_return;
if (!desc_if->map_tx_skb(channel, m)) {
goto tx_netdev_return;
}
/* Configure required descriptor fields for transmission */
hw_if->dev_xmit(channel);
return 0;
tx_netdev_return:
m_free(m);
return 0;
}
int xgbe_change_mtu(struct ifnet *netdev, int mtu)
{
struct xgbe_prv_data *pdata = netdev->if_softc;
int ret;
DBGPR("-->xgbe_change_mtu\n");
ret = xgbe_calc_rx_buf_size(netdev, mtu);
if (ret < 0)
return -ret;
pdata->rx_buf_size = ret;
netdev->if_mtu = mtu;
xgbe_restart_dev(pdata);
DBGPR("<--xgbe_change_mtu\n");
return 0;
}
static void xgbe_rx_refresh(struct xgbe_channel *channel)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_ring *ring = channel->rx_ring;
struct xgbe_ring_data *rdata;
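/* Re-arm every descriptor consumed since the last refresh. */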
while (ring->dirty != ring->cur) {
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty);
/* Reset rdata values */
desc_if->unmap_rdata(pdata, rdata);
if (desc_if->map_rx_buffer(pdata, ring, rdata))
break;
hw_if->rx_desc_reset(pdata, rdata, ring->dirty);
ring->dirty++;
}
/* Make sure everything is written before the register write */
dsb(sy);
/* Update the Rx Tail Pointer Register with the address of
* the last cleaned entry */
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty - 1);
XGMAC_DMA_IOWRITE(channel, DMA_CH_RDTR_LO,
lower_32_bits(rdata->rdata_paddr));
}
static int xgbe_tx_poll(struct xgbe_channel *channel)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_desc_if *desc_if = &pdata->desc_if;
struct xgbe_ring *ring = channel->tx_ring;
struct xgbe_ring_data *rdata;
struct xgbe_ring_desc *rdesc;
int processed = 0;
unsigned int cur;
DBGPR("-->xgbe_tx_poll\n");
/* Nothing to do if there isn't a Tx ring for this channel */
if (!ring)
return 0;
cur = ring->cur;
/* Be sure we get ring->cur before accessing descriptor data */
dsb(sy);
while ((processed < XGBE_TX_DESC_MAX_PROC) &&
(ring->dirty != cur)) {
rdata = XGBE_GET_DESC_DATA(ring, ring->dirty);
rdesc = rdata->rdesc;
if (!hw_if->tx_complete(rdesc))
break;
/* Make sure descriptor fields are read after reading the OWN
* bit */
dsb(sy);
/* Free the SKB and reset the descriptor for re-use */
desc_if->unmap_rdata(pdata, rdata);
hw_if->tx_desc_reset(rdata);
processed++;
ring->dirty++;
}
if (!processed)
return 0;
DBGPR("<--xgbe_tx_poll: processed=%d\n", processed);
return processed;
}
static int xgbe_rx_poll(struct xgbe_channel *channel, int budget)
{
struct xgbe_prv_data *pdata = channel->pdata;
struct xgbe_hw_if *hw_if = &pdata->hw_if;
struct xgbe_ring *ring = channel->rx_ring;
struct xgbe_ring_data *rdata;
struct xgbe_packet_data *packet;
struct ifnet *ifp = pdata->netdev;
struct mbuf *m;
- unsigned int incomplete, context_next, context;
+ unsigned int incomplete, context_next;
unsigned int received = 0;
int packet_count = 0;
DBGPR("-->xgbe_rx_poll: budget=%d\n", budget);
/* Nothing to do if there isn't a Rx ring for this channel */
if (!ring)
return 0;
incomplete = 0;
context_next = 0;
rdata = XGBE_GET_DESC_DATA(ring, ring->cur);
packet = &ring->packet_data;
while (packet_count < budget) {
DBGPR(" cur = %d\n", ring->cur);
read_again:
rdata = XGBE_GET_DESC_DATA(ring, ring->cur);
if (xgbe_rx_dirty_desc(ring) > (XGBE_RX_DESC_CNT >> 3))
xgbe_rx_refresh(channel);
if (hw_if->dev_read(channel))
break;
m = rdata->mb;
received++;
ring->cur++;
incomplete = XGMAC_GET_BITS(packet->attributes,
RX_PACKET_ATTRIBUTES,
INCOMPLETE);
context_next = XGMAC_GET_BITS(packet->attributes,
RX_PACKET_ATTRIBUTES,
CONTEXT_NEXT);
- context = XGMAC_GET_BITS(packet->attributes,
- RX_PACKET_ATTRIBUTES,
- CONTEXT);
/* Earlier error, just drain the remaining data */
if (incomplete || context_next) {
goto read_again;
}
if (packet->errors) {
rdata->mbuf_free = 1;
goto next_packet;
}
rdata->mb = NULL;
m->m_pkthdr.len = rdata->rx.hdr_len + rdata->rx.len;
if (rdata->rx.hdr_len != 0) {
m->m_len = rdata->rx.hdr_len;
m->m_next->m_len = rdata->rx.len;
} else {
m->m_len = rdata->rx.len;
m_freem(m->m_next);
m->m_next = NULL;
}
if_setrcvif(m, ifp);
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
ifp->if_input(ifp, m);
next_packet:
packet_count++;
}
DBGPR("<--xgbe_rx_poll: packet_count = %d\n", packet_count);
return packet_count;
}
static int xgbe_one_poll(struct xgbe_channel *channel, int budget)
{
int processed = 0;
DBGPR("-->xgbe_one_poll: budget=%d\n", budget);
/* Cleanup Tx ring first */
xgbe_tx_poll(channel);
/* Process Rx ring next */
processed = xgbe_rx_poll(channel, budget);
DBGPR("<--xgbe_one_poll: received = %d\n", processed);
return processed;
}
static int xgbe_all_poll(struct xgbe_prv_data *pdata, int budget)
{
struct xgbe_channel *channel;
int ring_budget;
int processed, last_processed;
unsigned int i;
DBGPR("-->xgbe_all_poll: budget=%d\n", budget);
processed = 0;
ring_budget = budget / pdata->rx_ring_count;
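/* Keep sweeping all channels until the budget is exhausted or no further progress is made. */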
do {
last_processed = processed;
channel = pdata->channel;
for (i = 0; i < pdata->channel_count; i++, channel++) {
/* Cleanup Tx ring first */
xgbe_tx_poll(channel);
/* Process Rx ring next */
if (ring_budget > (budget - processed))
ring_budget = budget - processed;
processed += xgbe_rx_poll(channel, ring_budget);
}
} while ((processed < budget) && (processed != last_processed));
DBGPR("<--xgbe_all_poll: received = %d\n", processed);
return processed;
}
Index: head/sys/dev/axgbe/xgbe-mdio.c
===================================================================
--- head/sys/dev/axgbe/xgbe-mdio.c (revision 327172)
+++ head/sys/dev/axgbe/xgbe-mdio.c (revision 327173)
@@ -1,1180 +1,1174 @@
/*
* AMD 10Gb Ethernet driver
*
* This file is available to you under your choice of the following two
* licenses:
*
* License 1: GPLv2
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
*
* This file is free software; you may copy, redistribute and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or (at
* your option) any later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*
*
* License 2: Modified BSD
*
* Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Advanced Micro Devices, Inc. nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This file incorporates work covered by the following copyright and
* permission notice:
* The Synopsys DWC ETHER XGMAC Software Driver and documentation
* (hereinafter "Software") is an unsupported proprietary work of Synopsys,
* Inc. unless otherwise expressly agreed to in writing between Synopsys
* and you.
*
* The Software IS NOT an item of Licensed Software or Licensed Product
* under any End User Software License Agreement or Agreement for Licensed
* Product with Synopsys or any supplement thereto. Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software
* annotated with this license and the Software, to deal in the Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
* BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include "xgbe.h"
#include "xgbe-common.h"
static void xgbe_an_state_machine(struct xgbe_prv_data *pdata);
static void xgbe_an_enable_kr_training(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
reg |= XGBE_KR_TRAINING_ENABLE;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
}
static void xgbe_an_disable_kr_training(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
reg &= ~XGBE_KR_TRAINING_ENABLE;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
}
static void xgbe_pcs_power_cycle(struct xgbe_prv_data *pdata)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg |= MDIO_CTRL1_LPOWER;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
DELAY(75);
reg &= ~MDIO_CTRL1_LPOWER;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
}
static void xgbe_serdes_start_ratechange(struct xgbe_prv_data *pdata)
{
/* Assert Rx and Tx ratechange */
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, RATECHANGE, 1);
}
static void xgbe_serdes_complete_ratechange(struct xgbe_prv_data *pdata)
{
unsigned int wait;
u16 status;
/* Release Rx and Tx ratechange */
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, RATECHANGE, 0);
/* Wait for Rx and Tx ready */
wait = XGBE_RATECHANGE_COUNT;
while (wait--) {
DELAY(50);
status = XSIR0_IOREAD(pdata, SIR0_STATUS);
if (XSIR_GET_BITS(status, SIR0_STATUS, RX_READY) &&
XSIR_GET_BITS(status, SIR0_STATUS, TX_READY))
goto rx_reset;
}
rx_reset:
/* Perform Rx reset for the DFE changes */
XRXTX_IOWRITE_BITS(pdata, RXTX_REG6, RESETB_RXD, 0);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG6, RESETB_RXD, 1);
}
static void xgbe_xgmii_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Enable KR training */
xgbe_an_enable_kr_training(pdata);
/* Set MAC to 10G speed */
pdata->hw_if.set_xgmii_speed(pdata);
/* Set PCS to KR/10G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBR;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED10G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 10G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_10000_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_10000_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_10000_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_10000]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_10000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_10000]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_10000]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_gmii_2500_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Disable KR training */
xgbe_an_disable_kr_training(pdata);
/* Set MAC to 2.5G speed */
pdata->hw_if.set_gmii_2500_speed(pdata);
/* Set PCS to KX/1G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBX;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED1G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 2.5G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_2500_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_2500_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_2500_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_2500]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_2500]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_2500]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_2500]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_gmii_mode(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Disable KR training */
xgbe_an_disable_kr_training(pdata);
/* Set MAC to 1G speed */
pdata->hw_if.set_gmii_speed(pdata);
/* Set PCS to KX/1G speed */
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
reg &= ~MDIO_PCS_CTRL2_TYPE;
reg |= MDIO_PCS_CTRL2_10GBX;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL2, reg);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg &= ~MDIO_CTRL1_SPEEDSEL;
reg |= MDIO_CTRL1_SPEED1G;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
xgbe_pcs_power_cycle(pdata);
/* Set SerDes to 1G speed */
xgbe_serdes_start_ratechange(pdata);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, DATARATE, XGBE_SPEED_1000_RATE);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, WORDMODE, XGBE_SPEED_1000_WORD);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, PLLSEL, XGBE_SPEED_1000_PLL);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, CDR_RATE,
pdata->serdes_cdr_rate[XGBE_SPEED_1000]);
XSIR1_IOWRITE_BITS(pdata, SIR1_SPEED, TXAMP,
pdata->serdes_tx_amp[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG20, BLWC_ENA,
pdata->serdes_blwc[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG114, PQ_REG,
pdata->serdes_pq_skew[XGBE_SPEED_1000]);
XRXTX_IOWRITE_BITS(pdata, RXTX_REG129, RXDFE_CONFIG,
pdata->serdes_dfe_tap_cfg[XGBE_SPEED_1000]);
XRXTX_IOWRITE(pdata, RXTX_REG22,
pdata->serdes_dfe_tap_ena[XGBE_SPEED_1000]);
xgbe_serdes_complete_ratechange(pdata);
}
static void xgbe_cur_mode(struct xgbe_prv_data *pdata,
enum xgbe_mode *mode)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL2);
if ((reg & MDIO_PCS_CTRL2_TYPE) == MDIO_PCS_CTRL2_10GBR)
*mode = XGBE_MODE_KR;
else
*mode = XGBE_MODE_KX;
}
static bool xgbe_in_kr_mode(struct xgbe_prv_data *pdata)
{
enum xgbe_mode mode;
xgbe_cur_mode(pdata, &mode);
return (mode == XGBE_MODE_KR);
}
static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
{
/* If we are in KR switch to KX, and vice-versa */
if (xgbe_in_kr_mode(pdata)) {
if (pdata->speed_set == XGBE_SPEEDSET_1000_10000)
xgbe_gmii_mode(pdata);
else
xgbe_gmii_2500_mode(pdata);
} else {
xgbe_xgmii_mode(pdata);
}
}
static void xgbe_set_mode(struct xgbe_prv_data *pdata,
enum xgbe_mode mode)
{
enum xgbe_mode cur_mode;
xgbe_cur_mode(pdata, &cur_mode);
if (mode != cur_mode)
xgbe_switch_mode(pdata);
}
static bool xgbe_use_xgmii_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_10000)
return true;
}
return false;
}
static bool xgbe_use_gmii_2500_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_2500baseX_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_2500)
return true;
}
return false;
}
static bool xgbe_use_gmii_mode(struct xgbe_prv_data *pdata)
{
if (pdata->phy.autoneg == AUTONEG_ENABLE) {
if (pdata->phy.advertising & ADVERTISED_1000baseKX_Full)
return true;
} else {
if (pdata->phy.speed == SPEED_1000)
return true;
}
return false;
}
static void xgbe_set_an(struct xgbe_prv_data *pdata, bool enable, bool restart)
{
unsigned int reg;
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
reg &= ~MDIO_AN_CTRL1_ENABLE;
if (enable)
reg |= MDIO_AN_CTRL1_ENABLE;
if (restart)
reg |= MDIO_AN_CTRL1_RESTART;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_CTRL1, reg);
}
static void xgbe_restart_an(struct xgbe_prv_data *pdata)
{
xgbe_set_an(pdata, true, true);
}
static void xgbe_disable_an(struct xgbe_prv_data *pdata)
{
xgbe_set_an(pdata, false, false);
}
static enum xgbe_an xgbe_an_tx_training(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int ad_reg, lp_reg, reg;
*state = XGBE_RX_COMPLETE;
/* If we're not in KR mode then we're done */
if (!xgbe_in_kr_mode(pdata))
return XGBE_AN_PAGE_RECEIVED;
/* Enable/Disable FEC */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 2);
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL);
reg &= ~(MDIO_PMA_10GBR_FECABLE_ABLE | MDIO_PMA_10GBR_FECABLE_ERRABLE);
if ((ad_reg & 0xc000) && (lp_reg & 0xc000))
reg |= pdata->fec_ability;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
/* Start KR training */
reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
if (reg & XGBE_KR_TRAINING_ENABLE) {
XSIR0_IOWRITE_BITS(pdata, SIR0_KR_RT_1, RESET, 1);
reg |= XGBE_KR_TRAINING_START;
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
reg);
XSIR0_IOWRITE_BITS(pdata, SIR0_KR_RT_1, RESET, 0);
}
return XGBE_AN_PAGE_RECEIVED;
}
static enum xgbe_an xgbe_an_tx_xnp(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
u16 msg;
*state = XGBE_RX_XNP;
msg = XGBE_XNP_MCF_NULL_MESSAGE;
msg |= XGBE_XNP_MP_FORMATTED;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP + 2, 0);
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP + 1, 0);
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_XNP, msg);
return XGBE_AN_PAGE_RECEIVED;
}
static enum xgbe_an xgbe_an_rx_bpa(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int link_support;
unsigned int reg, ad_reg, lp_reg;
/* Read Base Ability register 2 first */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 1);
/* Check for a supported mode, otherwise restart in a different one */
link_support = xgbe_in_kr_mode(pdata) ? 0x80 : 0x20;
if (!(reg & link_support))
return XGBE_AN_INCOMPAT_LINK;
/* Check Extended Next Page support */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA);
return ((ad_reg & XGBE_XNP_NP_EXCHANGE) ||
(lp_reg & XGBE_XNP_NP_EXCHANGE))
? xgbe_an_tx_xnp(pdata, state)
: xgbe_an_tx_training(pdata, state);
}
static enum xgbe_an xgbe_an_rx_xnp(struct xgbe_prv_data *pdata,
enum xgbe_rx *state)
{
unsigned int ad_reg, lp_reg;
/* Check Extended Next Page support */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_XNP);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPX);
return ((ad_reg & XGBE_XNP_NP_EXCHANGE) ||
(lp_reg & XGBE_XNP_NP_EXCHANGE))
? xgbe_an_tx_xnp(pdata, state)
: xgbe_an_tx_training(pdata, state);
}
static enum xgbe_an xgbe_an_page_received(struct xgbe_prv_data *pdata)
{
enum xgbe_rx *state;
unsigned long an_timeout;
enum xgbe_an ret;
if (!pdata->an_start) {
pdata->an_start = ticks;
} else {
an_timeout = pdata->an_start +
((uint64_t)XGBE_AN_MS_TIMEOUT * (uint64_t)hz) / 1000ull;
if ((int)(ticks - an_timeout) > 0) {
/* Auto-negotiation timed out, reset state */
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
pdata->an_start = ticks;
}
}
state = xgbe_in_kr_mode(pdata) ? &pdata->kr_state
: &pdata->kx_state;
switch (*state) {
case XGBE_RX_BPA:
ret = xgbe_an_rx_bpa(pdata, state);
break;
case XGBE_RX_XNP:
ret = xgbe_an_rx_xnp(pdata, state);
break;
default:
ret = XGBE_AN_ERROR;
}
return ret;
}
static enum xgbe_an xgbe_an_incompat_link(struct xgbe_prv_data *pdata)
{
/* Be sure we aren't looping trying to negotiate */
if (xgbe_in_kr_mode(pdata)) {
pdata->kr_state = XGBE_RX_ERROR;
if (!(pdata->phy.advertising & ADVERTISED_1000baseKX_Full) &&
!(pdata->phy.advertising & ADVERTISED_2500baseX_Full))
return XGBE_AN_NO_LINK;
if (pdata->kx_state != XGBE_RX_BPA)
return XGBE_AN_NO_LINK;
} else {
pdata->kx_state = XGBE_RX_ERROR;
if (!(pdata->phy.advertising & ADVERTISED_10000baseKR_Full))
return XGBE_AN_NO_LINK;
if (pdata->kr_state != XGBE_RX_BPA)
return XGBE_AN_NO_LINK;
}
xgbe_disable_an(pdata);
xgbe_switch_mode(pdata);
xgbe_restart_an(pdata);
return XGBE_AN_INCOMPAT_LINK;
}
static void xgbe_an_isr(void *data)
{
struct xgbe_prv_data *pdata = (struct xgbe_prv_data *)data;
/* Disable AN interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
/* Save the interrupt(s) that fired */
pdata->an_int = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_INT);
if (pdata->an_int) {
/* Clear the interrupt(s) that fired and process them */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, ~pdata->an_int);
xgbe_an_state_machine(pdata);
} else {
/* Enable AN interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK,
XGBE_AN_INT_MASK);
}
}
static void xgbe_an_state_machine(struct xgbe_prv_data *pdata)
{
enum xgbe_an cur_state = pdata->an_state;
sx_xlock(&pdata->an_mutex);
if (!pdata->an_int)
goto out;
next_int:
if (pdata->an_int & XGBE_AN_PG_RCV) {
pdata->an_state = XGBE_AN_PAGE_RECEIVED;
pdata->an_int &= ~XGBE_AN_PG_RCV;
} else if (pdata->an_int & XGBE_AN_INC_LINK) {
pdata->an_state = XGBE_AN_INCOMPAT_LINK;
pdata->an_int &= ~XGBE_AN_INC_LINK;
} else if (pdata->an_int & XGBE_AN_INT_CMPLT) {
pdata->an_state = XGBE_AN_COMPLETE;
pdata->an_int &= ~XGBE_AN_INT_CMPLT;
} else {
pdata->an_state = XGBE_AN_ERROR;
}
pdata->an_result = pdata->an_state;
again:
cur_state = pdata->an_state;
switch (pdata->an_state) {
case XGBE_AN_READY:
pdata->an_supported = 0;
break;
case XGBE_AN_PAGE_RECEIVED:
pdata->an_state = xgbe_an_page_received(pdata);
pdata->an_supported++;
break;
case XGBE_AN_INCOMPAT_LINK:
pdata->an_supported = 0;
pdata->parallel_detect = 0;
pdata->an_state = xgbe_an_incompat_link(pdata);
break;
case XGBE_AN_COMPLETE:
pdata->parallel_detect = pdata->an_supported ? 0 : 1;
break;
case XGBE_AN_NO_LINK:
break;
default:
pdata->an_state = XGBE_AN_ERROR;
}
if (pdata->an_state == XGBE_AN_NO_LINK) {
pdata->an_int = 0;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
} else if (pdata->an_state == XGBE_AN_ERROR) {
pdata->an_int = 0;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
}
if (pdata->an_state >= XGBE_AN_COMPLETE) {
pdata->an_result = pdata->an_state;
pdata->an_state = XGBE_AN_READY;
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
pdata->an_start = 0;
}
if (cur_state != pdata->an_state)
goto again;
if (pdata->an_int)
goto next_int;
out:
/* Enable AN interrupts on the way out */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, XGBE_AN_INT_MASK);
sx_xunlock(&pdata->an_mutex);
}
static void xgbe_an_init(struct xgbe_prv_data *pdata)
{
unsigned int reg;
/* Set up Advertisement register 3 first */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
reg &= ~0xc000;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2, reg);
/* Set up Advertisement register 2 next */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1);
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full)
reg |= 0x80;
else
reg &= ~0x80;
if ((pdata->phy.advertising & ADVERTISED_1000baseKX_Full) ||
(pdata->phy.advertising & ADVERTISED_2500baseX_Full))
reg |= 0x20;
else
reg &= ~0x20;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1, reg);
/* Set up Advertisement register 1 last */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
if (pdata->phy.advertising & ADVERTISED_Pause)
reg |= 0x400;
else
reg &= ~0x400;
if (pdata->phy.advertising & ADVERTISED_Asym_Pause)
reg |= 0x800;
else
reg &= ~0x800;
/* We don't intend to perform XNP */
reg &= ~XGBE_XNP_NP_EXCHANGE;
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE, reg);
}
static void xgbe_phy_adjust_link(struct xgbe_prv_data *pdata)
{
- int new_state = 0;
if (pdata->phy.link) {
/* Flow control support */
pdata->pause_autoneg = pdata->phy.pause_autoneg;
if (pdata->tx_pause != pdata->phy.tx_pause) {
- new_state = 1;
pdata->hw_if.config_tx_flow_control(pdata);
pdata->tx_pause = pdata->phy.tx_pause;
}
if (pdata->rx_pause != pdata->phy.rx_pause) {
- new_state = 1;
pdata->hw_if.config_rx_flow_control(pdata);
pdata->rx_pause = pdata->phy.rx_pause;
}
/* Speed support */
if (pdata->phy_speed != pdata->phy.speed) {
- new_state = 1;
pdata->phy_speed = pdata->phy.speed;
}
if (pdata->phy_link != pdata->phy.link) {
- new_state = 1;
pdata->phy_link = pdata->phy.link;
}
} else if (pdata->phy_link) {
- new_state = 1;
pdata->phy_link = 0;
pdata->phy_speed = SPEED_UNKNOWN;
}
}
static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
{
/* Disable auto-negotiation */
xgbe_disable_an(pdata);
/* Validate/Set specified speed */
switch (pdata->phy.speed) {
case SPEED_10000:
xgbe_set_mode(pdata, XGBE_MODE_KR);
break;
case SPEED_2500:
case SPEED_1000:
xgbe_set_mode(pdata, XGBE_MODE_KX);
break;
default:
return -EINVAL;
}
/* Validate duplex mode */
if (pdata->phy.duplex != DUPLEX_FULL)
return -EINVAL;
return 0;
}
static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
{
set_bit(XGBE_LINK_INIT, &pdata->dev_state);
pdata->link_check = ticks;
if (pdata->phy.autoneg != AUTONEG_ENABLE)
return xgbe_phy_config_fixed(pdata);
/* Disable auto-negotiation interrupt */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
/* Clear any auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
/* Start auto-negotiation in a supported mode */
if (pdata->phy.advertising & ADVERTISED_10000baseKR_Full) {
xgbe_set_mode(pdata, XGBE_MODE_KR);
} else if ((pdata->phy.advertising & ADVERTISED_1000baseKX_Full) ||
(pdata->phy.advertising & ADVERTISED_2500baseX_Full)) {
xgbe_set_mode(pdata, XGBE_MODE_KX);
} else {
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
return -EINVAL;
}
/* Disable and stop any in progress auto-negotiation */
xgbe_disable_an(pdata);
/* Clear any auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
pdata->an_result = XGBE_AN_READY;
pdata->an_state = XGBE_AN_READY;
pdata->kr_state = XGBE_RX_BPA;
pdata->kx_state = XGBE_RX_BPA;
/* Re-enable auto-negotiation interrupt */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
/* Set up advertisement registers based on current settings */
xgbe_an_init(pdata);
/* Enable and start auto-negotiation */
xgbe_restart_an(pdata);
return 0;
}
static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
{
int ret;
sx_xlock(&pdata->an_mutex);
ret = __xgbe_phy_config_aneg(pdata);
if (ret)
set_bit(XGBE_LINK_ERR, &pdata->dev_state);
else
clear_bit(XGBE_LINK_ERR, &pdata->dev_state);
sx_unlock(&pdata->an_mutex);
return ret;
}
static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
{
return (pdata->an_result == XGBE_AN_COMPLETE);
}
static void xgbe_check_link_timeout(struct xgbe_prv_data *pdata)
{
unsigned long link_timeout;
link_timeout = pdata->link_check + (XGBE_LINK_TIMEOUT * hz);
if ((int)(ticks - link_timeout) >= 0) {
xgbe_phy_config_aneg(pdata);
}
}
static void xgbe_phy_status_force(struct xgbe_prv_data *pdata)
{
if (xgbe_in_kr_mode(pdata)) {
pdata->phy.speed = SPEED_10000;
} else {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.speed = SPEED_1000;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.speed = SPEED_2500;
break;
}
}
pdata->phy.duplex = DUPLEX_FULL;
}
static void xgbe_phy_status_aneg(struct xgbe_prv_data *pdata)
{
unsigned int ad_reg, lp_reg;
pdata->phy.lp_advertising = 0;
if ((pdata->phy.autoneg != AUTONEG_ENABLE) || pdata->parallel_detect)
return xgbe_phy_status_force(pdata);
pdata->phy.lp_advertising |= ADVERTISED_Autoneg;
pdata->phy.lp_advertising |= ADVERTISED_Backplane;
/* Compare Advertisement and Link Partner register 1 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA);
if (lp_reg & 0x400)
pdata->phy.lp_advertising |= ADVERTISED_Pause;
if (lp_reg & 0x800)
pdata->phy.lp_advertising |= ADVERTISED_Asym_Pause;
if (pdata->phy.pause_autoneg) {
/* Set flow control based on auto-negotiation result */
pdata->phy.tx_pause = 0;
pdata->phy.rx_pause = 0;
if (ad_reg & lp_reg & 0x400) {
pdata->phy.tx_pause = 1;
pdata->phy.rx_pause = 1;
} else if (ad_reg & lp_reg & 0x800) {
if (ad_reg & 0x400)
pdata->phy.rx_pause = 1;
else if (lp_reg & 0x400)
pdata->phy.tx_pause = 1;
}
}
/* Compare Advertisement and Link Partner register 2 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 1);
if (lp_reg & 0x80)
pdata->phy.lp_advertising |= ADVERTISED_10000baseKR_Full;
if (lp_reg & 0x20) {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.lp_advertising |= ADVERTISED_1000baseKX_Full;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.lp_advertising |= ADVERTISED_2500baseX_Full;
break;
}
}
ad_reg &= lp_reg;
if (ad_reg & 0x80) {
pdata->phy.speed = SPEED_10000;
xgbe_set_mode(pdata, XGBE_MODE_KR);
} else if (ad_reg & 0x20) {
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.speed = SPEED_1000;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.speed = SPEED_2500;
break;
}
xgbe_set_mode(pdata, XGBE_MODE_KX);
} else {
pdata->phy.speed = SPEED_UNKNOWN;
}
/* Compare Advertisement and Link Partner register 3 */
ad_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
lp_reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_LPA + 2);
}
static void xgbe_phy_status(struct xgbe_prv_data *pdata)
{
unsigned int reg, link_aneg;
if (test_bit(XGBE_LINK_ERR, &pdata->dev_state)) {
pdata->phy.link = 0;
goto adjust_link;
}
link_aneg = (pdata->phy.autoneg == AUTONEG_ENABLE);
/* Get the link status. Link status is latched low, so read
* once to clear and then read again to get current state
*/
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1);
pdata->phy.link = (reg & MDIO_STAT1_LSTATUS) ? 1 : 0;
if (pdata->phy.link) {
if (link_aneg && !xgbe_phy_aneg_done(pdata)) {
xgbe_check_link_timeout(pdata);
return;
}
xgbe_phy_status_aneg(pdata);
if (test_bit(XGBE_LINK_INIT, &pdata->dev_state))
clear_bit(XGBE_LINK_INIT, &pdata->dev_state);
} else {
if (test_bit(XGBE_LINK_INIT, &pdata->dev_state)) {
xgbe_check_link_timeout(pdata);
if (link_aneg)
return;
}
xgbe_phy_status_aneg(pdata);
}
adjust_link:
xgbe_phy_adjust_link(pdata);
}
static void xgbe_phy_stop(struct xgbe_prv_data *pdata)
{
/* Disable auto-negotiation */
xgbe_disable_an(pdata);
/* Disable auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
bus_teardown_intr(pdata->dev, pdata->an_irq_res, pdata->an_irq_tag);
pdata->phy.link = 0;
xgbe_phy_adjust_link(pdata);
}
static int xgbe_phy_start(struct xgbe_prv_data *pdata)
{
int ret;
ret = bus_setup_intr(pdata->dev, pdata->an_irq_res,
INTR_MPSAFE | INTR_TYPE_NET, NULL, xgbe_an_isr, pdata,
&pdata->an_irq_tag);
if (ret) {
return -ret;
}
/* Set initial mode - call the mode setting routines
directly to ensure we are properly configured
*/
if (xgbe_use_xgmii_mode(pdata)) {
xgbe_xgmii_mode(pdata);
} else if (xgbe_use_gmii_mode(pdata)) {
xgbe_gmii_mode(pdata);
} else if (xgbe_use_gmii_2500_mode(pdata)) {
xgbe_gmii_2500_mode(pdata);
} else {
ret = -EINVAL;
goto err_irq;
}
/* Set up advertisement registers based on current settings */
xgbe_an_init(pdata);
/* Enable auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0x07);
return xgbe_phy_config_aneg(pdata);
err_irq:
bus_teardown_intr(pdata->dev, pdata->an_irq_res, pdata->an_irq_tag);
return ret;
}
static int xgbe_phy_reset(struct xgbe_prv_data *pdata)
{
unsigned int count, reg;
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
reg |= MDIO_CTRL1_RESET;
XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, reg);
count = 50;
do {
DELAY(20);
reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1);
} while ((reg & MDIO_CTRL1_RESET) && --count);
if (reg & MDIO_CTRL1_RESET)
return -ETIMEDOUT;
/* Disable auto-negotiation for now */
xgbe_disable_an(pdata);
/* Clear auto-negotiation interrupts */
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
return 0;
}
static void xgbe_phy_init(struct xgbe_prv_data *pdata)
{
sx_init(&pdata->an_mutex, "axgbe AN lock");
pdata->mdio_mmd = MDIO_MMD_PCS;
/* Initialize supported features */
pdata->phy.supported = SUPPORTED_Autoneg;
pdata->phy.supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
pdata->phy.supported |= SUPPORTED_Backplane;
pdata->phy.supported |= SUPPORTED_10000baseKR_Full;
switch (pdata->speed_set) {
case XGBE_SPEEDSET_1000_10000:
pdata->phy.supported |= SUPPORTED_1000baseKX_Full;
break;
case XGBE_SPEEDSET_2500_10000:
pdata->phy.supported |= SUPPORTED_2500baseX_Full;
break;
}
pdata->fec_ability = XMDIO_READ(pdata, MDIO_MMD_PMAPMD,
MDIO_PMA_10GBR_FECABLE);
pdata->fec_ability &= (MDIO_PMA_10GBR_FECABLE_ABLE |
MDIO_PMA_10GBR_FECABLE_ERRABLE);
if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
pdata->phy.supported |= SUPPORTED_10000baseR_FEC;
pdata->phy.advertising = pdata->phy.supported;
pdata->phy.address = 0;
pdata->phy.autoneg = AUTONEG_ENABLE;
pdata->phy.speed = SPEED_UNKNOWN;
pdata->phy.duplex = DUPLEX_UNKNOWN;
pdata->phy.link = 0;
pdata->phy.pause_autoneg = pdata->pause_autoneg;
pdata->phy.tx_pause = pdata->tx_pause;
pdata->phy.rx_pause = pdata->rx_pause;
/* Fix up Flow Control advertising */
pdata->phy.advertising &= ~ADVERTISED_Pause;
pdata->phy.advertising &= ~ADVERTISED_Asym_Pause;
if (pdata->rx_pause) {
pdata->phy.advertising |= ADVERTISED_Pause;
pdata->phy.advertising |= ADVERTISED_Asym_Pause;
}
if (pdata->tx_pause)
pdata->phy.advertising ^= ADVERTISED_Asym_Pause;
}
void xgbe_init_function_ptrs_phy(struct xgbe_phy_if *phy_if)
{
phy_if->phy_init = xgbe_phy_init;
phy_if->phy_reset = xgbe_phy_reset;
phy_if->phy_start = xgbe_phy_start;
phy_if->phy_stop = xgbe_phy_stop;
phy_if->phy_status = xgbe_phy_status;
phy_if->phy_config_aneg = xgbe_phy_config_aneg;
}
Index: head/sys/dev/e1000/igb_txrx.c
===================================================================
--- head/sys/dev/e1000/igb_txrx.c (revision 327172)
+++ head/sys/dev/e1000/igb_txrx.c (revision 327173)
@@ -1,584 +1,584 @@
/*-
* Copyright (c) 2016 Matthew Macy <mmacy@mattmacy.io>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#include "if_em.h"
#ifdef RSS
#include <net/rss_config.h>
#include <netinet/in_rss.h>
#endif
#ifdef VERBOSE_DEBUG
#define DPRINTF device_printf
#else
#define DPRINTF(...)
#endif
/*********************************************************************
* Local Function prototypes
*********************************************************************/
static int igb_isc_txd_encap(void *arg, if_pkt_info_t pi);
static void igb_isc_txd_flush(void *arg, uint16_t txqid, qidx_t pidx);
static int igb_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear);
static void igb_isc_rxd_refill(void *arg, if_rxd_update_t iru);
static void igb_isc_rxd_flush(void *arg, uint16_t rxqid, uint8_t flid __unused, qidx_t pidx);
static int igb_isc_rxd_available(void *arg, uint16_t rxqid, qidx_t idx, qidx_t budget);
static int igb_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri);
static int igb_tx_ctx_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status);
static int igb_tso_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status);
static void igb_rx_checksum(u32 staterr, if_rxd_info_t ri, u32 ptype);
static int igb_determine_rsstype(u16 pkt_info);
extern void igb_if_enable_intr(if_ctx_t ctx);
extern int em_intr(void *arg);
struct if_txrx igb_txrx = {
igb_isc_txd_encap,
igb_isc_txd_flush,
igb_isc_txd_credits_update,
igb_isc_rxd_available,
igb_isc_rxd_pkt_get,
igb_isc_rxd_refill,
igb_isc_rxd_flush,
em_intr
};
extern if_shared_ctx_t em_sctx;
/**********************************************************************
*
* Setup work for hardware segmentation offload (TSO) on
* adapters using advanced tx descriptors
*
**********************************************************************/
static int
igb_tso_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status)
{
struct e1000_adv_tx_context_desc *TXD;
struct adapter *adapter = txr->adapter;
u32 type_tucmd_mlhl = 0, vlan_macip_lens = 0;
u32 mss_l4len_idx = 0;
u32 paylen;
switch(pi->ipi_etype) {
case ETHERTYPE_IPV6:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
break;
case ETHERTYPE_IP:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
/* Tell transmit desc to also do IPv4 checksum. */
*olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
break;
default:
panic("%s: CSUM_TSO but no supported IP version (0x%04x)",
__func__, ntohs(pi->ipi_etype));
break;
}
TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[pi->ipi_pidx];
/* This is used in the transmit desc in encap */
paylen = pi->ipi_len - pi->ipi_ehdrlen - pi->ipi_ip_hlen - pi->ipi_tcp_hlen;
/* VLAN MACLEN IPLEN */
if (pi->ipi_mflags & M_VLANTAG) {
vlan_macip_lens |= (pi->ipi_vtag << E1000_ADVTXD_VLAN_SHIFT);
}
vlan_macip_lens |= pi->ipi_ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= pi->ipi_ip_hlen;
TXD->vlan_macip_lens = htole32(vlan_macip_lens);
/* ADV DTYPE TUCMD */
type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
/* MSS L4LEN IDX */
mss_l4len_idx |= (pi->ipi_tso_segsz << E1000_ADVTXD_MSS_SHIFT);
mss_l4len_idx |= (pi->ipi_tcp_hlen << E1000_ADVTXD_L4LEN_SHIFT);
/* 82575 needs the queue index added */
if (adapter->hw.mac.type == e1000_82575)
mss_l4len_idx |= txr->me << 4;
TXD->mss_l4len_idx = htole32(mss_l4len_idx);
TXD->seqnum_seed = htole32(0);
*cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
*olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
*olinfo_status |= paylen << E1000_ADVTXD_PAYLEN_SHIFT;
return (1);
}
/*********************************************************************
*
* Advanced Context Descriptor setup for VLAN, CSUM or TSO
*
**********************************************************************/
static int
igb_tx_ctx_setup(struct tx_ring *txr, if_pkt_info_t pi, u32 *cmd_type_len, u32 *olinfo_status)
{
struct e1000_adv_tx_context_desc *TXD;
struct adapter *adapter = txr->adapter;
u32 vlan_macip_lens, type_tucmd_mlhl;
u32 mss_l4len_idx;
mss_l4len_idx = vlan_macip_lens = type_tucmd_mlhl = 0;
int offload = TRUE;
/* First check if TSO is to be used */
if (pi->ipi_csum_flags & CSUM_TSO)
return (igb_tso_setup(txr, pi, cmd_type_len, olinfo_status));
/* Indicate the whole packet as payload when not doing TSO */
*olinfo_status |= pi->ipi_len << E1000_ADVTXD_PAYLEN_SHIFT;
/* Now ready a context descriptor */
TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[pi->ipi_pidx];
/*
** In advanced descriptors the vlan tag must
** be placed into the context descriptor. Hence
** we need to make one even if not doing offloads.
*/
if (pi->ipi_mflags & M_VLANTAG) {
vlan_macip_lens |= (pi->ipi_vtag << E1000_ADVTXD_VLAN_SHIFT);
} else if ((pi->ipi_csum_flags & IGB_CSUM_OFFLOAD) == 0) {
return (0);
}
/* Set the ether header length */
vlan_macip_lens |= pi->ipi_ehdrlen << E1000_ADVTXD_MACLEN_SHIFT;
switch(pi->ipi_etype) {
case ETHERTYPE_IP:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
break;
case ETHERTYPE_IPV6:
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
break;
default:
offload = FALSE;
break;
}
vlan_macip_lens |= pi->ipi_ip_hlen;
type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
switch (pi->ipi_ipproto) {
case IPPROTO_TCP:
if (pi->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
break;
case IPPROTO_UDP:
if (pi->ipi_csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
break;
case IPPROTO_SCTP:
if (pi->ipi_csum_flags & (CSUM_IP_SCTP | CSUM_IP6_SCTP))
type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
default:
offload = FALSE;
break;
}
if (offload) /* For the TX descriptor setup */
*olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
/* 82575 needs the queue index added */
if (adapter->hw.mac.type == e1000_82575)
mss_l4len_idx = txr->me << 4;
/* Now copy bits into descriptor */
TXD->vlan_macip_lens = htole32(vlan_macip_lens);
TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
TXD->seqnum_seed = htole32(0);
TXD->mss_l4len_idx = htole32(mss_l4len_idx);
return (1);
}
static int
igb_isc_txd_encap(void *arg, if_pkt_info_t pi)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
struct em_tx_queue *que = &sc->tx_queues[pi->ipi_qsidx];
struct tx_ring *txr = &que->txr;
int nsegs = pi->ipi_nsegs;
bus_dma_segment_t *segs = pi->ipi_segs;
union e1000_adv_tx_desc *txd = NULL;
- int i, j, first, pidx_last;
+ int i, j, pidx_last;
u32 olinfo_status, cmd_type_len, txd_flags;
qidx_t ntxd;
pidx_last = olinfo_status = 0;
/* Basic descriptor defines */
cmd_type_len = (E1000_ADVTXD_DTYP_DATA |
E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT);
if (pi->ipi_mflags & M_VLANTAG)
cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
- first = i = pi->ipi_pidx;
+ i = pi->ipi_pidx;
ntxd = scctx->isc_ntxd[0];
txd_flags = pi->ipi_flags & IPI_TX_INTR ? E1000_ADVTXD_DCMD_RS : 0;
/* Consume the first descriptor */
i += igb_tx_ctx_setup(txr, pi, &cmd_type_len, &olinfo_status);
if (i == scctx->isc_ntxd[0])
i = 0;
/* 82575 needs the queue index added */
if (sc->hw.mac.type == e1000_82575)
olinfo_status |= txr->me << 4;
for (j = 0; j < nsegs; j++) {
bus_size_t seglen;
bus_addr_t segaddr;
txd = (union e1000_adv_tx_desc *)&txr->tx_base[i];
seglen = segs[j].ds_len;
segaddr = htole64(segs[j].ds_addr);
txd->read.buffer_addr = segaddr;
txd->read.cmd_type_len = htole32(E1000_TXD_CMD_IFCS |
cmd_type_len | seglen);
txd->read.olinfo_status = htole32(olinfo_status);
pidx_last = i;
if (++i == scctx->isc_ntxd[0]) {
i = 0;
}
}
if (txd_flags) {
txr->tx_rsq[txr->tx_rs_pidx] = pidx_last;
txr->tx_rs_pidx = (txr->tx_rs_pidx+1) & (ntxd-1);
MPASS(txr->tx_rs_pidx != txr->tx_rs_cidx);
}
txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | txd_flags);
pi->ipi_new_pidx = i;
return (0);
}
static void
igb_isc_txd_flush(void *arg, uint16_t txqid, qidx_t pidx)
{
struct adapter *adapter = arg;
struct em_tx_queue *que = &adapter->tx_queues[txqid];
struct tx_ring *txr = &que->txr;
E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), pidx);
}
static int
igb_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear)
{
struct adapter *adapter = arg;
if_softc_ctx_t scctx = adapter->shared;
struct em_tx_queue *que = &adapter->tx_queues[txqid];
struct tx_ring *txr = &que->txr;
qidx_t processed = 0;
int updated;
qidx_t cur, prev, ntxd, rs_cidx;
int32_t delta;
uint8_t status;
rs_cidx = txr->tx_rs_cidx;
if (rs_cidx == txr->tx_rs_pidx)
return (0);
cur = txr->tx_rsq[rs_cidx];
status = ((union e1000_adv_tx_desc *)&txr->tx_base[cur])->wb.status;
updated = !!(status & E1000_TXD_STAT_DD);
if (!clear || !updated)
return (updated);
prev = txr->tx_cidx_processed;
ntxd = scctx->isc_ntxd[0];
do {
delta = (int32_t)cur - (int32_t)prev;
MPASS(prev == 0 || delta != 0);
if (delta < 0)
delta += ntxd;
processed += delta;
prev = cur;
rs_cidx = (rs_cidx + 1) & (ntxd-1);
if (rs_cidx == txr->tx_rs_pidx)
break;
cur = txr->tx_rsq[rs_cidx];
status = ((union e1000_adv_tx_desc *)&txr->tx_base[cur])->wb.status;
} while ((status & E1000_TXD_STAT_DD));
txr->tx_rs_cidx = rs_cidx;
txr->tx_cidx_processed = prev;
return (processed);
}
static void
igb_isc_rxd_refill(void *arg, if_rxd_update_t iru)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
uint16_t rxqid = iru->iru_qsidx;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
union e1000_adv_rx_desc *rxd;
struct rx_ring *rxr = &que->rxr;
uint64_t *paddrs;
uint32_t next_pidx, pidx;
uint16_t count;
int i;
paddrs = iru->iru_paddrs;
pidx = iru->iru_pidx;
count = iru->iru_count;
for (i = 0, next_pidx = pidx; i < count; i++) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[next_pidx];
rxd->read.pkt_addr = htole64(paddrs[i]);
if (++next_pidx == scctx->isc_nrxd[0])
next_pidx = 0;
}
}
static void
igb_isc_rxd_flush(void *arg, uint16_t rxqid, uint8_t flid __unused, qidx_t pidx)
{
struct adapter *sc = arg;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
struct rx_ring *rxr = &que->rxr;
E1000_WRITE_REG(&sc->hw, E1000_RDT(rxr->me), pidx);
}
static int
igb_isc_rxd_available(void *arg, uint16_t rxqid, qidx_t idx, qidx_t budget)
{
struct adapter *sc = arg;
if_softc_ctx_t scctx = sc->shared;
struct em_rx_queue *que = &sc->rx_queues[rxqid];
struct rx_ring *rxr = &que->rxr;
union e1000_adv_rx_desc *rxd;
u32 staterr = 0;
int cnt, i, iter;
if (budget == 1) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[idx];
staterr = le32toh(rxd->wb.upper.status_error);
return (staterr & E1000_RXD_STAT_DD);
}
for (iter = cnt = 0, i = idx; iter < scctx->isc_nrxd[0] && iter <= budget;) {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[i];
staterr = le32toh(rxd->wb.upper.status_error);
if ((staterr & E1000_RXD_STAT_DD) == 0)
break;
if (++i == scctx->isc_nrxd[0]) {
i = 0;
}
if (staterr & E1000_RXD_STAT_EOP)
cnt++;
iter++;
}
return (cnt);
}
/****************************************************************
* This routine sends data that has been DMA'd into host memory
* to the upper layer and initializes the ri structure.
*
* Returns 0 upon success, errno on failure
***************************************************************/
static int
igb_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
{
struct adapter *adapter = arg;
if_softc_ctx_t scctx = adapter->shared;
struct em_rx_queue *que = &adapter->rx_queues[ri->iri_qsidx];
struct rx_ring *rxr = &que->rxr;
struct ifnet *ifp = iflib_get_ifp(adapter->ctx);
union e1000_adv_rx_desc *rxd;
u16 pkt_info, len;
u16 vtag = 0;
u32 ptype;
u32 staterr = 0;
bool eop;
int i = 0;
int cidx = ri->iri_cidx;
do {
rxd = (union e1000_adv_rx_desc *)&rxr->rx_base[cidx];
staterr = le32toh(rxd->wb.upper.status_error);
pkt_info = le16toh(rxd->wb.lower.lo_dword.hs_rss.pkt_info);
MPASS ((staterr & E1000_RXD_STAT_DD) != 0);
len = le16toh(rxd->wb.upper.length);
ptype = le32toh(rxd->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK;
ri->iri_len += len;
rxr->rx_bytes += ri->iri_len;
rxd->wb.upper.status_error = 0;
eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP);
if (((adapter->hw.mac.type == e1000_i350) ||
(adapter->hw.mac.type == e1000_i354)) &&
(staterr & E1000_RXDEXT_STATERR_LB))
vtag = be16toh(rxd->wb.upper.vlan);
else
vtag = le16toh(rxd->wb.upper.vlan);
/* Make sure bad packets are discarded */
if (eop && ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0)) {
adapter->dropped_pkts++;
++rxr->rx_discarded;
return (EBADMSG);
}
ri->iri_frags[i].irf_flid = 0;
ri->iri_frags[i].irf_idx = cidx;
ri->iri_frags[i].irf_len = len;
if (++cidx == scctx->isc_nrxd[0])
cidx = 0;
#ifdef notyet
if (rxr->hdr_split == TRUE) {
ri->iri_frags[i].irf_flid = 1;
ri->iri_frags[i].irf_idx = cidx;
if (++cidx == scctx->isc_nrxd[0])
cidx = 0;
}
#endif
i++;
} while (!eop);
rxr->rx_packets++;
if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
igb_rx_checksum(staterr, ri, ptype);
if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
(staterr & E1000_RXD_STAT_VP) != 0) {
ri->iri_vtag = vtag;
ri->iri_flags |= M_VLANTAG;
}
ri->iri_flowid =
le32toh(rxd->wb.lower.hi_dword.rss);
ri->iri_rsstype = igb_determine_rsstype(pkt_info);
ri->iri_nfrags = i;
return (0);
}
/*********************************************************************
*
* Verify that the hardware indicated that the checksum is valid.
* Inform the stack about the status of the checksum so that the
* stack doesn't spend time verifying it.
*
*********************************************************************/
static void
igb_rx_checksum(u32 staterr, if_rxd_info_t ri, u32 ptype)
{
u16 status = (u16)staterr;
u8 errors = (u8) (staterr >> 24);
bool sctp = FALSE;
/* Ignore Checksum bit is set */
if (status & E1000_RXD_STAT_IXSM) {
ri->iri_csum_flags = 0;
return;
}
if ((ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 &&
(ptype & E1000_RXDADV_PKTTYPE_SCTP) != 0)
sctp = 1;
else
sctp = 0;
if (status & E1000_RXD_STAT_IPCS) {
/* Did it pass? */
if (!(errors & E1000_RXD_ERR_IPE)) {
/* IP Checksum Good */
ri->iri_csum_flags = CSUM_IP_CHECKED;
ri->iri_csum_flags |= CSUM_IP_VALID;
} else
ri->iri_csum_flags = 0;
}
if (status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) {
u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
if (sctp) /* reassign */
type = CSUM_SCTP_VALID;
/* Did it pass? */
if (!(errors & E1000_RXD_ERR_TCPE)) {
ri->iri_csum_flags |= type;
if (sctp == 0)
ri->iri_csum_data = htons(0xffff);
}
}
return;
}
/********************************************************************
*
* Parse the packet type to determine the appropriate hash
*
******************************************************************/
static int
igb_determine_rsstype(u16 pkt_info)
{
switch (pkt_info & E1000_RXDADV_RSSTYPE_MASK) {
case E1000_RXDADV_RSSTYPE_IPV4_TCP:
return M_HASHTYPE_RSS_TCP_IPV4;
case E1000_RXDADV_RSSTYPE_IPV4:
return M_HASHTYPE_RSS_IPV4;
case E1000_RXDADV_RSSTYPE_IPV6_TCP:
return M_HASHTYPE_RSS_TCP_IPV6;
case E1000_RXDADV_RSSTYPE_IPV6_EX:
return M_HASHTYPE_RSS_IPV6_EX;
case E1000_RXDADV_RSSTYPE_IPV6:
return M_HASHTYPE_RSS_IPV6;
case E1000_RXDADV_RSSTYPE_IPV6_TCP_EX:
return M_HASHTYPE_RSS_TCP_IPV6_EX;
default:
return M_HASHTYPE_OPAQUE;
}
}
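One detail worth noting in igb_isc_txd_credits_update() above is how it counts completed descriptors across a ring wrap: it takes the signed difference between the previous and current report-status indices and corrects a negative result by adding the ring size. A small stand-alone sketch of that arithmetic (assuming, as the driver does with its (ntxd - 1) masking, that the ring size is a power of two) could be:
#include <stdint.h>
static inline int32_t
ring_delta(uint16_t prev, uint16_t cur, uint16_t ntxd)
{
	int32_t delta = (int32_t)cur - (int32_t)prev;
	if (delta < 0)		/* the current index wrapped past the end of the ring */
		delta += ntxd;
	return (delta);
}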
Index: head/sys/dev/extres/clk/clk_bus.c
===================================================================
--- head/sys/dev/extres/clk/clk_bus.c (revision 327172)
+++ head/sys/dev/extres/clk/clk_bus.c (revision 327173)
@@ -1,93 +1,91 @@
/*-
* Copyright 2016 Michal Meloun <mmel@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <dev/fdt/simplebus.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus_subr.h>
struct ofw_clkbus_softc {
struct simplebus_softc simplebus_sc;
};
static int
ofw_clkbus_probe(device_t dev)
{
const char *name;
name = ofw_bus_get_name(dev);
if (name == NULL || strcmp(name, "clocks") != 0)
return (ENXIO);
device_set_desc(dev, "OFW clocks bus");
return (BUS_PROBE_GENERIC);
}
static int
ofw_clkbus_attach(device_t dev)
{
- struct ofw_clkbus_softc *sc;
phandle_t node, child;
device_t cdev;
- sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
simplebus_init(dev, node);
for (child = OF_child(node); child > 0; child = OF_peer(child)) {
cdev = simplebus_add_device(dev, child, 0, NULL, -1, NULL);
if (cdev != NULL)
device_probe_and_attach(cdev);
}
return (bus_generic_attach(dev));
}
static device_method_t ofw_clkbus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ofw_clkbus_probe),
DEVMETHOD(device_attach, ofw_clkbus_attach),
DEVMETHOD_END
};
DEFINE_CLASS_1(ofw_clkbus, ofw_clkbus_driver, ofw_clkbus_methods,
sizeof(struct ofw_clkbus_softc), simplebus_driver);
static devclass_t ofw_clkbus_devclass;
EARLY_DRIVER_MODULE(ofw_clkbus, simplebus, ofw_clkbus_driver,
ofw_clkbus_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
MODULE_VERSION(ofw_clkbus, 1);
Index: head/sys/dev/extres/regulator/regulator_bus.c
===================================================================
--- head/sys/dev/extres/regulator/regulator_bus.c (revision 327172)
+++ head/sys/dev/extres/regulator/regulator_bus.c (revision 327173)
@@ -1,89 +1,87 @@
/*-
* Copyright 2016 Michal Meloun <mmel@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <dev/fdt/simplebus.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus_subr.h>
struct ofw_regulator_bus_softc {
struct simplebus_softc simplebus_sc;
};
static int
ofw_regulator_bus_probe(device_t dev)
{
const char *name;
name = ofw_bus_get_name(dev);
if (name == NULL || strcmp(name, "regulators") != 0)
return (ENXIO);
device_set_desc(dev, "OFW regulators bus");
return (0);
}
static int
ofw_regulator_bus_attach(device_t dev)
{
- struct ofw_regulator_bus_softc *sc;
phandle_t node, child;
- sc = device_get_softc(dev);
node = ofw_bus_get_node(dev);
simplebus_init(dev, node);
for (child = OF_child(node); child > 0; child = OF_peer(child)) {
simplebus_add_device(dev, child, 0, NULL, -1, NULL);
}
return (bus_generic_attach(dev));
}
static device_method_t ofw_regulator_bus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ofw_regulator_bus_probe),
DEVMETHOD(device_attach, ofw_regulator_bus_attach),
DEVMETHOD_END
};
DEFINE_CLASS_1(ofw_regulator_bus, ofw_regulator_bus_driver,
ofw_regulator_bus_methods, sizeof(struct ofw_regulator_bus_softc),
simplebus_driver);
static devclass_t ofw_regulator_bus_devclass;
EARLY_DRIVER_MODULE(ofw_regulator_bus, simplebus, ofw_regulator_bus_driver,
ofw_regulator_bus_devclass, 0, 0, BUS_PASS_BUS + BUS_PASS_ORDER_MIDDLE);
MODULE_VERSION(ofw_regulator_bus, 1);
Index: head/sys/dev/fdt/fdt_common.c
===================================================================
--- head/sys/dev/fdt/fdt_common.c (revision 327172)
+++ head/sys/dev/fdt/fdt_common.c (revision 327173)
@@ -1,745 +1,737 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009-2014 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Andrew Turner under sponsorship from
* the FreeBSD Foundation.
* This software was developed by Semihalf under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/limits.h>
#include <sys/sysctl.h>
#include <machine/resource.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/openfirm.h>
#include "ofw_bus_if.h"
#ifdef DEBUG
#define debugf(fmt, args...) do { printf("%s(): ", __func__); \
printf(fmt,##args); } while (0)
#else
#define debugf(fmt, args...)
#endif
#define FDT_COMPAT_LEN 255
#define FDT_TYPE_LEN 64
#define FDT_REG_CELLS 4
#define FDT_RANGES_SIZE 48
SYSCTL_NODE(_hw, OID_AUTO, fdt, CTLFLAG_RD, 0, "Flattened Device Tree");
vm_paddr_t fdt_immr_pa;
vm_offset_t fdt_immr_va;
vm_offset_t fdt_immr_size;
struct fdt_ic_list fdt_ic_list_head = SLIST_HEAD_INITIALIZER(fdt_ic_list_head);
static int fdt_is_compatible(phandle_t, const char *);
static int
fdt_get_range_by_busaddr(phandle_t node, u_long addr, u_long *base,
u_long *size)
{
pcell_t ranges[32], *rangesptr;
pcell_t addr_cells, size_cells, par_addr_cells;
u_long bus_addr, par_bus_addr, pbase, psize;
int err, i, len, tuple_size, tuples;
if (node == 0) {
*base = 0;
*size = ULONG_MAX;
return (0);
}
if ((fdt_addrsize_cells(node, &addr_cells, &size_cells)) != 0)
return (ENXIO);
/*
* Process 'ranges' property.
*/
par_addr_cells = fdt_parent_addr_cells(node);
if (par_addr_cells > 2) {
return (ERANGE);
}
len = OF_getproplen(node, "ranges");
if (len < 0)
return (-1);
if (len > sizeof(ranges))
return (ENOMEM);
if (len == 0) {
return (fdt_get_range_by_busaddr(OF_parent(node), addr,
base, size));
}
if (OF_getprop(node, "ranges", ranges, sizeof(ranges)) <= 0)
return (EINVAL);
tuple_size = addr_cells + par_addr_cells + size_cells;
tuples = len / (tuple_size * sizeof(cell_t));
if (par_addr_cells > 2 || addr_cells > 2 || size_cells > 2)
return (ERANGE);
*base = 0;
*size = 0;
for (i = 0; i < tuples; i++) {
rangesptr = &ranges[i * tuple_size];
bus_addr = fdt_data_get((void *)rangesptr, addr_cells);
if (bus_addr != addr)
continue;
rangesptr += addr_cells;
par_bus_addr = fdt_data_get((void *)rangesptr, par_addr_cells);
rangesptr += par_addr_cells;
err = fdt_get_range_by_busaddr(OF_parent(node), par_bus_addr,
&pbase, &psize);
if (err > 0)
return (err);
if (err == 0)
*base = pbase;
else
*base = par_bus_addr;
*size = fdt_data_get((void *)rangesptr, size_cells);
return (0);
}
return (EINVAL);
}
int
fdt_get_range(phandle_t node, int range_id, u_long *base, u_long *size)
{
pcell_t ranges[FDT_RANGES_SIZE], *rangesptr;
pcell_t addr_cells, size_cells, par_addr_cells;
u_long par_bus_addr, pbase, psize;
- int err, len, tuple_size, tuples;
+ int err, len;
if ((fdt_addrsize_cells(node, &addr_cells, &size_cells)) != 0)
return (ENXIO);
/*
* Process 'ranges' property.
*/
par_addr_cells = fdt_parent_addr_cells(node);
if (par_addr_cells > 2)
return (ERANGE);
len = OF_getproplen(node, "ranges");
if (len > sizeof(ranges))
return (ENOMEM);
if (len == 0) {
*base = 0;
*size = ULONG_MAX;
return (0);
}
if (!(range_id < len))
return (ERANGE);
if (OF_getprop(node, "ranges", ranges, sizeof(ranges)) <= 0)
return (EINVAL);
- tuple_size = sizeof(pcell_t) * (addr_cells + par_addr_cells +
- size_cells);
- tuples = len / tuple_size;
-
if (par_addr_cells > 2 || addr_cells > 2 || size_cells > 2)
return (ERANGE);
*base = 0;
*size = 0;
rangesptr = &ranges[range_id];
*base = fdt_data_get((void *)rangesptr, addr_cells);
rangesptr += addr_cells;
par_bus_addr = fdt_data_get((void *)rangesptr, par_addr_cells);
rangesptr += par_addr_cells;
err = fdt_get_range_by_busaddr(OF_parent(node), par_bus_addr,
&pbase, &psize);
if (err == 0)
*base += pbase;
else
*base += par_bus_addr;
*size = fdt_data_get((void *)rangesptr, size_cells);
return (0);
}
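/*
 * A minimal standalone sketch (not part of this file) of what a "ranges"
 * tuple expresses: <child-bus-address parent-bus-address size>, so the
 * translation fdt_get_range() above performs is an offset into the parent
 * window.  The cell widths (1/1/1) and all values below are hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	/* ranges = <0x00000000 0xf1000000 0x00100000>; host byte order. */
	const unsigned long child_base = 0x00000000;
	const unsigned long parent_base = 0xf1000000;
	const unsigned long size = 0x00100000;
	unsigned long child_addr = 0x72000;	/* some child bus address */

	if (child_addr - child_base < size)
		printf("child 0x%lx -> parent 0x%lx\n", child_addr,
		    parent_base + (child_addr - child_base));
	return (0);
}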
int
fdt_immr_addr(vm_offset_t immr_va)
{
phandle_t node;
u_long base, size;
int r;
/*
* Try to access the SOC node directly i.e. through /aliases/.
*/
if ((node = OF_finddevice("soc")) != 0)
if (fdt_is_compatible(node, "simple-bus"))
goto moveon;
/*
* Find the node the long way.
*/
if ((node = OF_finddevice("/")) == 0)
return (ENXIO);
if ((node = fdt_find_compatible(node, "simple-bus", 0)) == 0)
return (ENXIO);
moveon:
if ((r = fdt_get_range(node, 0, &base, &size)) == 0) {
fdt_immr_pa = base;
fdt_immr_va = immr_va;
fdt_immr_size = size;
}
return (r);
}
/*
* This routine is an early-usage version of ofw_bus_is_compatible(), for
* cases when the ofw_bus I/F is not available (like early console routines).
* Note the buffer has to be on the stack since malloc() is usually not
* available in such cases either.
*/
static int
fdt_is_compatible(phandle_t node, const char *compatstr)
{
char buf[FDT_COMPAT_LEN];
char *compat;
int len, onelen, l, rv;
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
compat = (char *)&buf;
bzero(compat, FDT_COMPAT_LEN);
if (OF_getprop(node, "compatible", compat, FDT_COMPAT_LEN) < 0)
return (0);
onelen = strlen(compatstr);
rv = 0;
while (len > 0) {
if (strncasecmp(compat, compatstr, onelen) == 0) {
/* Found it. */
rv = 1;
break;
}
/* Slide to the next sub-string. */
l = strlen(compat) + 1;
compat += l;
len -= l;
}
return (rv);
}
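/*
 * A minimal standalone sketch (not part of this file), assuming a
 * NUL-separated "compatible" list as described above, of how the
 * sub-string walk in fdt_is_compatible() finds a match.  The sample
 * property contents are hypothetical and the comparison is simplified
 * to an exact (case-insensitive) match.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>

static int
compat_list_matches(const char *buf, int len, const char *compatstr)
{
	int l;

	while (len > 0) {
		if (strcasecmp(buf, compatstr) == 0)
			return (1);
		l = strlen(buf) + 1;	/* skip this sub-string and its NUL */
		buf += l;
		len -= l;
	}
	return (0);
}

int
main(void)
{
	/* Two entries: "vendor,soc-uart" followed by "ns16550a". */
	const char compat[] = "vendor,soc-uart\0ns16550a";

	printf("%d\n", compat_list_matches(compat, sizeof(compat), "ns16550a"));
	return (0);
}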
int
fdt_is_compatible_strict(phandle_t node, const char *compatible)
{
char compat[FDT_COMPAT_LEN];
if (OF_getproplen(node, "compatible") <= 0)
return (0);
if (OF_getprop(node, "compatible", compat, FDT_COMPAT_LEN) < 0)
return (0);
if (strncasecmp(compat, compatible, FDT_COMPAT_LEN) == 0)
/* This fits. */
return (1);
return (0);
}
phandle_t
fdt_find_compatible(phandle_t start, const char *compat, int strict)
{
phandle_t child;
/*
* Traverse all children of 'start' node, and find first with
* matching 'compatible' property.
*/
for (child = OF_child(start); child != 0; child = OF_peer(child))
if (fdt_is_compatible(child, compat)) {
if (strict)
if (!fdt_is_compatible_strict(child, compat))
continue;
return (child);
}
return (0);
}
phandle_t
fdt_depth_search_compatible(phandle_t start, const char *compat, int strict)
{
phandle_t child, node;
/*
* Depth-search all descendants of 'start' node, and find first with
* matching 'compatible' property.
*/
for (node = OF_child(start); node != 0; node = OF_peer(node)) {
if (fdt_is_compatible(node, compat) &&
(strict == 0 || fdt_is_compatible_strict(node, compat))) {
return (node);
}
child = fdt_depth_search_compatible(node, compat, strict);
if (child != 0)
return (child);
}
return (0);
}
int
fdt_is_enabled(phandle_t node)
{
char *stat;
int ena, len;
len = OF_getprop_alloc(node, "status", sizeof(char),
(void **)&stat);
if (len <= 0)
/* It is OK if no 'status' property. */
return (1);
/* Anything other than 'okay' means disabled. */
ena = 0;
if (strncmp((char *)stat, "okay", len) == 0)
ena = 1;
OF_prop_free(stat);
return (ena);
}
int
fdt_is_type(phandle_t node, const char *typestr)
{
char type[FDT_TYPE_LEN];
if (OF_getproplen(node, "device_type") <= 0)
return (0);
if (OF_getprop(node, "device_type", type, FDT_TYPE_LEN) < 0)
return (0);
if (strncasecmp(type, typestr, FDT_TYPE_LEN) == 0)
/* This fits. */
return (1);
return (0);
}
int
fdt_parent_addr_cells(phandle_t node)
{
pcell_t addr_cells;
/* Find out #address-cells of the superior bus. */
if (OF_searchprop(OF_parent(node), "#address-cells", &addr_cells,
sizeof(addr_cells)) <= 0)
return (2);
return ((int)fdt32_to_cpu(addr_cells));
}
int
fdt_pm_is_enabled(phandle_t node)
{
int ret;
ret = 1;
#if defined(SOC_MV_KIRKWOOD) || defined(SOC_MV_DISCOVERY)
ret = fdt_pm(node);
#endif
return (ret);
}
u_long
fdt_data_get(void *data, int cells)
{
if (cells == 1)
return (fdt32_to_cpu(*((uint32_t *)data)));
return (fdt64_to_cpu(*((uint64_t *)data)));
}
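/*
 * A minimal standalone sketch (not part of this file): FDT cells are
 * stored big-endian, so fdt_data_get() above treats one cell as a 32-bit
 * big-endian word and two cells as a 64-bit big-endian word.  The sample
 * bytes below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
cells_to_u64(const uint8_t *p, int cells)
{
	uint64_t v;
	int i;

	v = 0;
	for (i = 0; i < cells * 4; i++)
		v = (v << 8) | p[i];	/* big-endian byte order */
	return (v);
}

int
main(void)
{
	/* A two-cell (#address-cells = 2) encoding of 0x0000000140000000. */
	const uint8_t reg[] = { 0x00, 0x00, 0x00, 0x01,
				0x40, 0x00, 0x00, 0x00 };

	printf("1 cell:  0x%llx\n", (unsigned long long)cells_to_u64(reg, 1));
	printf("2 cells: 0x%llx\n", (unsigned long long)cells_to_u64(reg, 2));
	return (0);
}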
int
fdt_addrsize_cells(phandle_t node, int *addr_cells, int *size_cells)
{
pcell_t cell;
int cell_size;
/*
* Retrieve #{address,size}-cells.
*/
cell_size = sizeof(cell);
if (OF_getencprop(node, "#address-cells", &cell, cell_size) < cell_size)
cell = 2;
*addr_cells = (int)cell;
if (OF_getencprop(node, "#size-cells", &cell, cell_size) < cell_size)
cell = 1;
*size_cells = (int)cell;
if (*addr_cells > 3 || *size_cells > 2)
return (ERANGE);
return (0);
}
int
fdt_data_to_res(pcell_t *data, int addr_cells, int size_cells, u_long *start,
u_long *count)
{
/* Address portion. */
if (addr_cells > 2)
return (ERANGE);
*start = fdt_data_get((void *)data, addr_cells);
data += addr_cells;
/* Size portion. */
if (size_cells > 2)
return (ERANGE);
*count = fdt_data_get((void *)data, size_cells);
return (0);
}
int
fdt_regsize(phandle_t node, u_long *base, u_long *size)
{
pcell_t reg[4];
int addr_cells, len, size_cells;
if (fdt_addrsize_cells(OF_parent(node), &addr_cells, &size_cells))
return (ENXIO);
if ((sizeof(pcell_t) * (addr_cells + size_cells)) > sizeof(reg))
return (ENOMEM);
len = OF_getprop(node, "reg", &reg, sizeof(reg));
if (len <= 0)
return (EINVAL);
*base = fdt_data_get(&reg[0], addr_cells);
*size = fdt_data_get(&reg[addr_cells], size_cells);
return (0);
}
int
fdt_reg_to_rl(phandle_t node, struct resource_list *rl)
{
u_long end, count, start;
pcell_t *reg, *regptr;
pcell_t addr_cells, size_cells;
int tuple_size, tuples;
int i, rv;
long busaddr, bussize;
if (fdt_addrsize_cells(OF_parent(node), &addr_cells, &size_cells) != 0)
return (ENXIO);
if (fdt_get_range(OF_parent(node), 0, &busaddr, &bussize)) {
busaddr = 0;
bussize = 0;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
tuples = OF_getprop_alloc(node, "reg", tuple_size, (void **)&reg);
debugf("addr_cells = %d, size_cells = %d\n", addr_cells, size_cells);
debugf("tuples = %d, tuple size = %d\n", tuples, tuple_size);
if (tuples <= 0)
/* No 'reg' property in this node. */
return (0);
regptr = reg;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(reg, addr_cells, size_cells, &start,
&count);
if (rv != 0) {
resource_list_free(rl);
goto out;
}
reg += addr_cells + size_cells;
/* Calculate address range relative to base. */
start += busaddr;
end = start + count - 1;
debugf("reg addr start = %lx, end = %lx, count = %lx\n", start,
end, count);
resource_list_add(rl, SYS_RES_MEMORY, i, start, end,
count);
}
rv = 0;
out:
OF_prop_free(regptr);
return (rv);
}
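/*
 * A minimal standalone sketch (not part of this file) of the arithmetic
 * fdt_reg_to_rl() above performs: each "reg" tuple is rebased onto the
 * parent bus window returned by fdt_get_range() before it is added to
 * the resource list.  All numbers below are hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long busaddr = 0xf1000000;	/* from fdt_get_range() */
	const unsigned long start = 0x72000, count = 0x4000; /* one reg tuple */
	unsigned long abs_start, abs_end;

	abs_start = busaddr + start;
	abs_end = abs_start + count - 1;
	printf("SYS_RES_MEMORY 0x%lx-0x%lx (0x%lx bytes)\n",
	    abs_start, abs_end, count);
	return (0);
}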
int
fdt_get_phyaddr(phandle_t node, device_t dev, int *phy_addr, void **phy_sc)
{
phandle_t phy_node;
pcell_t phy_handle, phy_reg;
uint32_t i;
device_t parent, child;
if (OF_getencprop(node, "phy-handle", (void *)&phy_handle,
sizeof(phy_handle)) <= 0)
return (ENXIO);
phy_node = OF_node_from_xref(phy_handle);
if (OF_getencprop(phy_node, "reg", (void *)&phy_reg,
sizeof(phy_reg)) <= 0)
return (ENXIO);
*phy_addr = phy_reg;
/*
* Search for softc used to communicate with phy.
*/
/*
* Step 1: Search for ancestor of the phy-node with a "phy-handle"
* property set.
*/
phy_node = OF_parent(phy_node);
while (phy_node != 0) {
if (OF_getprop(phy_node, "phy-handle", (void *)&phy_handle,
sizeof(phy_handle)) > 0)
break;
phy_node = OF_parent(phy_node);
}
if (phy_node == 0)
return (ENXIO);
/*
* Step 2: For each device with the same parent and name as ours,
* compare its node with the one found in step 1, the ancestor of the
* PHY node (stored in phy_node).
*/
parent = device_get_parent(dev);
i = 0;
child = device_find_child(parent, device_get_name(dev), i);
while (child != NULL) {
if (ofw_bus_get_node(child) == phy_node)
break;
i++;
child = device_find_child(parent, device_get_name(dev), i);
}
if (child == NULL)
return (ENXIO);
/*
* Use softc of the device found.
*/
*phy_sc = (void *)device_get_softc(child);
return (0);
}
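/*
 * A minimal standalone sketch (not part of this file) of the "walk up
 * until an ancestor carries the property" step used by fdt_get_phyaddr()
 * above, modelled on a hypothetical parent-linked node structure; the
 * node names and the has_phy_handle flag stand in for the OF_parent()
 * and OF_getprop() lookups.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	const char	*name;
	struct node	*parent;
	bool		 has_phy_handle;
};

static struct node *
find_ancestor_with_phy_handle(struct node *n)
{
	for (n = n->parent; n != NULL; n = n->parent)
		if (n->has_phy_handle)
			return (n);
	return (NULL);
}

int
main(void)
{
	struct node root = { "/", NULL, false };
	struct node eth = { "ethernet@f1072000", &root, true };
	struct node mdio = { "mdio", &eth, false };
	struct node phy = { "phy@0", &mdio, false };
	struct node *hit;

	hit = find_ancestor_with_phy_handle(&phy);
	printf("%s\n", hit != NULL ? hit->name : "not found");
	return (0);
}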
int
fdt_get_reserved_regions(struct mem_region *mr, int *mrcnt)
{
pcell_t reserve[FDT_REG_CELLS * FDT_MEM_REGIONS];
pcell_t *reservep;
phandle_t memory, root;
- uint32_t memory_size;
int addr_cells, size_cells;
- int i, max_size, res_len, rv, tuple_size, tuples;
+ int i, res_len, rv, tuple_size, tuples;
- max_size = sizeof(reserve);
root = OF_finddevice("/");
memory = OF_finddevice("/memory");
if (memory == -1) {
rv = ENXIO;
goto out;
}
if ((rv = fdt_addrsize_cells(OF_parent(memory), &addr_cells,
&size_cells)) != 0)
goto out;
if (addr_cells > 2) {
rv = ERANGE;
goto out;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
res_len = OF_getproplen(root, "memreserve");
if (res_len <= 0 || res_len > sizeof(reserve)) {
rv = ERANGE;
goto out;
}
if (OF_getprop(root, "memreserve", reserve, res_len) <= 0) {
rv = ENXIO;
goto out;
}
- memory_size = 0;
tuples = res_len / tuple_size;
reservep = (pcell_t *)&reserve;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(reservep, addr_cells, size_cells,
(u_long *)&mr[i].mr_start, (u_long *)&mr[i].mr_size);
if (rv != 0)
goto out;
reservep += addr_cells + size_cells;
}
*mrcnt = i;
rv = 0;
out:
return (rv);
}
int
fdt_get_mem_regions(struct mem_region *mr, int *mrcnt, uint64_t *memsize)
{
pcell_t reg[FDT_REG_CELLS * FDT_MEM_REGIONS];
pcell_t *regp;
phandle_t memory;
uint64_t memory_size;
int addr_cells, size_cells;
- int i, max_size, reg_len, rv, tuple_size, tuples;
+ int i, reg_len, rv, tuple_size, tuples;
- max_size = sizeof(reg);
memory = OF_finddevice("/memory");
if (memory == -1) {
rv = ENXIO;
goto out;
}
if ((rv = fdt_addrsize_cells(OF_parent(memory), &addr_cells,
&size_cells)) != 0)
goto out;
if (addr_cells > 2) {
rv = ERANGE;
goto out;
}
tuple_size = sizeof(pcell_t) * (addr_cells + size_cells);
reg_len = OF_getproplen(memory, "reg");
if (reg_len <= 0 || reg_len > sizeof(reg)) {
rv = ERANGE;
goto out;
}
if (OF_getprop(memory, "reg", reg, reg_len) <= 0) {
rv = ENXIO;
goto out;
}
memory_size = 0;
tuples = reg_len / tuple_size;
regp = (pcell_t *)&reg;
for (i = 0; i < tuples; i++) {
rv = fdt_data_to_res(regp, addr_cells, size_cells,
(u_long *)&mr[i].mr_start, (u_long *)&mr[i].mr_size);
if (rv != 0)
goto out;
regp += addr_cells + size_cells;
memory_size += mr[i].mr_size;
}
if (memory_size == 0) {
rv = ERANGE;
goto out;
}
*mrcnt = i;
if (memsize != NULL)
*memsize = memory_size;
rv = 0;
out:
return (rv);
}
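/*
 * A minimal standalone sketch (not part of this file) of how
 * fdt_get_mem_regions() above turns /memory "reg" tuples into mem_region
 * entries plus a total size.  The cell widths and bank layout below
 * (two hypothetical 512 MB banks) are assumptions for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct mem_region_sketch {
	unsigned long mr_start;
	unsigned long mr_size;
};

int
main(void)
{
	/* reg = <0x40000000 0x20000000 0x80000000 0x20000000>; host order. */
	const uint32_t reg[] =
	    { 0x40000000, 0x20000000, 0x80000000, 0x20000000 };
	const int addr_cells = 1, size_cells = 1;
	const int tuples =
	    sizeof(reg) / sizeof(reg[0]) / (addr_cells + size_cells);
	struct mem_region_sketch mr[8];
	unsigned long memory_size;
	int i;

	memory_size = 0;
	for (i = 0; i < tuples; i++) {
		mr[i].mr_start = reg[i * (addr_cells + size_cells)];
		mr[i].mr_size = reg[i * (addr_cells + size_cells) + addr_cells];
		memory_size += mr[i].mr_size;
	}
	printf("%d regions, total 0x%lx bytes\n", i, memory_size);
	return (0);
}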
int
fdt_get_unit(device_t dev)
{
const char * name;
name = ofw_bus_get_name(dev);
name = strchr(name, '@') + 1;
return (strtol(name, NULL, 0));
}
int
fdt_get_chosen_bootargs(char *bootargs, size_t max_size)
{
phandle_t chosen;
chosen = OF_finddevice("/chosen");
if (chosen == -1)
return (ENXIO);
if (OF_getprop(chosen, "bootargs", bootargs, max_size) == -1)
return (ENXIO);
return (0);
}
Index: head/sys/dev/mii/rdcphy.c
===================================================================
--- head/sys/dev/mii/rdcphy.c (revision 327172)
+++ head/sys/dev/mii/rdcphy.c (revision 327173)
@@ -1,236 +1,234 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010, Pyun YongHyeon <yongari@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Driver for the RDC Semiconductor R6040 10/100 PHY.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/bus.h>
#include <net/if.h>
#include <net/if_media.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include "miidevs.h"
#include <dev/mii/rdcphyreg.h>
#include "miibus_if.h"
static device_probe_t rdcphy_probe;
static device_attach_t rdcphy_attach;
struct rdcphy_softc {
struct mii_softc mii_sc;
int mii_link_tick;
#define RDCPHY_MANNEG_TICK 3
};
static device_method_t rdcphy_methods[] = {
/* device interface */
DEVMETHOD(device_probe, rdcphy_probe),
DEVMETHOD(device_attach, rdcphy_attach),
DEVMETHOD(device_detach, mii_phy_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD_END
};
static devclass_t rdcphy_devclass;
static driver_t rdcphy_driver = {
"rdcphy",
rdcphy_methods,
sizeof(struct rdcphy_softc)
};
DRIVER_MODULE(rdcphy, miibus, rdcphy_driver, rdcphy_devclass, 0, 0);
static int rdcphy_service(struct mii_softc *, struct mii_data *, int);
static void rdcphy_status(struct mii_softc *);
static const struct mii_phydesc rdcphys[] = {
MII_PHY_DESC(RDC, R6040),
MII_PHY_END
};
static const struct mii_phy_funcs rdcphy_funcs = {
rdcphy_service,
rdcphy_status,
mii_phy_reset
};
static int
rdcphy_probe(device_t dev)
{
return (mii_phy_dev_probe(dev, rdcphys, BUS_PROBE_DEFAULT));
}
static int
rdcphy_attach(device_t dev)
{
mii_phy_dev_attach(dev, MIIF_NOMANPAUSE, &rdcphy_funcs, 1);
return (0);
}
static int
rdcphy_service(struct mii_softc *sc, struct mii_data *mii, int cmd)
{
struct rdcphy_softc *rsc;
struct ifmedia_entry *ife;
rsc = (struct rdcphy_softc *)sc;
ife = mii->mii_media.ifm_cur;
switch (cmd) {
case MII_POLLSTAT:
break;
case MII_MEDIACHG:
mii_phy_setmedia(sc);
switch (IFM_SUBTYPE(ife->ifm_media)) {
case IFM_100_TX:
case IFM_10_T:
/*
* Report a fake link-loss event to the parent
* driver. This stops the parent driver's MAC
* and makes it possible to reconfigure the MAC
* after link establishment completes. Note that
* the parent MAC seems to require a restart
* whenever the underlying PHY configuration
* changes, even if the resolved speed/duplex
* did not change at all.
*/
mii->mii_media_status = 0;
mii->mii_media_active = IFM_ETHER | IFM_NONE;
rsc->mii_link_tick = RDCPHY_MANNEG_TICK;
/* Immediately report link down. */
mii_phy_update(sc, MII_MEDIACHG);
return (0);
default:
break;
}
break;
case MII_TICK:
if (mii_phy_tick(sc) == EJUSTRETURN)
return (0);
if (IFM_SUBTYPE(ife->ifm_media) != IFM_AUTO) {
/*
* It seems the PHY hardware does not correctly
* report link status changes when manual link
* configuration is in progress. It is also
* possible for the PHY to complete establishing
* a link within one second, such that mii(4)
* does not notice the link change. To work
* around the issue, emulate a lost-link event
* and wait for 3 seconds while manual link
* configuration is in progress; 3 seconds is
* long enough to absorb transient link flips.
*/
if (rsc->mii_link_tick > 0) {
rsc->mii_link_tick--;
return (0);
}
}
break;
}
/* Update the media status. */
PHY_STATUS(sc);
/* Callback if something changed. */
mii_phy_update(sc, cmd);
return (0);
}
static void
rdcphy_status(struct mii_softc *sc)
{
struct mii_data *mii;
- struct ifmedia_entry *ife;
int bmsr, bmcr, physts;
mii = sc->mii_pdata;
- ife = mii->mii_media.ifm_cur;
mii->mii_media_status = IFM_AVALID;
mii->mii_media_active = IFM_ETHER;
bmsr = PHY_READ(sc, MII_BMSR) | PHY_READ(sc, MII_BMSR);
physts = PHY_READ(sc, MII_RDCPHY_STATUS);
if ((physts & STATUS_LINK_UP) != 0)
mii->mii_media_status |= IFM_ACTIVE;
bmcr = PHY_READ(sc, MII_BMCR);
if ((bmcr & BMCR_ISO) != 0) {
mii->mii_media_active |= IFM_NONE;
mii->mii_media_status = 0;
return;
}
if ((bmcr & BMCR_LOOP) != 0)
mii->mii_media_active |= IFM_LOOP;
if ((bmcr & BMCR_AUTOEN) != 0) {
if ((bmsr & BMSR_ACOMP) == 0) {
/* Erg, still trying, I guess... */
mii->mii_media_active |= IFM_NONE;
return;
}
}
switch (physts & STATUS_SPEED_MASK) {
case STATUS_SPEED_100:
mii->mii_media_active |= IFM_100_TX;
break;
case STATUS_SPEED_10:
mii->mii_media_active |= IFM_10_T;
break;
default:
mii->mii_media_active |= IFM_NONE;
return;
}
if ((physts & STATUS_FULL_DUPLEX) != 0)
mii->mii_media_active |= IFM_FDX | mii_phy_flowstatus(sc);
else
mii->mii_media_active |= IFM_HDX;
}
Index: head/sys/dev/mmc/host/dwmmc.c
===================================================================
--- head/sys/dev/mmc/host/dwmmc.c (revision 327172)
+++ head/sys/dev/mmc/host/dwmmc.c (revision 327173)
@@ -1,1188 +1,1186 @@
/*-
* Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
* All rights reserved.
*
* This software was developed by SRI International and the University of
* Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
* ("CTSRD"), as part of the DARPA CRASH research programme.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Synopsys DesignWare Mobile Storage Host Controller
* Chapter 14, Altera Cyclone V Device Handbook (CV-5V2 2014.07.22)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/rman.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/intr.h>
#include <dev/mmc/host/dwmmc_reg.h>
#include <dev/mmc/host/dwmmc_var.h>
#include "opt_mmccam.h"
#include "mmcbr_if.h"
#define dprintf(x, arg...)
#define READ4(_sc, _reg) \
bus_read_4((_sc)->res[0], _reg)
#define WRITE4(_sc, _reg, _val) \
bus_write_4((_sc)->res[0], _reg, _val)
#define DIV_ROUND_UP(n, d) howmany(n, d)
#define DWMMC_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define DWMMC_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define DWMMC_LOCK_INIT(_sc) \
mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->dev), \
"dwmmc", MTX_DEF)
#define DWMMC_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx);
#define DWMMC_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED);
#define DWMMC_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
#define PENDING_CMD 0x01
#define PENDING_STOP 0x02
#define CARD_INIT_DONE 0x04
#define DWMMC_DATA_ERR_FLAGS (SDMMC_INTMASK_DRT | SDMMC_INTMASK_DCRC \
|SDMMC_INTMASK_HTO | SDMMC_INTMASK_SBE \
|SDMMC_INTMASK_EBE)
#define DWMMC_CMD_ERR_FLAGS (SDMMC_INTMASK_RTO | SDMMC_INTMASK_RCRC \
|SDMMC_INTMASK_RE)
#define DWMMC_ERR_FLAGS (DWMMC_DATA_ERR_FLAGS | DWMMC_CMD_ERR_FLAGS \
|SDMMC_INTMASK_HLE)
#define DES0_DIC (1 << 1)
#define DES0_LD (1 << 2)
#define DES0_FS (1 << 3)
#define DES0_CH (1 << 4)
#define DES0_ER (1 << 5)
#define DES0_CES (1 << 30)
#define DES0_OWN (1 << 31)
#define DES1_BS1_MASK 0xfff
#define DES1_BS1_SHIFT 0
struct idmac_desc {
uint32_t des0; /* control */
uint32_t des1; /* bufsize */
uint32_t des2; /* buf1 phys addr */
uint32_t des3; /* buf2 phys addr or next descr */
};
#define DESC_MAX 256
#define DESC_SIZE (sizeof(struct idmac_desc) * DESC_MAX)
#define DEF_MSIZE 0x2 /* Burst size of multiple transaction */
static void dwmmc_next_operation(struct dwmmc_softc *);
static int dwmmc_setup_bus(struct dwmmc_softc *, int);
static int dma_done(struct dwmmc_softc *, struct mmc_command *);
static int dma_stop(struct dwmmc_softc *);
static void pio_read(struct dwmmc_softc *, struct mmc_command *);
static void pio_write(struct dwmmc_softc *, struct mmc_command *);
static struct resource_spec dwmmc_spec[] = {
{ SYS_RES_MEMORY, 0, RF_ACTIVE },
{ SYS_RES_IRQ, 0, RF_ACTIVE },
{ -1, 0 }
};
#define HWTYPE_MASK (0x0000ffff)
#define HWFLAG_MASK (0xffff << 16)
static struct ofw_compat_data compat_data[] = {
{"altr,socfpga-dw-mshc", HWTYPE_ALTERA},
{"samsung,exynos5420-dw-mshc", HWTYPE_EXYNOS},
{"rockchip,rk2928-dw-mshc", HWTYPE_ROCKCHIP},
{NULL, HWTYPE_NONE},
};
static void
dwmmc_get1paddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
if (error != 0)
return;
*(bus_addr_t *)arg = segs[0].ds_addr;
}
static void
dwmmc_ring_setup(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct dwmmc_softc *sc;
int idx;
if (error != 0)
return;
sc = arg;
dprintf("nsegs %d seg0len %lu\n", nsegs, segs[0].ds_len);
for (idx = 0; idx < nsegs; idx++) {
sc->desc_ring[idx].des0 = (DES0_OWN | DES0_DIC | DES0_CH);
sc->desc_ring[idx].des1 = segs[idx].ds_len;
sc->desc_ring[idx].des2 = segs[idx].ds_addr;
if (idx == 0)
sc->desc_ring[idx].des0 |= DES0_FS;
if (idx == (nsegs - 1)) {
sc->desc_ring[idx].des0 &= ~(DES0_DIC | DES0_CH);
sc->desc_ring[idx].des0 |= DES0_LD;
}
}
}
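/*
 * A minimal standalone sketch (not part of this file) of the per-segment
 * control bits dwmmc_ring_setup() above programs: the first segment gets
 * FS, the last gets LD (and drops DIC/CH), and middle segments keep
 * OWN|DIC|CH.  The bit values mirror the DES0_* defines above; the
 * three-segment transfer is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_DES0_DIC	(1U << 1)
#define	SK_DES0_LD	(1U << 2)
#define	SK_DES0_FS	(1U << 3)
#define	SK_DES0_CH	(1U << 4)
#define	SK_DES0_OWN	(1U << 31)

int
main(void)
{
	uint32_t des0;
	int idx, nsegs = 3;

	for (idx = 0; idx < nsegs; idx++) {
		des0 = SK_DES0_OWN | SK_DES0_DIC | SK_DES0_CH;
		if (idx == 0)
			des0 |= SK_DES0_FS;
		if (idx == nsegs - 1) {
			des0 &= ~(SK_DES0_DIC | SK_DES0_CH);
			des0 |= SK_DES0_LD;
		}
		printf("seg %d des0 0x%08x\n", idx, (unsigned int)des0);
	}
	return (0);
}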
static int
dwmmc_ctrl_reset(struct dwmmc_softc *sc, int reset_bits)
{
int reg;
int i;
reg = READ4(sc, SDMMC_CTRL);
reg |= (reset_bits);
WRITE4(sc, SDMMC_CTRL, reg);
/* Wait for reset to complete */
for (i = 0; i < 100; i++) {
if (!(READ4(sc, SDMMC_CTRL) & reset_bits))
return (0);
DELAY(10);
}
device_printf(sc->dev, "Reset failed\n");
return (1);
}
static int
dma_setup(struct dwmmc_softc *sc)
{
int error;
int nidx;
int idx;
/*
* Set up TX descriptor ring, descriptors, and dma maps.
*/
error = bus_dma_tag_create(
bus_get_dma_tag(sc->dev), /* Parent tag. */
4096, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
DESC_SIZE, 1, /* maxsize, nsegments */
DESC_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->desc_tag);
if (error != 0) {
device_printf(sc->dev,
"could not create ring DMA tag.\n");
return (1);
}
error = bus_dmamem_alloc(sc->desc_tag, (void**)&sc->desc_ring,
BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO,
&sc->desc_map);
if (error != 0) {
device_printf(sc->dev,
"could not allocate descriptor ring.\n");
return (1);
}
error = bus_dmamap_load(sc->desc_tag, sc->desc_map,
sc->desc_ring, DESC_SIZE, dwmmc_get1paddr,
&sc->desc_ring_paddr, 0);
if (error != 0) {
device_printf(sc->dev,
"could not load descriptor ring map.\n");
return (1);
}
for (idx = 0; idx < sc->desc_count; idx++) {
sc->desc_ring[idx].des0 = DES0_CH;
sc->desc_ring[idx].des1 = 0;
nidx = (idx + 1) % sc->desc_count;
sc->desc_ring[idx].des3 = sc->desc_ring_paddr + \
(nidx * sizeof(struct idmac_desc));
}
error = bus_dma_tag_create(
bus_get_dma_tag(sc->dev), /* Parent tag. */
4096, 0, /* alignment, boundary */
BUS_SPACE_MAXADDR_32BIT, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sc->desc_count * MMC_SECTOR_SIZE, /* maxsize */
sc->desc_count, /* nsegments */
MMC_SECTOR_SIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->buf_tag);
if (error != 0) {
device_printf(sc->dev,
"could not create ring DMA tag.\n");
return (1);
}
error = bus_dmamap_create(sc->buf_tag, 0,
&sc->buf_map);
if (error != 0) {
device_printf(sc->dev,
"could not create TX buffer DMA map.\n");
return (1);
}
return (0);
}
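/*
 * A minimal standalone sketch (not part of this file) of the descriptor
 * chaining dma_setup() above builds: each des3 holds the bus address of
 * the next descriptor and the last entry wraps to the first, forming a
 * ring.  The base address and descriptor count below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_DESC_COUNT	4
#define	SKETCH_DESC_SIZE	16	/* sizeof(struct idmac_desc) */

int
main(void)
{
	const uint32_t ring_paddr = 0x80100000;	/* hypothetical DMA address */
	uint32_t des3[SKETCH_DESC_COUNT];
	int idx, nidx;

	for (idx = 0; idx < SKETCH_DESC_COUNT; idx++) {
		nidx = (idx + 1) % SKETCH_DESC_COUNT;
		des3[idx] = ring_paddr + nidx * SKETCH_DESC_SIZE;
		printf("desc %d -> next at 0x%08x\n", idx,
		    (unsigned int)des3[idx]);
	}
	return (0);
}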
static void
dwmmc_cmd_done(struct dwmmc_softc *sc)
{
struct mmc_command *cmd;
cmd = sc->curcmd;
if (cmd == NULL)
return;
if (cmd->flags & MMC_RSP_PRESENT) {
if (cmd->flags & MMC_RSP_136) {
cmd->resp[3] = READ4(sc, SDMMC_RESP0);
cmd->resp[2] = READ4(sc, SDMMC_RESP1);
cmd->resp[1] = READ4(sc, SDMMC_RESP2);
cmd->resp[0] = READ4(sc, SDMMC_RESP3);
} else {
cmd->resp[3] = 0;
cmd->resp[2] = 0;
cmd->resp[1] = 0;
cmd->resp[0] = READ4(sc, SDMMC_RESP0);
}
}
}
static void
dwmmc_tasklet(struct dwmmc_softc *sc)
{
struct mmc_command *cmd;
cmd = sc->curcmd;
if (cmd == NULL)
return;
if (!sc->cmd_done)
return;
if (cmd->error != MMC_ERR_NONE || !cmd->data) {
dwmmc_next_operation(sc);
} else if (cmd->data && sc->dto_rcvd) {
if ((cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK ||
cmd->opcode == MMC_READ_MULTIPLE_BLOCK) &&
sc->use_auto_stop) {
if (sc->acd_rcvd)
dwmmc_next_operation(sc);
} else {
dwmmc_next_operation(sc);
}
}
}
static void
dwmmc_intr(void *arg)
{
struct mmc_command *cmd;
struct dwmmc_softc *sc;
uint32_t reg;
sc = arg;
DWMMC_LOCK(sc);
cmd = sc->curcmd;
/* First handle SDMMC controller interrupts */
reg = READ4(sc, SDMMC_MINTSTS);
if (reg) {
dprintf("%s 0x%08x\n", __func__, reg);
if (reg & DWMMC_CMD_ERR_FLAGS) {
WRITE4(sc, SDMMC_RINTSTS, DWMMC_CMD_ERR_FLAGS);
dprintf("cmd err 0x%08x cmd 0x%08x\n",
reg, cmd->opcode);
cmd->error = MMC_ERR_TIMEOUT;
}
if (reg & DWMMC_DATA_ERR_FLAGS) {
WRITE4(sc, SDMMC_RINTSTS, DWMMC_DATA_ERR_FLAGS);
dprintf("data err 0x%08x cmd 0x%08x\n",
reg, cmd->opcode);
cmd->error = MMC_ERR_FAILED;
if (!sc->use_pio) {
dma_done(sc, cmd);
dma_stop(sc);
}
}
if (reg & SDMMC_INTMASK_CMD_DONE) {
dwmmc_cmd_done(sc);
sc->cmd_done = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_CMD_DONE);
}
if (reg & SDMMC_INTMASK_ACD) {
sc->acd_rcvd = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_ACD);
}
if (reg & SDMMC_INTMASK_DTO) {
sc->dto_rcvd = 1;
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_DTO);
}
if (reg & SDMMC_INTMASK_CD) {
/* XXX: Handle card detect */
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_CD);
}
}
if (sc->use_pio) {
if (reg & (SDMMC_INTMASK_RXDR|SDMMC_INTMASK_DTO)) {
pio_read(sc, cmd);
}
if (reg & (SDMMC_INTMASK_TXDR|SDMMC_INTMASK_DTO)) {
pio_write(sc, cmd);
}
} else {
/* Now handle DMA interrupts */
reg = READ4(sc, SDMMC_IDSTS);
if (reg) {
dprintf("dma intr 0x%08x\n", reg);
if (reg & (SDMMC_IDINTEN_TI | SDMMC_IDINTEN_RI)) {
WRITE4(sc, SDMMC_IDSTS, (SDMMC_IDINTEN_TI |
SDMMC_IDINTEN_RI));
WRITE4(sc, SDMMC_IDSTS, SDMMC_IDINTEN_NI);
dma_done(sc, cmd);
}
}
}
dwmmc_tasklet(sc);
DWMMC_UNLOCK(sc);
}
static int
parse_fdt(struct dwmmc_softc *sc)
{
pcell_t dts_value[3];
phandle_t node;
int len;
if ((node = ofw_bus_get_node(sc->dev)) == -1)
return (ENXIO);
/* fifo-depth */
if ((len = OF_getproplen(node, "fifo-depth")) > 0) {
OF_getencprop(node, "fifo-depth", dts_value, len);
sc->fifo_depth = dts_value[0];
}
/* num-slots */
sc->num_slots = 1;
if ((len = OF_getproplen(node, "num-slots")) > 0) {
OF_getencprop(node, "num-slots", dts_value, len);
sc->num_slots = dts_value[0];
}
/*
* We need some platform-specific code to know
* what clock is supplied to our device.
* For now, rely on the value specified in the FDT.
*/
if (sc->bus_hz == 0) {
if ((len = OF_getproplen(node, "bus-frequency")) <= 0)
return (ENXIO);
OF_getencprop(node, "bus-frequency", dts_value, len);
sc->bus_hz = dts_value[0];
}
/*
* Platform-specific stuff
* XXX: Move to separate file
*/
if ((sc->hwtype & HWTYPE_MASK) != HWTYPE_EXYNOS)
return (0);
if ((len = OF_getproplen(node, "samsung,dw-mshc-ciu-div")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-ciu-div", dts_value, len);
sc->sdr_timing = (dts_value[0] << SDMMC_CLKSEL_DIVIDER_SHIFT);
sc->ddr_timing = (dts_value[0] << SDMMC_CLKSEL_DIVIDER_SHIFT);
if ((len = OF_getproplen(node, "samsung,dw-mshc-sdr-timing")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-sdr-timing", dts_value, len);
sc->sdr_timing |= ((dts_value[0] << SDMMC_CLKSEL_SAMPLE_SHIFT) |
(dts_value[1] << SDMMC_CLKSEL_DRIVE_SHIFT));
if ((len = OF_getproplen(node, "samsung,dw-mshc-ddr-timing")) <= 0)
return (ENXIO);
OF_getencprop(node, "samsung,dw-mshc-ddr-timing", dts_value, len);
sc->ddr_timing |= ((dts_value[0] << SDMMC_CLKSEL_SAMPLE_SHIFT) |
(dts_value[1] << SDMMC_CLKSEL_DRIVE_SHIFT));
return (0);
}
static int
dwmmc_probe(device_t dev)
{
uintptr_t hwtype;
if (!ofw_bus_status_okay(dev))
return (ENXIO);
hwtype = ofw_bus_search_compatible(dev, compat_data)->ocd_data;
if (hwtype == HWTYPE_NONE)
return (ENXIO);
device_set_desc(dev, "Synopsys DesignWare Mobile "
"Storage Host Controller");
return (BUS_PROBE_DEFAULT);
}
int
dwmmc_attach(device_t dev)
{
struct dwmmc_softc *sc;
int error;
int slot;
sc = device_get_softc(dev);
sc->dev = dev;
if (sc->hwtype == HWTYPE_NONE) {
sc->hwtype =
ofw_bus_search_compatible(dev, compat_data)->ocd_data;
}
/* Why not use Auto Stop? It saves hundreds of interrupts per second. */
sc->use_auto_stop = 1;
error = parse_fdt(sc);
if (error != 0) {
device_printf(dev, "Can't get FDT property.\n");
return (ENXIO);
}
DWMMC_LOCK_INIT(sc);
if (bus_alloc_resources(dev, dwmmc_spec, sc->res)) {
device_printf(dev, "could not allocate resources\n");
return (ENXIO);
}
/* Setup interrupt handler. */
error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_NET | INTR_MPSAFE,
NULL, dwmmc_intr, sc, &sc->intr_cookie);
if (error != 0) {
device_printf(dev, "could not setup interrupt handler.\n");
return (ENXIO);
}
device_printf(dev, "Hardware version ID is %04x\n",
READ4(sc, SDMMC_VERID) & 0xffff);
if (sc->desc_count == 0)
sc->desc_count = DESC_MAX;
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_ROCKCHIP) {
sc->use_pio = 1;
sc->pwren_inverted = 1;
} else if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_EXYNOS) {
WRITE4(sc, EMMCP_MPSBEGIN0, 0);
WRITE4(sc, EMMCP_SEND0, 0);
WRITE4(sc, EMMCP_CTRL0, (MPSCTRL_SECURE_READ_BIT |
MPSCTRL_SECURE_WRITE_BIT |
MPSCTRL_NON_SECURE_READ_BIT |
MPSCTRL_NON_SECURE_WRITE_BIT |
MPSCTRL_VALID));
}
/* XXX: we support operation for slot index 0 only */
slot = 0;
if (sc->pwren_inverted) {
WRITE4(sc, SDMMC_PWREN, (0 << slot));
} else {
WRITE4(sc, SDMMC_PWREN, (1 << slot));
}
/* Reset all */
if (dwmmc_ctrl_reset(sc, (SDMMC_CTRL_RESET |
SDMMC_CTRL_FIFO_RESET |
SDMMC_CTRL_DMA_RESET)))
return (ENXIO);
dwmmc_setup_bus(sc, sc->host.f_min);
if (sc->fifo_depth == 0) {
sc->fifo_depth = 1 +
((READ4(sc, SDMMC_FIFOTH) >> SDMMC_FIFOTH_RXWMARK_S) & 0xfff);
device_printf(dev, "No fifo-depth, using FIFOTH %x\n",
sc->fifo_depth);
}
if (!sc->use_pio) {
if (dma_setup(sc))
return (ENXIO);
/* Install desc base */
WRITE4(sc, SDMMC_DBADDR, sc->desc_ring_paddr);
/* Enable DMA interrupts */
WRITE4(sc, SDMMC_IDSTS, SDMMC_IDINTEN_MASK);
WRITE4(sc, SDMMC_IDINTEN, (SDMMC_IDINTEN_NI |
SDMMC_IDINTEN_RI |
SDMMC_IDINTEN_TI));
}
/* Clear and disable interrupts for a while */
WRITE4(sc, SDMMC_RINTSTS, 0xffffffff);
WRITE4(sc, SDMMC_INTMASK, 0);
/* Maximum timeout */
WRITE4(sc, SDMMC_TMOUT, 0xffffffff);
/* Enable interrupts */
WRITE4(sc, SDMMC_RINTSTS, 0xffffffff);
WRITE4(sc, SDMMC_INTMASK, (SDMMC_INTMASK_CMD_DONE |
SDMMC_INTMASK_DTO |
SDMMC_INTMASK_ACD |
SDMMC_INTMASK_TXDR |
SDMMC_INTMASK_RXDR |
DWMMC_ERR_FLAGS |
SDMMC_INTMASK_CD));
WRITE4(sc, SDMMC_CTRL, SDMMC_CTRL_INT_ENABLE);
sc->host.f_min = 400000;
sc->host.f_max = min(200000000, sc->bus_hz);
sc->host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340;
sc->host.caps = MMC_CAP_4_BIT_DATA;
device_add_child(dev, "mmc", -1);
return (bus_generic_attach(dev));
}
static int
dwmmc_setup_bus(struct dwmmc_softc *sc, int freq)
{
int tout;
int div;
if (freq == 0) {
WRITE4(sc, SDMMC_CLKENA, 0);
WRITE4(sc, SDMMC_CMD, (SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START));
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed update clk\n");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
return (0);
}
WRITE4(sc, SDMMC_CLKENA, 0);
WRITE4(sc, SDMMC_CLKSRC, 0);
div = (sc->bus_hz != freq) ? DIV_ROUND_UP(sc->bus_hz, 2 * freq) : 0;
WRITE4(sc, SDMMC_CLKDIV, div);
WRITE4(sc, SDMMC_CMD, (SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START));
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed to update clk");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
WRITE4(sc, SDMMC_CLKENA, (SDMMC_CLKENA_CCLK_EN | SDMMC_CLKENA_LP));
WRITE4(sc, SDMMC_CMD, SDMMC_CMD_WAIT_PRVDATA |
SDMMC_CMD_UPD_CLK_ONLY | SDMMC_CMD_START);
tout = 1000;
do {
if (tout-- < 0) {
device_printf(sc->dev, "Failed to enable clk\n");
return (1);
}
} while (READ4(sc, SDMMC_CMD) & SDMMC_CMD_START);
return (0);
}
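/*
 * A minimal standalone sketch (not part of this file) of the divider
 * computation in dwmmc_setup_bus() above: the card clock ends up at
 * bus_hz / (2 * CLKDIV), so a hypothetical 50 MHz controller clock needs
 * CLKDIV = 63 to reach the 400 kHz identification frequency.
 */
#include <stdio.h>

#define	SK_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	const unsigned int bus_hz = 50000000;	/* hypothetical sc->bus_hz */
	const unsigned int freq = 400000;	/* requested card clock */
	unsigned int div;

	div = (bus_hz != freq) ? SK_DIV_ROUND_UP(bus_hz, 2 * freq) : 0;
	printf("CLKDIV %u -> card clock %u Hz\n", div,
	    div != 0 ? bus_hz / (2 * div) : bus_hz);
	return (0);
}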
static int
dwmmc_update_ios(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
struct mmc_ios *ios;
sc = device_get_softc(brdev);
ios = &sc->host.ios;
dprintf("Setting up clk %u bus_width %d\n",
ios->clock, ios->bus_width);
dwmmc_setup_bus(sc, ios->clock);
if (ios->bus_width == bus_width_8)
WRITE4(sc, SDMMC_CTYPE, SDMMC_CTYPE_8BIT);
else if (ios->bus_width == bus_width_4)
WRITE4(sc, SDMMC_CTYPE, SDMMC_CTYPE_4BIT);
else
WRITE4(sc, SDMMC_CTYPE, 0);
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_EXYNOS) {
/* XXX: take care of DDR vs. SDR use here */
WRITE4(sc, SDMMC_CLKSEL, sc->sdr_timing);
}
/*
* XXX: take care about DDR bit
*
* reg = READ4(sc, SDMMC_UHS_REG);
* reg |= (SDMMC_UHS_REG_DDR);
* WRITE4(sc, SDMMC_UHS_REG, reg);
*/
return (0);
}
static int
dma_done(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
data = cmd->data;
if (data->flags & MMC_DATA_WRITE)
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_POSTWRITE);
else
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_POSTREAD);
bus_dmamap_sync(sc->desc_tag, sc->desc_map,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->buf_tag, sc->buf_map);
return (0);
}
static int
dma_stop(struct dwmmc_softc *sc)
{
int reg;
reg = READ4(sc, SDMMC_CTRL);
reg &= ~(SDMMC_CTRL_USE_IDMAC);
reg |= (SDMMC_CTRL_DMA_RESET);
WRITE4(sc, SDMMC_CTRL, reg);
reg = READ4(sc, SDMMC_BMOD);
reg &= ~(SDMMC_BMOD_DE | SDMMC_BMOD_FB);
reg |= (SDMMC_BMOD_SWR);
WRITE4(sc, SDMMC_BMOD, reg);
return (0);
}
static int
dma_prepare(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
- int len;
int err;
int reg;
data = cmd->data;
- len = data->len;
reg = READ4(sc, SDMMC_INTMASK);
reg &= ~(SDMMC_INTMASK_TXDR | SDMMC_INTMASK_RXDR);
WRITE4(sc, SDMMC_INTMASK, reg);
err = bus_dmamap_load(sc->buf_tag, sc->buf_map,
data->data, data->len, dwmmc_ring_setup,
sc, BUS_DMA_NOWAIT);
if (err != 0)
panic("dmamap_load failed\n");
/* Ensure the device can see the desc */
bus_dmamap_sync(sc->desc_tag, sc->desc_map,
BUS_DMASYNC_PREWRITE);
if (data->flags & MMC_DATA_WRITE)
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_PREWRITE);
else
bus_dmamap_sync(sc->buf_tag, sc->buf_map,
BUS_DMASYNC_PREREAD);
reg = (DEF_MSIZE << SDMMC_FIFOTH_MSIZE_S);
reg |= ((sc->fifo_depth / 2) - 1) << SDMMC_FIFOTH_RXWMARK_S;
reg |= (sc->fifo_depth / 2) << SDMMC_FIFOTH_TXWMARK_S;
WRITE4(sc, SDMMC_FIFOTH, reg);
wmb();
reg = READ4(sc, SDMMC_CTRL);
reg |= (SDMMC_CTRL_USE_IDMAC | SDMMC_CTRL_DMA_ENABLE);
WRITE4(sc, SDMMC_CTRL, reg);
wmb();
reg = READ4(sc, SDMMC_BMOD);
reg |= (SDMMC_BMOD_DE | SDMMC_BMOD_FB);
WRITE4(sc, SDMMC_BMOD, reg);
/* Start */
WRITE4(sc, SDMMC_PLDMND, 1);
return (0);
}
static int
pio_prepare(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
int reg;
data = cmd->data;
data->xfer_len = 0;
reg = (DEF_MSIZE << SDMMC_FIFOTH_MSIZE_S);
reg |= ((sc->fifo_depth / 2) - 1) << SDMMC_FIFOTH_RXWMARK_S;
reg |= (sc->fifo_depth / 2) << SDMMC_FIFOTH_TXWMARK_S;
WRITE4(sc, SDMMC_FIFOTH, reg);
wmb();
return (0);
}
static void
pio_read(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t *p, status;
if (cmd == NULL || cmd->data == NULL)
return;
data = cmd->data;
if ((data->flags & MMC_DATA_READ) == 0)
return;
KASSERT((data->xfer_len & 3) == 0, ("xfer_len not aligned"));
p = (uint32_t *)data->data + (data->xfer_len >> 2);
while (data->xfer_len < data->len) {
status = READ4(sc, SDMMC_STATUS);
if (status & SDMMC_STATUS_FIFO_EMPTY)
break;
*p++ = READ4(sc, SDMMC_DATA);
data->xfer_len += 4;
}
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_RXDR);
}
static void
pio_write(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t *p, status;
if (cmd == NULL || cmd->data == NULL)
return;
data = cmd->data;
if ((data->flags & MMC_DATA_WRITE) == 0)
return;
KASSERT((data->xfer_len & 3) == 0, ("xfer_len not aligned"));
p = (uint32_t *)data->data + (data->xfer_len >> 2);
while (data->xfer_len < data->len) {
status = READ4(sc, SDMMC_STATUS);
if (status & SDMMC_STATUS_FIFO_FULL)
break;
WRITE4(sc, SDMMC_DATA, *p++);
data->xfer_len += 4;
}
WRITE4(sc, SDMMC_RINTSTS, SDMMC_INTMASK_TXDR);
}
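/*
 * A minimal standalone sketch (not part of this file) of the PIO pattern
 * used by pio_read()/pio_write() above: words move until the FIFO runs
 * dry (or full) and data->xfer_len keeps the position, so one transfer
 * may span several RXDR/TXDR interrupts.  The FIFO model below is
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	SK_FIFO_WORDS	4	/* words the "FIFO" hands out per interrupt */

static size_t
sketch_pio_read(uint8_t *dst, size_t len, size_t xfer_len, uint32_t word)
{
	size_t avail;

	/* Copy up to SK_FIFO_WORDS words, resuming at xfer_len. */
	for (avail = SK_FIFO_WORDS; avail > 0 && xfer_len < len; avail--) {
		memcpy(dst + xfer_len, &word, sizeof(word));
		xfer_len += sizeof(word);
	}
	return (xfer_len);
}

int
main(void)
{
	uint8_t buf[40];
	size_t xfer_len, ints;

	xfer_len = 0;
	for (ints = 0; xfer_len < sizeof(buf); ints++)
		xfer_len = sketch_pio_read(buf, sizeof(buf), xfer_len,
		    0xdeadbeef);
	printf("%zu bytes in %zu interrupts\n", xfer_len, ints);
	return (0);
}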
static void
dwmmc_start_cmd(struct dwmmc_softc *sc, struct mmc_command *cmd)
{
struct mmc_data *data;
uint32_t blksz;
uint32_t cmdr;
sc->curcmd = cmd;
data = cmd->data;
if ((sc->hwtype & HWTYPE_MASK) == HWTYPE_ROCKCHIP)
dwmmc_setup_bus(sc, sc->host.ios.clock);
/* XXX Upper layers don't always set this */
cmd->mrq = sc->req;
/* Begin setting up command register. */
cmdr = cmd->opcode;
dprintf("cmd->opcode 0x%08x\n", cmd->opcode);
if (cmd->opcode == MMC_STOP_TRANSMISSION ||
cmd->opcode == MMC_GO_IDLE_STATE ||
cmd->opcode == MMC_GO_INACTIVE_STATE)
cmdr |= SDMMC_CMD_STOP_ABORT;
else if (cmd->opcode != MMC_SEND_STATUS && data)
cmdr |= SDMMC_CMD_WAIT_PRVDATA;
/* Set up response handling. */
if (MMC_RSP(cmd->flags) != MMC_RSP_NONE) {
cmdr |= SDMMC_CMD_RESP_EXP;
if (cmd->flags & MMC_RSP_136)
cmdr |= SDMMC_CMD_RESP_LONG;
}
if (cmd->flags & MMC_RSP_CRC)
cmdr |= SDMMC_CMD_RESP_CRC;
/*
* XXX: Not all platforms want this.
*/
cmdr |= SDMMC_CMD_USE_HOLD_REG;
if ((sc->flags & CARD_INIT_DONE) == 0) {
sc->flags |= (CARD_INIT_DONE);
cmdr |= SDMMC_CMD_SEND_INIT;
}
if (data) {
if ((cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK ||
cmd->opcode == MMC_READ_MULTIPLE_BLOCK) &&
sc->use_auto_stop)
cmdr |= SDMMC_CMD_SEND_ASTOP;
cmdr |= SDMMC_CMD_DATA_EXP;
if (data->flags & MMC_DATA_STREAM)
cmdr |= SDMMC_CMD_MODE_STREAM;
if (data->flags & MMC_DATA_WRITE)
cmdr |= SDMMC_CMD_DATA_WRITE;
WRITE4(sc, SDMMC_TMOUT, 0xffffffff);
WRITE4(sc, SDMMC_BYTCNT, data->len);
blksz = (data->len < MMC_SECTOR_SIZE) ? \
data->len : MMC_SECTOR_SIZE;
WRITE4(sc, SDMMC_BLKSIZ, blksz);
if (sc->use_pio) {
pio_prepare(sc, cmd);
} else {
dma_prepare(sc, cmd);
}
wmb();
}
dprintf("cmdr 0x%08x\n", cmdr);
WRITE4(sc, SDMMC_CMDARG, cmd->arg);
wmb();
WRITE4(sc, SDMMC_CMD, cmdr | SDMMC_CMD_START);
};
static void
dwmmc_next_operation(struct dwmmc_softc *sc)
{
struct mmc_request *req;
req = sc->req;
if (req == NULL)
return;
sc->acd_rcvd = 0;
sc->dto_rcvd = 0;
sc->cmd_done = 0;
/*
* XXX: Wait while the card is still busy.
* We need this to prevent data timeouts,
* mostly caused by a multi-block write command
* followed by a single-block read.
*/
while (READ4(sc, SDMMC_STATUS) & (SDMMC_STATUS_DATA_BUSY))
continue;
if (sc->flags & PENDING_CMD) {
sc->flags &= ~PENDING_CMD;
dwmmc_start_cmd(sc, req->cmd);
return;
} else if (sc->flags & PENDING_STOP && !sc->use_auto_stop) {
sc->flags &= ~PENDING_STOP;
dwmmc_start_cmd(sc, req->stop);
return;
}
sc->req = NULL;
sc->curcmd = NULL;
req->done(req);
}
static int
dwmmc_request(device_t brdev, device_t reqdev, struct mmc_request *req)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
dprintf("%s\n", __func__);
DWMMC_LOCK(sc);
if (sc->req != NULL) {
DWMMC_UNLOCK(sc);
return (EBUSY);
}
sc->req = req;
sc->flags |= PENDING_CMD;
if (sc->req->stop)
sc->flags |= PENDING_STOP;
dwmmc_next_operation(sc);
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_get_ro(device_t brdev, device_t reqdev)
{
dprintf("%s\n", __func__);
return (0);
}
static int
dwmmc_acquire_host(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
DWMMC_LOCK(sc);
while (sc->bus_busy)
msleep(sc, &sc->sc_mtx, PZERO, "dwmmcah", hz / 5);
sc->bus_busy++;
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_release_host(device_t brdev, device_t reqdev)
{
struct dwmmc_softc *sc;
sc = device_get_softc(brdev);
DWMMC_LOCK(sc);
sc->bus_busy--;
wakeup(sc);
DWMMC_UNLOCK(sc);
return (0);
}
static int
dwmmc_read_ivar(device_t bus, device_t child, int which, uintptr_t *result)
{
struct dwmmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
*(int *)result = sc->host.ios.bus_mode;
break;
case MMCBR_IVAR_BUS_WIDTH:
*(int *)result = sc->host.ios.bus_width;
break;
case MMCBR_IVAR_CHIP_SELECT:
*(int *)result = sc->host.ios.chip_select;
break;
case MMCBR_IVAR_CLOCK:
*(int *)result = sc->host.ios.clock;
break;
case MMCBR_IVAR_F_MIN:
*(int *)result = sc->host.f_min;
break;
case MMCBR_IVAR_F_MAX:
*(int *)result = sc->host.f_max;
break;
case MMCBR_IVAR_HOST_OCR:
*(int *)result = sc->host.host_ocr;
break;
case MMCBR_IVAR_MODE:
*(int *)result = sc->host.mode;
break;
case MMCBR_IVAR_OCR:
*(int *)result = sc->host.ocr;
break;
case MMCBR_IVAR_POWER_MODE:
*(int *)result = sc->host.ios.power_mode;
break;
case MMCBR_IVAR_VDD:
*(int *)result = sc->host.ios.vdd;
break;
case MMCBR_IVAR_CAPS:
sc->host.caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA;
*(int *)result = sc->host.caps;
break;
case MMCBR_IVAR_MAX_DATA:
*(int *)result = sc->desc_count;
}
return (0);
}
static int
dwmmc_write_ivar(device_t bus, device_t child, int which, uintptr_t value)
{
struct dwmmc_softc *sc;
sc = device_get_softc(bus);
switch (which) {
default:
return (EINVAL);
case MMCBR_IVAR_BUS_MODE:
sc->host.ios.bus_mode = value;
break;
case MMCBR_IVAR_BUS_WIDTH:
sc->host.ios.bus_width = value;
break;
case MMCBR_IVAR_CHIP_SELECT:
sc->host.ios.chip_select = value;
break;
case MMCBR_IVAR_CLOCK:
sc->host.ios.clock = value;
break;
case MMCBR_IVAR_MODE:
sc->host.mode = value;
break;
case MMCBR_IVAR_OCR:
sc->host.ocr = value;
break;
case MMCBR_IVAR_POWER_MODE:
sc->host.ios.power_mode = value;
break;
case MMCBR_IVAR_VDD:
sc->host.ios.vdd = value;
break;
/* These are read-only */
case MMCBR_IVAR_CAPS:
case MMCBR_IVAR_HOST_OCR:
case MMCBR_IVAR_F_MIN:
case MMCBR_IVAR_F_MAX:
case MMCBR_IVAR_MAX_DATA:
return (EINVAL);
}
return (0);
}
static device_method_t dwmmc_methods[] = {
DEVMETHOD(device_probe, dwmmc_probe),
DEVMETHOD(device_attach, dwmmc_attach),
/* Bus interface */
DEVMETHOD(bus_read_ivar, dwmmc_read_ivar),
DEVMETHOD(bus_write_ivar, dwmmc_write_ivar),
/* mmcbr_if */
DEVMETHOD(mmcbr_update_ios, dwmmc_update_ios),
DEVMETHOD(mmcbr_request, dwmmc_request),
DEVMETHOD(mmcbr_get_ro, dwmmc_get_ro),
DEVMETHOD(mmcbr_acquire_host, dwmmc_acquire_host),
DEVMETHOD(mmcbr_release_host, dwmmc_release_host),
DEVMETHOD_END
};
driver_t dwmmc_driver = {
"dwmmc",
dwmmc_methods,
sizeof(struct dwmmc_softc),
};
static devclass_t dwmmc_devclass;
DRIVER_MODULE(dwmmc, simplebus, dwmmc_driver, dwmmc_devclass, NULL, NULL);
DRIVER_MODULE(dwmmc, ofwbus, dwmmc_driver, dwmmc_devclass, NULL, NULL);
#ifndef MMCCAM
MMC_DECLARE_BRIDGE(dwmmc);
#endif
Index: head/sys/dev/mmc/mmc.c
===================================================================
--- head/sys/dev/mmc/mmc.c (revision 327172)
+++ head/sys/dev/mmc/mmc.c (revision 327173)
@@ -1,2586 +1,2583 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Bernd Walter. All rights reserved.
* Copyright (c) 2006 M. Warner Losh. All rights reserved.
* Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Portions of this software may have been developed with reference to
* the SD Simplified Specification. The following disclaimer may apply:
*
* The following conditions apply to the release of the simplified
* specification ("Simplified Specification") by the SD Card Association and
* the SD Group. The Simplified Specification is a subset of the complete SD
* Specification which is owned by the SD Card Association and the SD
* Group. This Simplified Specification is provided on a non-confidential
* basis subject to the disclaimers below. Any implementation of the
* Simplified Specification may require a license from the SD Card
* Association, SD Group, SD-3C LLC or other third parties.
*
* Disclaimers:
*
* The information contained in the Simplified Specification is presented only
* as a standard specification for SD Cards and SD Host/Ancillary products and
* is provided "AS-IS" without any representations or warranties of any
* kind. No responsibility is assumed by the SD Group, SD-3C LLC or the SD
* Card Association for any damages, any infringements of patents or other
* right of the SD Group, SD-3C LLC, the SD Card Association or any third
* parties, which may result from its use. No license is granted by
* implication, estoppel or otherwise under any patent or other rights of the
* SD Group, SD-3C LLC, the SD Card Association or any third party. Nothing
* herein shall be construed as an obligation by the SD Group, the SD-3C LLC
* or the SD Card Association to disclose or distribute any technical
* information, know-how or other confidential information to any third party.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmc_private.h>
#include <dev/mmc/mmc_subr.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/mmc/mmcvar.h>
#include "mmcbr_if.h"
#include "mmcbus_if.h"
CTASSERT(bus_timing_max <= sizeof(uint32_t) * NBBY);
/*
* Per-card data
*/
struct mmc_ivars {
uint32_t raw_cid[4]; /* Raw bits of the CID */
uint32_t raw_csd[4]; /* Raw bits of the CSD */
uint32_t raw_scr[2]; /* Raw bits of the SCR */
uint8_t raw_ext_csd[MMC_EXTCSD_SIZE]; /* Raw bits of the EXT_CSD */
uint32_t raw_sd_status[16]; /* Raw bits of the SD_STATUS */
uint16_t rca;
u_char read_only; /* True when the device is read-only */
u_char high_cap; /* High Capacity device (block addressed) */
enum mmc_card_mode mode;
enum mmc_bus_width bus_width; /* Bus width to use */
struct mmc_cid cid; /* cid decoded */
struct mmc_csd csd; /* csd decoded */
struct mmc_scr scr; /* scr decoded */
struct mmc_sd_status sd_status; /* SD_STATUS decoded */
uint32_t sec_count; /* Card capacity in 512byte blocks */
uint32_t timings; /* Mask of bus timings supported */
uint32_t vccq_120; /* Mask of bus timings at VCCQ of 1.2 V */
uint32_t vccq_180; /* Mask of bus timings at VCCQ of 1.8 V */
uint32_t tran_speed; /* Max speed in normal mode */
uint32_t hs_tran_speed; /* Max speed in high speed mode */
uint32_t erase_sector; /* Card native erase sector size */
uint32_t cmd6_time; /* Generic switch timeout [us] */
uint32_t quirks; /* Quirks as per mmc_quirk->quirks */
char card_id_string[64];/* Formatted CID info (serial, MFG, etc) */
char card_sn_string[16];/* Formatted serial # for disk->d_ident */
};
#define CMD_RETRIES 3
static const struct mmc_quirk mmc_quirks[] = {
/*
* For some SanDisk iNAND devices, the CMD38 argument needs to be
* provided in EXT_CSD[113].
*/
{ 0x2, 0x100, "SEM02G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM04G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM08G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM16G", MMC_QUIRK_INAND_CMD38 },
{ 0x2, 0x100, "SEM32G", MMC_QUIRK_INAND_CMD38 },
/*
* Disable TRIM for Kingston eMMCs where a firmware bug can lead to
* unrecoverable data corruption.
*/
{ 0x70, MMC_QUIRK_OID_ANY, "V10008", MMC_QUIRK_BROKEN_TRIM },
{ 0x70, MMC_QUIRK_OID_ANY, "V10016", MMC_QUIRK_BROKEN_TRIM },
{ 0x0, 0x0, NULL, 0x0 }
};
static SYSCTL_NODE(_hw, OID_AUTO, mmc, CTLFLAG_RD, NULL, "mmc driver");
static int mmc_debug;
SYSCTL_INT(_hw_mmc, OID_AUTO, debug, CTLFLAG_RWTUN, &mmc_debug, 0,
"Debug level");
/* bus entry points */
static int mmc_acquire_bus(device_t busdev, device_t dev);
static int mmc_attach(device_t dev);
static int mmc_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen);
static int mmc_detach(device_t dev);
static int mmc_probe(device_t dev);
static int mmc_read_ivar(device_t bus, device_t child, int which,
uintptr_t *result);
static int mmc_release_bus(device_t busdev, device_t dev);
static int mmc_resume(device_t dev);
static void mmc_retune_pause(device_t busdev, device_t dev, bool retune);
static void mmc_retune_unpause(device_t busdev, device_t dev);
static int mmc_suspend(device_t dev);
static int mmc_wait_for_request(device_t busdev, device_t dev,
struct mmc_request *req);
static int mmc_write_ivar(device_t bus, device_t child, int which,
uintptr_t value);
#define MMC_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx)
#define MMC_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx)
#define MMC_LOCK_INIT(_sc) \
mtx_init(&(_sc)->sc_mtx, device_get_nameunit((_sc)->dev), \
"mmc", MTX_DEF)
#define MMC_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->sc_mtx);
#define MMC_ASSERT_LOCKED(_sc) mtx_assert(&(_sc)->sc_mtx, MA_OWNED);
#define MMC_ASSERT_UNLOCKED(_sc) mtx_assert(&(_sc)->sc_mtx, MA_NOTOWNED);
static int mmc_all_send_cid(struct mmc_softc *sc, uint32_t *rawcid);
static void mmc_app_decode_scr(uint32_t *raw_scr, struct mmc_scr *scr);
static void mmc_app_decode_sd_status(uint32_t *raw_sd_status,
struct mmc_sd_status *sd_status);
static int mmc_app_sd_status(struct mmc_softc *sc, uint16_t rca,
uint32_t *rawsdstatus);
static int mmc_app_send_scr(struct mmc_softc *sc, uint16_t rca,
uint32_t *rawscr);
static int mmc_calculate_clock(struct mmc_softc *sc);
static void mmc_decode_cid_mmc(uint32_t *raw_cid, struct mmc_cid *cid,
bool is_4_41p);
static void mmc_decode_cid_sd(uint32_t *raw_cid, struct mmc_cid *cid);
static void mmc_decode_csd_mmc(uint32_t *raw_csd, struct mmc_csd *csd);
static int mmc_decode_csd_sd(uint32_t *raw_csd, struct mmc_csd *csd);
static void mmc_delayed_attach(void *xsc);
static int mmc_delete_cards(struct mmc_softc *sc, bool final);
static void mmc_discover_cards(struct mmc_softc *sc);
static void mmc_format_card_id_string(struct mmc_ivars *ivar);
static void mmc_go_discovery(struct mmc_softc *sc);
static uint32_t mmc_get_bits(uint32_t *bits, int bit_len, int start,
int size);
static int mmc_highest_voltage(uint32_t ocr);
static bool mmc_host_timing(device_t dev, enum mmc_bus_timing timing);
static void mmc_idle_cards(struct mmc_softc *sc);
static void mmc_ms_delay(int ms);
static void mmc_log_card(device_t dev, struct mmc_ivars *ivar, int newcard);
static void mmc_power_down(struct mmc_softc *sc);
static void mmc_power_up(struct mmc_softc *sc);
static void mmc_rescan_cards(struct mmc_softc *sc);
static int mmc_retune(device_t busdev, device_t dev, bool reset);
static void mmc_scan(struct mmc_softc *sc);
static int mmc_sd_switch(struct mmc_softc *sc, uint8_t mode, uint8_t grp,
uint8_t value, uint8_t *res);
static int mmc_select_card(struct mmc_softc *sc, uint16_t rca);
static uint32_t mmc_select_vdd(struct mmc_softc *sc, uint32_t ocr);
static int mmc_send_app_op_cond(struct mmc_softc *sc, uint32_t ocr,
uint32_t *rocr);
static int mmc_send_csd(struct mmc_softc *sc, uint16_t rca, uint32_t *rawcsd);
static int mmc_send_if_cond(struct mmc_softc *sc, uint8_t vhs);
static int mmc_send_op_cond(struct mmc_softc *sc, uint32_t ocr,
uint32_t *rocr);
static int mmc_send_relative_addr(struct mmc_softc *sc, uint32_t *resp);
static int mmc_set_blocklen(struct mmc_softc *sc, uint32_t len);
static int mmc_set_card_bus_width(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_set_power_class(struct mmc_softc *sc, struct mmc_ivars *ivar);
static int mmc_set_relative_addr(struct mmc_softc *sc, uint16_t resp);
static int mmc_set_timing(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_set_vccq(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static int mmc_switch_to_hs200(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock);
static int mmc_switch_to_hs400(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t max_dtr, enum mmc_bus_timing max_timing);
static int mmc_test_bus_width(struct mmc_softc *sc);
static uint32_t mmc_timing_to_dtr(struct mmc_ivars *ivar,
enum mmc_bus_timing timing);
static const char *mmc_timing_to_string(enum mmc_bus_timing timing);
static void mmc_update_child_list(struct mmc_softc *sc);
static int mmc_wait_for_command(struct mmc_softc *sc, uint32_t opcode,
uint32_t arg, uint32_t flags, uint32_t *resp, int retries);
static int mmc_wait_for_req(struct mmc_softc *sc, struct mmc_request *req);
static void mmc_wakeup(struct mmc_request *req);
static void
mmc_ms_delay(int ms)
{
DELAY(1000 * ms); /* XXX BAD */
}
static int
mmc_probe(device_t dev)
{
device_set_desc(dev, "MMC/SD bus");
return (0);
}
static int
mmc_attach(device_t dev)
{
struct mmc_softc *sc;
sc = device_get_softc(dev);
sc->dev = dev;
MMC_LOCK_INIT(sc);
/* We'll probe and attach our children later, but before / mount */
sc->config_intrhook.ich_func = mmc_delayed_attach;
sc->config_intrhook.ich_arg = sc;
if (config_intrhook_establish(&sc->config_intrhook) != 0)
device_printf(dev, "config_intrhook_establish failed\n");
return (0);
}
static int
mmc_detach(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
int err;
err = mmc_delete_cards(sc, true);
if (err != 0)
return (err);
mmc_power_down(sc);
MMC_LOCK_DESTROY(sc);
return (0);
}
static int
mmc_suspend(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
int err;
err = bus_generic_suspend(dev);
if (err != 0)
return (err);
/*
* We power down with the bus acquired here, mainly so that no device
* is selected any longer and sc->last_rca gets set to 0. Otherwise,
* the deselect as part of the bus acquisition in mmc_scan() may fail
* during resume, as the bus isn't powered up again until later, in
* mmc_go_discovery().
*/
err = mmc_acquire_bus(dev, dev);
if (err != 0)
return (err);
mmc_power_down(sc);
err = mmc_release_bus(dev, dev);
return (err);
}
static int
mmc_resume(device_t dev)
{
struct mmc_softc *sc = device_get_softc(dev);
mmc_scan(sc);
return (bus_generic_resume(dev));
}
static int
mmc_acquire_bus(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err;
uint16_t rca;
enum mmc_bus_timing timing;
err = MMCBR_ACQUIRE_HOST(device_get_parent(busdev), busdev);
if (err)
return (err);
sc = device_get_softc(busdev);
MMC_LOCK(sc);
if (sc->owner)
panic("mmc: host bridge didn't serialize us.");
sc->owner = dev;
MMC_UNLOCK(sc);
if (busdev != dev) {
/*
* Keep track of the last rca that we've selected. If
* we're asked to do it again, don't. We never
* unselect unless the bus code itself wants the mmc
* bus, and constantly reselecting causes problems.
*/
ivar = device_get_ivars(dev);
rca = ivar->rca;
if (sc->last_rca != rca) {
if (mmc_select_card(sc, rca) != MMC_ERR_NONE) {
device_printf(busdev, "Card at relative "
"address %d failed to select\n", rca);
return (ENXIO);
}
sc->last_rca = rca;
timing = mmcbr_get_timing(busdev);
/*
* For eMMC modes, setting/updating bus width and VCCQ
* is only really necessary if there actually is more
* than one device on the bus, as generally that already
* had to be done by mmc_calculate_clock() or one of
* its callees. Moreover, setting the bus width anew
* can trigger re-tuning (via a CRC error on the next
* CMD), even when not switching between devices and the
* previously selected one is still tuned. Obviously,
* we need to re-tune the host controller if devices
* are actually switched, though.
*/
if (timing >= bus_timing_mmc_ddr52 &&
sc->child_count == 1)
return (0);
/* Prepare bus width for the new card. */
if (bootverbose || mmc_debug) {
device_printf(busdev,
"setting bus width to %d bits %s timing\n",
(ivar->bus_width == bus_width_4) ? 4 :
(ivar->bus_width == bus_width_8) ? 8 : 1,
mmc_timing_to_string(timing));
}
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(busdev, "Card at relative "
"address %d failed to set bus width\n",
rca);
return (ENXIO);
}
mmcbr_set_bus_width(busdev, ivar->bus_width);
mmcbr_update_ios(busdev);
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(busdev, "Failed to set VCCQ "
"for card at relative address %d\n", rca);
return (ENXIO);
}
if (timing >= bus_timing_mmc_hs200 &&
mmc_retune(busdev, dev, true) != 0) {
device_printf(busdev, "Card at relative "
"address %d failed to re-tune\n", rca);
return (ENXIO);
}
}
} else {
/*
* If there's a card selected, stand down.
*/
if (sc->last_rca != 0) {
if (mmc_select_card(sc, 0) != MMC_ERR_NONE)
return (ENXIO);
sc->last_rca = 0;
}
}
return (0);
}
static int
mmc_release_bus(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
int err;
sc = device_get_softc(busdev);
MMC_LOCK(sc);
if (!sc->owner)
panic("mmc: releasing unowned bus.");
if (sc->owner != dev)
panic("mmc: you don't own the bus. game over.");
MMC_UNLOCK(sc);
err = MMCBR_RELEASE_HOST(device_get_parent(busdev), busdev);
if (err)
return (err);
MMC_LOCK(sc);
sc->owner = NULL;
MMC_UNLOCK(sc);
return (0);
}
static uint32_t
mmc_select_vdd(struct mmc_softc *sc, uint32_t ocr)
{
return (ocr & MMC_OCR_VOLTAGE);
}
static int
mmc_highest_voltage(uint32_t ocr)
{
int i;
for (i = MMC_OCR_MAX_VOLTAGE_SHIFT;
i >= MMC_OCR_MIN_VOLTAGE_SHIFT; i--)
if (ocr & (1 << i))
return (i);
return (-1);
}
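/*
* Illustration (assuming the standard OCR bit layout): a supported
* window of 2.7-3.6 V corresponds to ocr = 0x00ff8000, whose highest
* set voltage bit is bit 23, so mmc_highest_voltage() returns 23;
* mmc_power_up() below feeds that value to mmcbr_set_vdd().
*/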
static void
mmc_wakeup(struct mmc_request *req)
{
struct mmc_softc *sc;
sc = (struct mmc_softc *)req->done_data;
MMC_LOCK(sc);
req->flags |= MMC_REQ_DONE;
MMC_UNLOCK(sc);
wakeup(req);
}
static int
mmc_wait_for_req(struct mmc_softc *sc, struct mmc_request *req)
{
req->done = mmc_wakeup;
req->done_data = sc;
if (__predict_false(mmc_debug > 1)) {
device_printf(sc->dev, "REQUEST: CMD%d arg %#x flags %#x",
req->cmd->opcode, req->cmd->arg, req->cmd->flags);
if (req->cmd->data) {
printf(" data %d\n", (int)req->cmd->data->len);
} else
printf("\n");
}
MMCBR_REQUEST(device_get_parent(sc->dev), sc->dev, req);
MMC_LOCK(sc);
while ((req->flags & MMC_REQ_DONE) == 0)
msleep(req, &sc->sc_mtx, 0, "mmcreq", 0);
MMC_UNLOCK(sc);
if (__predict_false(mmc_debug > 2 || (mmc_debug > 0 &&
req->cmd->error != MMC_ERR_NONE)))
device_printf(sc->dev, "CMD%d RESULT: %d\n",
req->cmd->opcode, req->cmd->error);
return (0);
}
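/*
* MMCBR_REQUEST() above only hands the request to the bridge; the
* bridge's completion path calls mmc_wakeup(), which sets
* MMC_REQ_DONE under the softc mutex and wakes the msleep(), so the
* request appears synchronous to the caller.
*/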
static int
mmc_wait_for_request(device_t busdev, device_t dev, struct mmc_request *req)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err, i;
enum mmc_retune_req retune_req;
sc = device_get_softc(busdev);
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
/*
* Unless no device is selected or re-tuning is already ongoing,
* execute re-tuning if a) the bridge is requesting to do so and
* re-tuning hasn't been otherwise paused, or b) if a child asked
* to be re-tuned prior to pausing (see also mmc_retune_pause()).
*/
if (__predict_false(sc->last_rca != 0 && sc->retune_ongoing == 0 &&
(((retune_req = mmcbr_get_retune_req(busdev)) != retune_req_none &&
sc->retune_paused == 0) || sc->retune_needed == 1))) {
if (__predict_false(mmc_debug > 1)) {
device_printf(busdev,
"Re-tuning with%s circuit reset required\n",
retune_req == retune_req_reset ? "" : "out");
}
if (device_get_parent(dev) == busdev)
ivar = device_get_ivars(dev);
else {
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (ivar->rca == sc->last_rca)
break;
}
if (ivar->rca != sc->last_rca)
return (EINVAL);
}
sc->retune_ongoing = 1;
err = mmc_retune(busdev, dev, retune_req == retune_req_reset);
sc->retune_ongoing = 0;
switch (err) {
case MMC_ERR_NONE:
case MMC_ERR_FAILED: /* Re-tune error but still might work */
break;
case MMC_ERR_BADCRC: /* Switch failure on HS400 recovery */
return (ENXIO);
case MMC_ERR_INVALID: /* Driver implementation b0rken */
default: /* Unknown error, should not happen */
return (EINVAL);
}
sc->retune_needed = 0;
}
return (mmc_wait_for_req(sc, req));
}
static int
mmc_wait_for_command(struct mmc_softc *sc, uint32_t opcode,
uint32_t arg, uint32_t flags, uint32_t *resp, int retries)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = opcode;
cmd.arg = arg;
cmd.flags = flags;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, retries);
if (err)
return (err);
if (resp) {
if (flags & MMC_RSP_136)
memcpy(resp, cmd.resp, 4 * sizeof(uint32_t));
else
*resp = cmd.resp[0];
}
return (0);
}
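/*
* This is the convenience wrapper for simple, data-less commands;
* mmc_select_card() below, for instance, issues MMC_SELECT_CARD
* through it with arg = rca << 16 and no response copy-out.
*/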
static void
mmc_idle_cards(struct mmc_softc *sc)
{
device_t dev;
struct mmc_command cmd;
dev = sc->dev;
mmcbr_set_chip_select(dev, cs_high);
mmcbr_update_ios(dev);
mmc_ms_delay(1);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_GO_IDLE_STATE;
cmd.arg = 0;
cmd.flags = MMC_RSP_NONE | MMC_CMD_BC;
cmd.data = NULL;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
mmc_ms_delay(1);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_update_ios(dev);
mmc_ms_delay(1);
}
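/*
* mmc_idle_cards() issues CMD0 (GO_IDLE_STATE) with chip select
* driven high; for SD cards this presumably also avoids latching
* SPI mode, which is only entered when CS is held low during CMD0.
*/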
static int
mmc_send_app_op_cond(struct mmc_softc *sc, uint32_t ocr, uint32_t *rocr)
{
struct mmc_command cmd;
int err = MMC_ERR_NONE, i;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SD_SEND_OP_COND;
cmd.arg = ocr;
cmd.flags = MMC_RSP_R3 | MMC_CMD_BCR;
cmd.data = NULL;
for (i = 0; i < 1000; i++) {
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, 0, &cmd,
CMD_RETRIES);
if (err != MMC_ERR_NONE)
break;
if ((cmd.resp[0] & MMC_OCR_CARD_BUSY) ||
(ocr & MMC_OCR_VOLTAGE) == 0)
break;
err = MMC_ERR_TIMEOUT;
mmc_ms_delay(10);
}
if (rocr && err == MMC_ERR_NONE)
*rocr = cmd.resp[0];
return (err);
}
static int
mmc_send_op_cond(struct mmc_softc *sc, uint32_t ocr, uint32_t *rocr)
{
struct mmc_command cmd;
int err = MMC_ERR_NONE, i;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SEND_OP_COND;
cmd.arg = ocr;
cmd.flags = MMC_RSP_R3 | MMC_CMD_BCR;
cmd.data = NULL;
for (i = 0; i < 1000; i++) {
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
if (err != MMC_ERR_NONE)
break;
if ((cmd.resp[0] & MMC_OCR_CARD_BUSY) ||
(ocr & MMC_OCR_VOLTAGE) == 0)
break;
err = MMC_ERR_TIMEOUT;
mmc_ms_delay(10);
}
if (rocr && err == MMC_ERR_NONE)
*rocr = cmd.resp[0];
return (err);
}
static int
mmc_send_if_cond(struct mmc_softc *sc, uint8_t vhs)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SD_SEND_IF_COND;
cmd.arg = (vhs << 8) + 0xAA;
cmd.flags = MMC_RSP_R7 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static void
mmc_power_up(struct mmc_softc *sc)
{
device_t dev;
enum mmc_vccq vccq;
dev = sc->dev;
mmcbr_set_vdd(dev, mmc_highest_voltage(mmcbr_get_host_ocr(dev)));
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_set_bus_width(dev, bus_width_1);
mmcbr_set_power_mode(dev, power_up);
mmcbr_set_clock(dev, 0);
mmcbr_update_ios(dev);
for (vccq = vccq_330; ; vccq--) {
mmcbr_set_vccq(dev, vccq);
if (mmcbr_switch_vccq(dev) == 0 || vccq == vccq_120)
break;
}
mmc_ms_delay(1);
mmcbr_set_clock(dev, SD_MMC_CARD_ID_FREQUENCY);
mmcbr_set_timing(dev, bus_timing_normal);
mmcbr_set_power_mode(dev, power_on);
mmcbr_update_ios(dev);
mmc_ms_delay(2);
}
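/*
* The VCCQ loop in mmc_power_up() walks the signaling voltage down
* from 3.3 V and settles on the first level the host bridge accepts
* (presumably 3.3 V, then 1.8 V, then 1.2 V, given the enum order
* implied by the vccq_120 termination condition).
*/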
static void
mmc_power_down(struct mmc_softc *sc)
{
device_t dev = sc->dev;
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_chip_select(dev, cs_dontcare);
mmcbr_set_bus_width(dev, bus_width_1);
mmcbr_set_power_mode(dev, power_off);
mmcbr_set_clock(dev, 0);
mmcbr_set_timing(dev, bus_timing_normal);
mmcbr_update_ios(dev);
}
static int
mmc_select_card(struct mmc_softc *sc, uint16_t rca)
{
int err, flags;
flags = (rca ? MMC_RSP_R1B : MMC_RSP_NONE) | MMC_CMD_AC;
sc->retune_paused++;
err = mmc_wait_for_command(sc, MMC_SELECT_CARD, (uint32_t)rca << 16,
flags, NULL, CMD_RETRIES);
sc->retune_paused--;
return (err);
}
static int
mmc_sd_switch(struct mmc_softc *sc, uint8_t mode, uint8_t grp, uint8_t value,
uint8_t *res)
{
int err;
struct mmc_command cmd;
struct mmc_data data;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(res, 0, 64);
cmd.opcode = SD_SWITCH_FUNC;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = mode << 31; /* 0 - check, 1 - set */
cmd.arg |= 0x00FFFFFF;
cmd.arg &= ~(0xF << (grp * 4));
cmd.arg |= value << (grp * 4);
cmd.data = &data;
data.data = res;
data.len = 64;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
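/*
* The CMD6 argument built above leaves every function group at 0xf
* ("no change") except the requested one. Assuming group 1 is
* encoded as grp 0 and the high-speed function as value 1 (as the
* callers below imply), a check (mode 0) for high speed would use
* arg 0x00fffff1 and the corresponding set (mode 1) 0x80fffff1.
*/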
static int
mmc_set_card_bus_width(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
struct mmc_command cmd;
int err;
uint8_t value;
if (mmcbr_get_mode(sc->dev) == mode_sd) {
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SET_CLR_CARD_DETECT;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
cmd.arg = SD_CLR_CARD_DETECT;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, ivar->rca, &cmd,
CMD_RETRIES);
if (err != 0)
return (err);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ACMD_SET_BUS_WIDTH;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
switch (ivar->bus_width) {
case bus_width_1:
cmd.arg = SD_BUS_WIDTH_1;
break;
case bus_width_4:
cmd.arg = SD_BUS_WIDTH_4;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, ivar->rca, &cmd,
CMD_RETRIES);
} else {
switch (ivar->bus_width) {
case bus_width_1:
if (timing == bus_timing_mmc_hs400 ||
timing == bus_timing_mmc_hs400es)
return (MMC_ERR_INVALID);
value = EXT_CSD_BUS_WIDTH_1;
break;
case bus_width_4:
switch (timing) {
case bus_timing_mmc_ddr52:
value = EXT_CSD_BUS_WIDTH_4_DDR;
break;
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
return (MMC_ERR_INVALID);
default:
value = EXT_CSD_BUS_WIDTH_4;
break;
}
break;
case bus_width_8:
value = 0;
switch (timing) {
case bus_timing_mmc_hs400es:
value = EXT_CSD_BUS_WIDTH_ES;
/* FALLTHROUGH */
case bus_timing_mmc_ddr52:
case bus_timing_mmc_hs400:
value |= EXT_CSD_BUS_WIDTH_8_DDR;
break;
default:
value = EXT_CSD_BUS_WIDTH_8;
break;
}
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL, EXT_CSD_BUS_WIDTH, value,
ivar->cmd6_time, true);
}
return (err);
}
static int
mmc_set_power_class(struct mmc_softc *sc, struct mmc_ivars *ivar)
{
device_t dev;
const uint8_t *ext_csd;
uint32_t clock;
uint8_t value;
dev = sc->dev;
if (mmcbr_get_mode(dev) != mode_mmc || ivar->csd.spec_vers < 4)
return (MMC_ERR_NONE);
value = 0;
ext_csd = ivar->raw_ext_csd;
clock = mmcbr_get_clock(dev);
switch (1 << mmcbr_get_vdd(dev)) {
case MMC_OCR_LOW_VOLTAGE:
if (clock <= MMC_TYPE_HS_26_MAX)
value = ext_csd[EXT_CSD_PWR_CL_26_195];
else if (clock <= MMC_TYPE_HS_52_MAX) {
if (mmcbr_get_timing(dev) >= bus_timing_mmc_ddr52 &&
ivar->bus_width >= bus_width_4)
value = ext_csd[EXT_CSD_PWR_CL_52_195_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_52_195];
} else if (clock <= MMC_TYPE_HS200_HS400ES_MAX)
value = ext_csd[EXT_CSD_PWR_CL_200_195];
break;
case MMC_OCR_270_280:
case MMC_OCR_280_290:
case MMC_OCR_290_300:
case MMC_OCR_300_310:
case MMC_OCR_310_320:
case MMC_OCR_320_330:
case MMC_OCR_330_340:
case MMC_OCR_340_350:
case MMC_OCR_350_360:
if (clock <= MMC_TYPE_HS_26_MAX)
value = ext_csd[EXT_CSD_PWR_CL_26_360];
else if (clock <= MMC_TYPE_HS_52_MAX) {
if (mmcbr_get_timing(dev) == bus_timing_mmc_ddr52 &&
ivar->bus_width >= bus_width_4)
value = ext_csd[EXT_CSD_PWR_CL_52_360_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_52_360];
} else if (clock <= MMC_TYPE_HS200_HS400ES_MAX) {
if (ivar->bus_width == bus_width_8)
value = ext_csd[EXT_CSD_PWR_CL_200_360_DDR];
else
value = ext_csd[EXT_CSD_PWR_CL_200_360];
}
break;
default:
device_printf(dev, "No power class support for VDD 0x%x\n",
1 << mmcbr_get_vdd(dev));
return (MMC_ERR_INVALID);
}
if (ivar->bus_width == bus_width_8)
value = (value & EXT_CSD_POWER_CLASS_8BIT_MASK) >>
EXT_CSD_POWER_CLASS_8BIT_SHIFT;
else
value = (value & EXT_CSD_POWER_CLASS_4BIT_MASK) >>
EXT_CSD_POWER_CLASS_4BIT_SHIFT;
if (value == 0)
return (MMC_ERR_NONE);
return (mmc_switch(dev, dev, ivar->rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_POWER_CLASS, value, ivar->cmd6_time, true));
}
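/*
* The PWR_CL_* bytes read above pack two power classes into one
* register. Assuming the usual JEDEC layout (low nibble for buses up
* to 4 bits, high nibble for 8-bit buses, matching the masks and
* shifts used here), a value of 0x24 would yield power class 2 for an
* 8-bit bus and power class 4 otherwise; a resulting class of 0 means
* the default and no switch is issued.
*/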
static int
mmc_set_timing(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
u_char switch_res[64];
uint8_t value;
int err;
if (mmcbr_get_mode(sc->dev) == mode_sd) {
switch (timing) {
case bus_timing_normal:
value = SD_SWITCH_NORMAL_MODE;
break;
case bus_timing_hs:
value = SD_SWITCH_HS_MODE;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_sd_switch(sc, SD_SWITCH_MODE_SET, SD_SWITCH_GROUP1,
value, switch_res);
if (err != MMC_ERR_NONE)
return (err);
if ((switch_res[16] & 0xf) != value)
return (MMC_ERR_FAILED);
mmcbr_set_timing(sc->dev, timing);
mmcbr_update_ios(sc->dev);
} else {
switch (timing) {
case bus_timing_normal:
value = EXT_CSD_HS_TIMING_BC;
break;
case bus_timing_hs:
case bus_timing_mmc_ddr52:
value = EXT_CSD_HS_TIMING_HS;
break;
case bus_timing_mmc_hs200:
value = EXT_CSD_HS_TIMING_HS200;
break;
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
value = EXT_CSD_HS_TIMING_HS400;
break;
default:
return (MMC_ERR_INVALID);
}
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL, EXT_CSD_HS_TIMING, value,
ivar->cmd6_time, false);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_timing(sc->dev, timing);
mmcbr_update_ios(sc->dev);
err = mmc_switch_status(sc->dev, sc->dev, ivar->rca,
ivar->cmd6_time);
}
return (err);
}
static int
mmc_set_vccq(struct mmc_softc *sc, struct mmc_ivars *ivar,
enum mmc_bus_timing timing)
{
if (isset(&ivar->vccq_120, timing))
mmcbr_set_vccq(sc->dev, vccq_120);
else if (isset(&ivar->vccq_180, timing))
mmcbr_set_vccq(sc->dev, vccq_180);
else
mmcbr_set_vccq(sc->dev, vccq_330);
if (mmcbr_switch_vccq(sc->dev) != 0)
return (MMC_ERR_INVALID);
else
return (MMC_ERR_NONE);
}
static const uint8_t p8[8] = {
0x55, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
static const uint8_t p8ok[8] = {
0xAA, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
static const uint8_t p4[4] = {
0x5A, 0x00, 0x00, 0x00
};
static const uint8_t p4ok[4] = {
0xA5, 0x00, 0x00, 0x00
};
static int
mmc_test_bus_width(struct mmc_softc *sc)
{
struct mmc_command cmd;
struct mmc_data data;
uint8_t buf[8];
int err;
if (mmcbr_get_caps(sc->dev) & MMC_CAP_8_BIT_DATA) {
mmcbr_set_bus_width(sc->dev, bus_width_8);
mmcbr_update_ios(sc->dev);
sc->squelched++; /* Errors are expected, squelch reporting. */
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_W;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = __DECONST(void *, p8);
data.len = 8;
data.flags = MMC_DATA_WRITE;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_R;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = buf;
data.len = 8;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
sc->squelched--;
mmcbr_set_bus_width(sc->dev, bus_width_1);
mmcbr_update_ios(sc->dev);
if (err == MMC_ERR_NONE && memcmp(buf, p8ok, 8) == 0)
return (bus_width_8);
}
if (mmcbr_get_caps(sc->dev) & MMC_CAP_4_BIT_DATA) {
mmcbr_set_bus_width(sc->dev, bus_width_4);
mmcbr_update_ios(sc->dev);
sc->squelched++; /* Errors are expected, squelch reporting. */
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_W;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = __DECONST(void *, p4);
data.len = 4;
data.flags = MMC_DATA_WRITE;
mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = MMC_BUSTEST_R;
cmd.arg = 0;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.data = &data;
data.data = buf;
data.len = 4;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, 0);
sc->squelched--;
mmcbr_set_bus_width(sc->dev, bus_width_1);
mmcbr_update_ios(sc->dev);
if (err == MMC_ERR_NONE && memcmp(buf, p4ok, 4) == 0)
return (bus_width_4);
}
return (bus_width_1);
}
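/*
* With MMC_BUSTEST_W/MMC_BUSTEST_R the card is expected to return the
* complement of the written pattern on the exercised data lines,
* which is why the expected patterns above (p8ok/p4ok) are the
* bitwise inverse of the transmitted ones (0x55/0xaa and 0x5a/0xa5).
*/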
static uint32_t
mmc_get_bits(uint32_t *bits, int bit_len, int start, int size)
{
const int i = (bit_len / 32) - (start / 32) - 1;
const int shift = start & 31;
uint32_t retval = bits[i] >> shift;
if (size + shift > 32)
retval |= bits[i - 1] << (32 - shift);
return (retval & ((1llu << size) - 1));
}
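/*
* Worked example: with the usual response-word ordering (most
* significant word first), mmc_get_bits(raw_cid, 128, 120, 8) yields
* i = 0 and shift = 24, i.e. (raw_cid[0] >> 24) & 0xff, which is the
* CID manufacturer ID (MID) field in bits [127:120] as used by the
* CID decoders below.
*/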
static void
mmc_decode_cid_sd(uint32_t *raw_cid, struct mmc_cid *cid)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 16);
for (i = 0; i < 5; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[5] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 56, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 24, 32);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 12, 8) + 2000;
cid->mdt_month = mmc_get_bits(raw_cid, 128, 8, 4);
}
static void
mmc_decode_cid_mmc(uint32_t *raw_cid, struct mmc_cid *cid, bool is_4_41p)
{
int i;
/* There's no version info, so we take it on faith */
memset(cid, 0, sizeof(*cid));
cid->mid = mmc_get_bits(raw_cid, 128, 120, 8);
cid->oid = mmc_get_bits(raw_cid, 128, 104, 8);
for (i = 0; i < 6; i++)
cid->pnm[i] = mmc_get_bits(raw_cid, 128, 96 - i * 8, 8);
cid->pnm[6] = 0;
cid->prv = mmc_get_bits(raw_cid, 128, 48, 8);
cid->psn = mmc_get_bits(raw_cid, 128, 16, 32);
cid->mdt_month = mmc_get_bits(raw_cid, 128, 12, 4);
cid->mdt_year = mmc_get_bits(raw_cid, 128, 8, 4);
if (is_4_41p)
cid->mdt_year += 2013;
else
cid->mdt_year += 1997;
}
static void
mmc_format_card_id_string(struct mmc_ivars *ivar)
{
char oidstr[8];
uint8_t c1;
uint8_t c2;
/*
* Format a card ID string for use by the mmcsd driver, it's what
* appears between the <> in the following:
* mmcsd0: 968MB <SD SD01G 8.0 SN 2686905 MFG 08/2008 by 3 TN> at mmc0
* 22.5MHz/4bit/128-block
*
* Also format just the card serial number, which the mmcsd driver will
* use as the disk->d_ident string.
*
* The card_id_string in mmc_ivars is currently allocated as 64 bytes,
* and our max formatted length is currently 55 bytes if every field
* contains the largest value.
*
* Sometimes the oid is two printable ascii chars; when it's not,
* format it as 0xnnnn instead.
*/
c1 = (ivar->cid.oid >> 8) & 0x0ff;
c2 = ivar->cid.oid & 0x0ff;
if (c1 > 0x1f && c1 < 0x7f && c2 > 0x1f && c2 < 0x7f)
snprintf(oidstr, sizeof(oidstr), "%c%c", c1, c2);
else
snprintf(oidstr, sizeof(oidstr), "0x%04x", ivar->cid.oid);
snprintf(ivar->card_sn_string, sizeof(ivar->card_sn_string),
"%08X", ivar->cid.psn);
snprintf(ivar->card_id_string, sizeof(ivar->card_id_string),
"%s%s %s %d.%d SN %08X MFG %02d/%04d by %d %s",
ivar->mode == mode_sd ? "SD" : "MMC", ivar->high_cap ? "HC" : "",
ivar->cid.pnm, ivar->cid.prv >> 4, ivar->cid.prv & 0x0f,
ivar->cid.psn, ivar->cid.mdt_month, ivar->cid.mdt_year,
ivar->cid.mid, oidstr);
}
static const int exp[8] = {
1, 10, 100, 1000, 10000, 100000, 1000000, 10000000
};
static const int mant[16] = {
0, 10, 12, 13, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80
};
static const int cur_min[8] = {
500, 1000, 5000, 10000, 25000, 35000, 60000, 100000
};
static const int cur_max[8] = {
1000, 5000, 10000, 25000, 35000, 45000, 800000, 200000
};
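/*
* The tables above implement the mantissa/exponent encodings of the
* CSD TAAC, TRAN_SPEED and VDD_*_CURR_* fields (mantissa values are
* stored scaled by 10). For example, the common SD TRAN_SPEED byte
* 0x32 decodes as e = 2 and m = 6, so tran_speed becomes
* exp[2] * 10000 * mant[6] = 100 * 10000 * 25, i.e. 25 MHz.
*/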
static int
mmc_decode_csd_sd(uint32_t *raw_csd, struct mmc_csd *csd)
{
int v;
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = v = mmc_get_bits(raw_csd, 128, 126, 2);
if (v == 0) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min =
cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max =
cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min =
cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max =
cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
return (MMC_ERR_NONE);
} else if (v == 1) {
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->capacity = ((uint64_t)mmc_get_bits(raw_csd, 128, 48, 22) +
1) * 512 * 1024;
csd->erase_blk_en = mmc_get_bits(raw_csd, 128, 46, 1);
csd->erase_sector = mmc_get_bits(raw_csd, 128, 39, 7) + 1;
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 7);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
return (MMC_ERR_NONE);
}
return (MMC_ERR_INVALID);
}
static void
mmc_decode_csd_mmc(uint32_t *raw_csd, struct mmc_csd *csd)
{
int m;
int e;
memset(csd, 0, sizeof(*csd));
csd->csd_structure = mmc_get_bits(raw_csd, 128, 126, 2);
csd->spec_vers = mmc_get_bits(raw_csd, 128, 122, 4);
m = mmc_get_bits(raw_csd, 128, 115, 4);
e = mmc_get_bits(raw_csd, 128, 112, 3);
csd->tacc = (exp[e] * mant[m] + 9) / 10;
csd->nsac = mmc_get_bits(raw_csd, 128, 104, 8) * 100;
m = mmc_get_bits(raw_csd, 128, 99, 4);
e = mmc_get_bits(raw_csd, 128, 96, 3);
csd->tran_speed = exp[e] * 10000 * mant[m];
csd->ccc = mmc_get_bits(raw_csd, 128, 84, 12);
csd->read_bl_len = 1 << mmc_get_bits(raw_csd, 128, 80, 4);
csd->read_bl_partial = mmc_get_bits(raw_csd, 128, 79, 1);
csd->write_blk_misalign = mmc_get_bits(raw_csd, 128, 78, 1);
csd->read_blk_misalign = mmc_get_bits(raw_csd, 128, 77, 1);
csd->dsr_imp = mmc_get_bits(raw_csd, 128, 76, 1);
csd->vdd_r_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 59, 3)];
csd->vdd_r_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 56, 3)];
csd->vdd_w_curr_min = cur_min[mmc_get_bits(raw_csd, 128, 53, 3)];
csd->vdd_w_curr_max = cur_max[mmc_get_bits(raw_csd, 128, 50, 3)];
m = mmc_get_bits(raw_csd, 128, 62, 12);
e = mmc_get_bits(raw_csd, 128, 47, 3);
csd->capacity = ((1 + m) << (e + 2)) * csd->read_bl_len;
csd->erase_blk_en = 0;
csd->erase_sector = (mmc_get_bits(raw_csd, 128, 42, 5) + 1) *
(mmc_get_bits(raw_csd, 128, 37, 5) + 1);
csd->wp_grp_size = mmc_get_bits(raw_csd, 128, 32, 5);
csd->wp_grp_enable = mmc_get_bits(raw_csd, 128, 31, 1);
csd->r2w_factor = 1 << mmc_get_bits(raw_csd, 128, 26, 3);
csd->write_bl_len = 1 << mmc_get_bits(raw_csd, 128, 22, 4);
csd->write_bl_partial = mmc_get_bits(raw_csd, 128, 21, 1);
}
static void
mmc_app_decode_scr(uint32_t *raw_scr, struct mmc_scr *scr)
{
unsigned int scr_struct;
memset(scr, 0, sizeof(*scr));
scr_struct = mmc_get_bits(raw_scr, 64, 60, 4);
if (scr_struct != 0) {
printf("Unrecognised SCR structure version %d\n",
scr_struct);
return;
}
scr->sda_vsn = mmc_get_bits(raw_scr, 64, 56, 4);
scr->bus_widths = mmc_get_bits(raw_scr, 64, 48, 4);
}
static void
mmc_app_decode_sd_status(uint32_t *raw_sd_status,
struct mmc_sd_status *sd_status)
{
memset(sd_status, 0, sizeof(*sd_status));
sd_status->bus_width = mmc_get_bits(raw_sd_status, 512, 510, 2);
sd_status->secured_mode = mmc_get_bits(raw_sd_status, 512, 509, 1);
sd_status->card_type = mmc_get_bits(raw_sd_status, 512, 480, 16);
sd_status->prot_area = mmc_get_bits(raw_sd_status, 512, 448, 12);
sd_status->speed_class = mmc_get_bits(raw_sd_status, 512, 440, 8);
sd_status->perf_move = mmc_get_bits(raw_sd_status, 512, 432, 8);
sd_status->au_size = mmc_get_bits(raw_sd_status, 512, 428, 4);
sd_status->erase_size = mmc_get_bits(raw_sd_status, 512, 408, 16);
sd_status->erase_timeout = mmc_get_bits(raw_sd_status, 512, 402, 6);
sd_status->erase_offset = mmc_get_bits(raw_sd_status, 512, 400, 2);
}
static int
mmc_all_send_cid(struct mmc_softc *sc, uint32_t *rawcid)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_ALL_SEND_CID;
cmd.arg = 0;
cmd.flags = MMC_RSP_R2 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
memcpy(rawcid, cmd.resp, 4 * sizeof(uint32_t));
return (err);
}
static int
mmc_send_csd(struct mmc_softc *sc, uint16_t rca, uint32_t *rawcsd)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SEND_CSD;
cmd.arg = rca << 16;
cmd.flags = MMC_RSP_R2 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
memcpy(rawcsd, cmd.resp, 4 * sizeof(uint32_t));
return (err);
}
static int
mmc_app_send_scr(struct mmc_softc *sc, uint16_t rca, uint32_t *rawscr)
{
int err;
struct mmc_command cmd;
struct mmc_data data;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(rawscr, 0, 8);
cmd.opcode = ACMD_SEND_SCR;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = 0;
cmd.data = &data;
data.data = rawscr;
data.len = 8;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, rca, &cmd, CMD_RETRIES);
rawscr[0] = be32toh(rawscr[0]);
rawscr[1] = be32toh(rawscr[1]);
return (err);
}
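/*
* The 64-bit SCR arrives most-significant byte first, so both words
* are converted with be32toh() above before mmc_app_decode_scr()
* picks its fields out of them with mmc_get_bits().
*/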
static int
mmc_app_sd_status(struct mmc_softc *sc, uint16_t rca, uint32_t *rawsdstatus)
{
struct mmc_command cmd;
struct mmc_data data;
int err, i;
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
memset(rawsdstatus, 0, 64);
cmd.opcode = ACMD_SD_STATUS;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
cmd.arg = 0;
cmd.data = &data;
data.data = rawsdstatus;
data.len = 64;
data.flags = MMC_DATA_READ;
err = mmc_wait_for_app_cmd(sc->dev, sc->dev, rca, &cmd, CMD_RETRIES);
for (i = 0; i < 16; i++)
rawsdstatus[i] = be32toh(rawsdstatus[i]);
return (err);
}
static int
mmc_set_relative_addr(struct mmc_softc *sc, uint16_t resp)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SET_RELATIVE_ADDR;
cmd.arg = resp << 16;
cmd.flags = MMC_RSP_R6 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static int
mmc_send_relative_addr(struct mmc_softc *sc, uint32_t *resp)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SD_SEND_RELATIVE_ADDR;
cmd.arg = 0;
cmd.flags = MMC_RSP_R6 | MMC_CMD_BCR;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
*resp = cmd.resp[0];
return (err);
}
static int
mmc_set_blocklen(struct mmc_softc *sc, uint32_t len)
{
struct mmc_command cmd;
int err;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = MMC_SET_BLOCKLEN;
cmd.arg = len;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
cmd.data = NULL;
err = mmc_wait_for_cmd(sc->dev, sc->dev, &cmd, CMD_RETRIES);
return (err);
}
static uint32_t
mmc_timing_to_dtr(struct mmc_ivars *ivar, enum mmc_bus_timing timing)
{
switch (timing) {
case bus_timing_normal:
return (ivar->tran_speed);
case bus_timing_hs:
return (ivar->hs_tran_speed);
case bus_timing_uhs_sdr12:
return (SD_SDR12_MAX);
case bus_timing_uhs_sdr25:
return (SD_SDR25_MAX);
case bus_timing_uhs_ddr50:
return (SD_DDR50_MAX);
case bus_timing_uhs_sdr50:
return (SD_SDR50_MAX);
case bus_timing_uhs_sdr104:
return (SD_SDR104_MAX);
case bus_timing_mmc_ddr52:
return (MMC_TYPE_DDR52_MAX);
case bus_timing_mmc_hs200:
case bus_timing_mmc_hs400:
case bus_timing_mmc_hs400es:
return (MMC_TYPE_HS200_HS400ES_MAX);
}
return (0);
}
static const char *
mmc_timing_to_string(enum mmc_bus_timing timing)
{
switch (timing) {
case bus_timing_normal:
return ("normal speed");
case bus_timing_hs:
return ("high speed");
case bus_timing_uhs_sdr12:
case bus_timing_uhs_sdr25:
case bus_timing_uhs_sdr50:
case bus_timing_uhs_sdr104:
return ("single data rate");
case bus_timing_uhs_ddr50:
case bus_timing_mmc_ddr52:
return ("dual data rate");
case bus_timing_mmc_hs200:
return ("HS200");
case bus_timing_mmc_hs400:
return ("HS400");
case bus_timing_mmc_hs400es:
return ("HS400 with enhanced strobe");
}
return ("");
}
static bool
mmc_host_timing(device_t dev, enum mmc_bus_timing timing)
{
int host_caps;
host_caps = mmcbr_get_caps(dev);
#define HOST_TIMING_CAP(host_caps, cap) ({ \
bool retval; \
if (((host_caps) & (cap)) == (cap)) \
retval = true; \
else \
retval = false; \
retval; \
})
switch (timing) {
case bus_timing_normal:
return (true);
case bus_timing_hs:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_HSPEED));
case bus_timing_uhs_sdr12:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR12));
case bus_timing_uhs_sdr25:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR25));
case bus_timing_uhs_ddr50:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_DDR50));
case bus_timing_uhs_sdr50:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR50));
case bus_timing_uhs_sdr104:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_UHS_SDR104));
case bus_timing_mmc_ddr52:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_DDR52));
case bus_timing_mmc_hs200:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS200));
case bus_timing_mmc_hs400:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS400));
case bus_timing_mmc_hs400es:
return (HOST_TIMING_CAP(host_caps, MMC_CAP_MMC_HS400 |
MMC_CAP_MMC_ENH_STROBE));
}
#undef HOST_TIMING_CAP
return (false);
}
static void
mmc_log_card(device_t dev, struct mmc_ivars *ivar, int newcard)
{
- enum mmc_bus_timing max_timing, timing;
+ enum mmc_bus_timing timing;
device_printf(dev, "Card at relative address 0x%04x%s:\n",
ivar->rca, newcard ? " added" : "");
device_printf(dev, " card: %s\n", ivar->card_id_string);
- max_timing = bus_timing_normal;
for (timing = bus_timing_max; timing > bus_timing_normal; timing--) {
- if (isset(&ivar->timings, timing)) {
- max_timing = timing;
+ if (isset(&ivar->timings, timing))
break;
- }
}
device_printf(dev, " quirks: %b\n", ivar->quirks, MMC_QUIRKS_FMT);
device_printf(dev, " bus: %ubit, %uMHz (%s timing)\n",
(ivar->bus_width == bus_width_1 ? 1 :
(ivar->bus_width == bus_width_4 ? 4 : 8)),
mmc_timing_to_dtr(ivar, timing) / 1000000,
mmc_timing_to_string(timing));
device_printf(dev, " memory: %u blocks, erase sector %u blocks%s\n",
ivar->sec_count, ivar->erase_sector,
ivar->read_only ? ", read-only" : "");
}
static void
mmc_discover_cards(struct mmc_softc *sc)
{
u_char switch_res[64];
uint32_t raw_cid[4];
struct mmc_ivars *ivar = NULL;
const struct mmc_quirk *quirk;
device_t child;
int err, host_caps, i, newcard;
uint32_t resp, sec_count, status;
uint16_t rca = 2;
host_caps = mmcbr_get_caps(sc->dev);
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Probing cards\n");
while (1) {
child = NULL;
sc->squelched++; /* Errors are expected, squelch reporting. */
err = mmc_all_send_cid(sc, raw_cid);
sc->squelched--;
if (err == MMC_ERR_TIMEOUT)
break;
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error reading CID %d\n", err);
break;
}
newcard = 1;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (memcmp(ivar->raw_cid, raw_cid, sizeof(raw_cid)) ==
0) {
newcard = 0;
break;
}
}
if (bootverbose || mmc_debug) {
device_printf(sc->dev,
"%sard detected (CID %08x%08x%08x%08x)\n",
newcard ? "New c" : "C",
raw_cid[0], raw_cid[1], raw_cid[2], raw_cid[3]);
}
if (newcard) {
ivar = malloc(sizeof(struct mmc_ivars), M_DEVBUF,
M_WAITOK | M_ZERO);
memcpy(ivar->raw_cid, raw_cid, sizeof(raw_cid));
}
if (mmcbr_get_ro(sc->dev))
ivar->read_only = 1;
ivar->bus_width = bus_width_1;
setbit(&ivar->timings, bus_timing_normal);
ivar->mode = mmcbr_get_mode(sc->dev);
if (ivar->mode == mode_sd) {
mmc_decode_cid_sd(ivar->raw_cid, &ivar->cid);
err = mmc_send_relative_addr(sc, &resp);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error getting RCA %d\n", err);
goto free_ivar;
}
ivar->rca = resp >> 16;
/* Get card CSD. */
err = mmc_send_csd(sc, ivar->rca, ivar->raw_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error getting CSD %d\n", err);
goto free_ivar;
}
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"%sard detected (CSD %08x%08x%08x%08x)\n",
newcard ? "New c" : "C", ivar->raw_csd[0],
ivar->raw_csd[1], ivar->raw_csd[2],
ivar->raw_csd[3]);
err = mmc_decode_csd_sd(ivar->raw_csd, &ivar->csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error decoding CSD\n");
goto free_ivar;
}
ivar->sec_count = ivar->csd.capacity / MMC_SECTOR_SIZE;
if (ivar->csd.csd_structure > 0)
ivar->high_cap = 1;
ivar->tran_speed = ivar->csd.tran_speed;
ivar->erase_sector = ivar->csd.erase_sector *
ivar->csd.write_bl_len / MMC_SECTOR_SIZE;
err = mmc_send_status(sc->dev, sc->dev, ivar->rca,
&status);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading card status %d\n", err);
goto free_ivar;
}
if ((status & R1_CARD_IS_LOCKED) != 0) {
device_printf(sc->dev,
"Card is password protected, skipping\n");
goto free_ivar;
}
/* Get card SCR. Card must be selected to fetch it. */
err = mmc_select_card(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error selecting card %d\n", err);
goto free_ivar;
}
err = mmc_app_send_scr(sc, ivar->rca, ivar->raw_scr);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading SCR %d\n", err);
goto free_ivar;
}
mmc_app_decode_scr(ivar->raw_scr, &ivar->scr);
/* Get card switch capabilities (command class 10). */
if ((ivar->scr.sda_vsn >= 1) &&
(ivar->csd.ccc & (1 << 10))) {
err = mmc_sd_switch(sc, SD_SWITCH_MODE_CHECK,
SD_SWITCH_GROUP1, SD_SWITCH_NOCHANGE,
switch_res);
if (err == MMC_ERR_NONE &&
switch_res[13] & (1 << SD_SWITCH_HS_MODE)) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = SD_HS_MAX;
}
}
/*
* We deselect then reselect the card here. Some cards
* become unselected and timeout with the above two
* commands, although the state tables / diagrams in the
* standard suggest they go back to the transfer state.
* Other cards don't become deselected, and if we
* attempt to blindly re-select them, we get timeout
* errors from some controllers. So we deselect then
* reselect to handle all situations. The only thing we
* use from the sd_status is the erase sector size, but
* it is still nice to get that right.
*/
(void)mmc_select_card(sc, 0);
(void)mmc_select_card(sc, ivar->rca);
(void)mmc_app_sd_status(sc, ivar->rca,
ivar->raw_sd_status);
mmc_app_decode_sd_status(ivar->raw_sd_status,
&ivar->sd_status);
if (ivar->sd_status.au_size != 0) {
ivar->erase_sector =
16 << ivar->sd_status.au_size;
}
/* Find maximum supported bus width. */
if ((host_caps & MMC_CAP_4_BIT_DATA) &&
(ivar->scr.bus_widths & SD_SCR_BUS_WIDTH_4))
ivar->bus_width = bus_width_4;
goto child_common;
}
ivar->rca = rca++;
err = mmc_set_relative_addr(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error setting RCA %d\n", err);
goto free_ivar;
}
/* Get card CSD. */
err = mmc_send_csd(sc, ivar->rca, ivar->raw_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error getting CSD %d\n", err);
goto free_ivar;
}
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"%sard detected (CSD %08x%08x%08x%08x)\n",
newcard ? "New c" : "C", ivar->raw_csd[0],
ivar->raw_csd[1], ivar->raw_csd[2],
ivar->raw_csd[3]);
mmc_decode_csd_mmc(ivar->raw_csd, &ivar->csd);
ivar->sec_count = ivar->csd.capacity / MMC_SECTOR_SIZE;
ivar->tran_speed = ivar->csd.tran_speed;
ivar->erase_sector = ivar->csd.erase_sector *
ivar->csd.write_bl_len / MMC_SECTOR_SIZE;
err = mmc_send_status(sc->dev, sc->dev, ivar->rca, &status);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading card status %d\n", err);
goto free_ivar;
}
if ((status & R1_CARD_IS_LOCKED) != 0) {
device_printf(sc->dev,
"Card is password protected, skipping\n");
goto free_ivar;
}
err = mmc_select_card(sc, ivar->rca);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev, "Error selecting card %d\n",
err);
goto free_ivar;
}
/* Only MMC >= 4.x devices support EXT_CSD. */
if (ivar->csd.spec_vers >= 4) {
err = mmc_send_ext_csd(sc->dev, sc->dev,
ivar->raw_ext_csd);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error reading EXT_CSD %d\n", err);
goto free_ivar;
}
/* Handle extended capacity from EXT_CSD */
sec_count = ivar->raw_ext_csd[EXT_CSD_SEC_CNT] +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 1] << 8) +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 2] << 16) +
(ivar->raw_ext_csd[EXT_CSD_SEC_CNT + 3] << 24);
if (sec_count != 0) {
ivar->sec_count = sec_count;
ivar->high_cap = 1;
}
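/*
* EXT_CSD_SEC_CNT is assembled from four little-endian bytes above;
* e.g. a device reporting 8,388,608 sectors exposes
* 8,388,608 * 512 bytes = 4 GiB of user capacity and is marked
* high-capacity here.
*/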
/* Find maximum supported bus width. */
ivar->bus_width = mmc_test_bus_width(sc);
/* Get device speeds beyond normal mode. */
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS_52) != 0) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = MMC_TYPE_HS_52_MAX;
} else if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS_26) != 0) {
setbit(&ivar->timings, bus_timing_hs);
ivar->hs_tran_speed = MMC_TYPE_HS_26_MAX;
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_DDR_52_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&ivar->timings, bus_timing_mmc_ddr52);
setbit(&ivar->vccq_120, bus_timing_mmc_ddr52);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_DDR_52_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&ivar->timings, bus_timing_mmc_ddr52);
setbit(&ivar->vccq_180, bus_timing_mmc_ddr52);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS200_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0) {
setbit(&ivar->timings, bus_timing_mmc_hs200);
setbit(&ivar->vccq_120, bus_timing_mmc_hs200);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS200_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0) {
setbit(&ivar->timings, bus_timing_mmc_hs200);
setbit(&ivar->vccq_180, bus_timing_mmc_hs200);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_2V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400);
setbit(&ivar->vccq_120, bus_timing_mmc_hs400);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_8V) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400);
setbit(&ivar->vccq_180, bus_timing_mmc_hs400);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_2V) != 0 &&
(ivar->raw_ext_csd[EXT_CSD_STROBE_SUPPORT] &
EXT_CSD_STROBE_SUPPORT_EN) != 0 &&
(host_caps & MMC_CAP_SIGNALING_120) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400es);
setbit(&ivar->vccq_120, bus_timing_mmc_hs400es);
}
if ((ivar->raw_ext_csd[EXT_CSD_CARD_TYPE] &
EXT_CSD_CARD_TYPE_HS400_1_8V) != 0 &&
(ivar->raw_ext_csd[EXT_CSD_STROBE_SUPPORT] &
EXT_CSD_STROBE_SUPPORT_EN) != 0 &&
(host_caps & MMC_CAP_SIGNALING_180) != 0 &&
ivar->bus_width == bus_width_8) {
setbit(&ivar->timings, bus_timing_mmc_hs400es);
setbit(&ivar->vccq_180, bus_timing_mmc_hs400es);
}
/*
* Determine generic switch timeout (provided in
* units of 10 ms), defaulting to 500 ms.
*/
ivar->cmd6_time = 500 * 1000;
if (ivar->csd.spec_vers >= 6)
ivar->cmd6_time = 10 *
ivar->raw_ext_csd[EXT_CSD_GEN_CMD6_TIME];
/* Handle HC erase sector size. */
if (ivar->raw_ext_csd[EXT_CSD_ERASE_GRP_SIZE] != 0) {
ivar->erase_sector = 1024 *
ivar->raw_ext_csd[EXT_CSD_ERASE_GRP_SIZE];
err = mmc_switch(sc->dev, sc->dev, ivar->rca,
EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_ERASE_GRP_DEF,
EXT_CSD_ERASE_GRP_DEF_EN,
ivar->cmd6_time, true);
if (err != MMC_ERR_NONE) {
device_printf(sc->dev,
"Error setting erase group %d\n",
err);
goto free_ivar;
}
}
}
mmc_decode_cid_mmc(ivar->raw_cid, &ivar->cid,
ivar->raw_ext_csd[EXT_CSD_REV] >= 5);
child_common:
for (quirk = &mmc_quirks[0]; quirk->mid != 0x0; quirk++) {
if ((quirk->mid == MMC_QUIRK_MID_ANY ||
quirk->mid == ivar->cid.mid) &&
(quirk->oid == MMC_QUIRK_OID_ANY ||
quirk->oid == ivar->cid.oid) &&
strncmp(quirk->pnm, ivar->cid.pnm,
sizeof(ivar->cid.pnm)) == 0) {
ivar->quirks = quirk->quirks;
break;
}
}
/*
* Some cards that report maximum I/O block sizes greater
* than 512 require the block length to be set to 512, even
* though that is supposed to be the default. Example:
*
* Transcend 2GB SDSC card, CID:
* mid=0x1b oid=0x534d pnm="00000" prv=1.0 mdt=00.2000
*/
if (ivar->csd.read_bl_len != MMC_SECTOR_SIZE ||
ivar->csd.write_bl_len != MMC_SECTOR_SIZE)
mmc_set_blocklen(sc, MMC_SECTOR_SIZE);
mmc_format_card_id_string(ivar);
if (bootverbose || mmc_debug)
mmc_log_card(sc->dev, ivar, newcard);
if (newcard) {
/* Add device. */
child = device_add_child(sc->dev, NULL, -1);
if (child != NULL) {
device_set_ivars(child, ivar);
sc->child_list = realloc(sc->child_list,
sizeof(device_t) * (sc->child_count + 1),
M_DEVBUF, M_WAITOK);
sc->child_list[sc->child_count++] = child;
} else
device_printf(sc->dev, "Error adding child\n");
}
free_ivar:
if (newcard && child == NULL)
free(ivar, M_DEVBUF);
(void)mmc_select_card(sc, 0);
/*
* Not returning here when one MMC device could not be added
* would potentially mean looping forever when that device
* is broken (in which case it may also impact the remainder
* of the bus anyway, though).
*/
if ((newcard && child == NULL) ||
mmcbr_get_mode(sc->dev) == mode_sd)
return;
}
}
static void
mmc_update_child_list(struct mmc_softc *sc)
{
device_t child;
int i, j;
if (sc->child_count == 0) {
free(sc->child_list, M_DEVBUF);
return;
}
for (i = j = 0; i < sc->child_count; i++) {
for (;;) {
child = sc->child_list[j++];
if (child != NULL)
break;
}
if (i != j)
sc->child_list[i] = child;
}
sc->child_list = realloc(sc->child_list, sizeof(device_t) *
sc->child_count, M_DEVBUF, M_WAITOK);
}
static void
mmc_rescan_cards(struct mmc_softc *sc)
{
struct mmc_ivars *ivar;
int err, i, j;
for (i = j = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (mmc_select_card(sc, ivar->rca) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"Card at relative address %d lost\n",
ivar->rca);
err = device_delete_child(sc->dev, sc->child_list[i]);
if (err != 0) {
j++;
continue;
}
free(ivar, M_DEVBUF);
} else
j++;
}
if (sc->child_count == j)
goto out;
sc->child_count = j;
mmc_update_child_list(sc);
out:
(void)mmc_select_card(sc, 0);
}
static int
mmc_delete_cards(struct mmc_softc *sc, bool final)
{
struct mmc_ivars *ivar;
int err, i, j;
err = 0;
for (i = j = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"Card at relative address %d deleted\n",
ivar->rca);
err = device_delete_child(sc->dev, sc->child_list[i]);
if (err != 0) {
j++;
if (final == false)
continue;
else
break;
}
free(ivar, M_DEVBUF);
}
sc->child_count = j;
mmc_update_child_list(sc);
return (err);
}
static void
mmc_go_discovery(struct mmc_softc *sc)
{
uint32_t ocr;
device_t dev;
int err;
dev = sc->dev;
if (mmcbr_get_power_mode(dev) != power_on) {
/*
* First, try SD modes
*/
sc->squelched++; /* Errors are expected, squelch reporting. */
mmcbr_set_mode(dev, mode_sd);
mmc_power_up(sc);
mmcbr_set_bus_mode(dev, pushpull);
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Probing bus\n");
mmc_idle_cards(sc);
err = mmc_send_if_cond(sc, 1);
if ((bootverbose || mmc_debug) && err == 0)
device_printf(sc->dev,
"SD 2.0 interface conditions: OK\n");
if (mmc_send_app_op_cond(sc, 0, &ocr) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev, "SD probe: failed\n");
/*
* Failed, try MMC
*/
mmcbr_set_mode(dev, mode_mmc);
if (mmc_send_op_cond(sc, 0, &ocr) != MMC_ERR_NONE) {
if (bootverbose || mmc_debug)
device_printf(sc->dev,
"MMC probe: failed\n");
ocr = 0; /* Failed both, powerdown. */
} else if (bootverbose || mmc_debug)
device_printf(sc->dev,
"MMC probe: OK (OCR: 0x%08x)\n", ocr);
} else if (bootverbose || mmc_debug)
device_printf(sc->dev, "SD probe: OK (OCR: 0x%08x)\n",
ocr);
sc->squelched--;
mmcbr_set_ocr(dev, mmc_select_vdd(sc, ocr));
if (mmcbr_get_ocr(dev) != 0)
mmc_idle_cards(sc);
} else {
mmcbr_set_bus_mode(dev, opendrain);
mmcbr_set_clock(dev, SD_MMC_CARD_ID_FREQUENCY);
mmcbr_update_ios(dev);
/* XXX recompute vdd based on new cards? */
}
/*
* Make sure that we have a mutually agreeable voltage to at least
* one card on the bus.
*/
if (bootverbose || mmc_debug)
device_printf(sc->dev, "Current OCR: 0x%08x\n",
mmcbr_get_ocr(dev));
if (mmcbr_get_ocr(dev) == 0) {
device_printf(sc->dev, "No compatible cards found on bus\n");
(void)mmc_delete_cards(sc, false);
mmc_power_down(sc);
return;
}
/*
* Reselect the cards after we've idled them above.
*/
if (mmcbr_get_mode(dev) == mode_sd) {
err = mmc_send_if_cond(sc, 1);
mmc_send_app_op_cond(sc,
(err ? 0 : MMC_OCR_CCS) | mmcbr_get_ocr(dev), NULL);
} else
mmc_send_op_cond(sc, MMC_OCR_CCS | mmcbr_get_ocr(dev), NULL);
mmc_discover_cards(sc);
mmc_rescan_cards(sc);
mmcbr_set_bus_mode(dev, pushpull);
mmcbr_update_ios(dev);
mmc_calculate_clock(sc);
}
static int
mmc_calculate_clock(struct mmc_softc *sc)
{
device_t dev;
struct mmc_ivars *ivar;
int i;
uint32_t dtr, max_dtr;
uint16_t rca;
enum mmc_bus_timing max_timing, timing;
bool changed, hs400;
dev = sc->dev;
max_dtr = mmcbr_get_f_max(dev);
max_timing = bus_timing_max;
do {
changed = false;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if (isclr(&ivar->timings, max_timing) ||
!mmc_host_timing(dev, max_timing)) {
for (timing = max_timing - 1; timing >=
bus_timing_normal; timing--) {
if (isset(&ivar->timings, timing) &&
mmc_host_timing(dev, timing)) {
max_timing = timing;
break;
}
}
changed = true;
}
dtr = mmc_timing_to_dtr(ivar, max_timing);
if (dtr < max_dtr) {
max_dtr = dtr;
changed = true;
}
}
} while (changed == true);
if (bootverbose || mmc_debug) {
device_printf(dev,
"setting transfer rate to %d.%03dMHz (%s timing)\n",
max_dtr / 1000000, (max_dtr / 1000) % 1000,
mmc_timing_to_string(max_timing));
}
/*
* HS400 must be tuned in HS200 mode, so in case of HS400 we begin
* with HS200, following the sequence described in "6.6.2.2 HS200
* timing mode selection" of the eMMC specification v5.1, and
* switch to max_timing later. HS400ES requires no tuning and, thus,
* can be switched to directly, but requires the same detour via
* high speed mode as HS400 does (see mmc_switch_to_hs400()).
*/
hs400 = max_timing == bus_timing_mmc_hs400;
timing = hs400 == true ? bus_timing_mmc_hs200 : max_timing;
for (i = 0; i < sc->child_count; i++) {
ivar = device_get_ivars(sc->child_list[i]);
if ((ivar->timings & ~(1 << bus_timing_normal)) == 0)
continue;
rca = ivar->rca;
if (mmc_select_card(sc, rca) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to select\n", rca);
continue;
}
if (timing == bus_timing_mmc_hs200 || /* includes HS400 */
timing == bus_timing_mmc_hs400es) {
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Failed to set VCCQ for "
"card at relative address %d\n", rca);
continue;
}
}
if (timing == bus_timing_mmc_hs200) { /* includes HS400 */
/* Set bus width (required for initial tuning). */
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set bus width\n", rca);
continue;
}
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
} else if (timing == bus_timing_mmc_hs400es) {
if (mmc_switch_to_hs400(sc, ivar, max_dtr, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set %s timing\n", rca,
mmc_timing_to_string(timing));
continue;
}
goto power_class;
}
if (mmc_set_timing(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set %s timing\n", rca,
mmc_timing_to_string(timing));
continue;
}
if (timing == bus_timing_mmc_ddr52) {
/*
* Set EXT_CSD_BUS_WIDTH_n_DDR in EXT_CSD_BUS_WIDTH
* (must be done after switching to EXT_CSD_HS_TIMING).
*/
if (mmc_set_card_bus_width(sc, ivar, timing) !=
MMC_ERR_NONE) {
device_printf(dev, "Card at relative address "
"%d failed to set bus width\n", rca);
continue;
}
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
if (mmc_set_vccq(sc, ivar, timing) != MMC_ERR_NONE) {
device_printf(dev, "Failed to set VCCQ for "
"card at relative address %d\n", rca);
continue;
}
}
/* Set clock (must be done before initial tuning). */
mmcbr_set_clock(dev, max_dtr);
mmcbr_update_ios(dev);
if (mmcbr_tune(dev, hs400) != 0) {
device_printf(dev, "Card at relative address %d "
"failed to execute initial tuning\n", rca);
continue;
}
if (hs400 == true && mmc_switch_to_hs400(sc, ivar, max_dtr,
max_timing) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set %s timing\n", rca,
mmc_timing_to_string(max_timing));
continue;
}
power_class:
if (mmc_set_power_class(sc, ivar) != MMC_ERR_NONE) {
device_printf(dev, "Card at relative address %d "
"failed to set power class\n", rca);
}
}
(void)mmc_select_card(sc, 0);
return (max_dtr);
}
/*
* Switch from HS200 to HS400 (either initially or for re-tuning) or directly
* to HS400ES. This follows the sequences described in "6.6.2.3 HS400 timing
* mode selection" of the eMMC specification v5.1.
*/
static int
mmc_switch_to_hs400(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock, enum mmc_bus_timing max_timing)
{
device_t dev;
int err;
uint16_t rca;
dev = sc->dev;
rca = ivar->rca;
/*
* Both clock and timing must be set as appropriate for high speed
* before eventually switching to HS400/HS400ES; mmc_set_timing()
* will issue mmcbr_update_ios().
*/
mmcbr_set_clock(dev, ivar->hs_tran_speed);
err = mmc_set_timing(sc, ivar, bus_timing_hs);
if (err != MMC_ERR_NONE)
return (err);
/*
* Set EXT_CSD_BUS_WIDTH_8_DDR in EXT_CSD_BUS_WIDTH (and additionally
* EXT_CSD_BUS_WIDTH_ES for HS400ES).
*/
err = mmc_set_card_bus_width(sc, ivar, max_timing);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_update_ios(dev);
/* Finally, switch to HS400/HS400ES mode. */
err = mmc_set_timing(sc, ivar, max_timing);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_clock(dev, clock);
mmcbr_update_ios(dev);
return (MMC_ERR_NONE);
}
/*
* Switch from HS400 to HS200 (for re-tuning).
*/
static int
mmc_switch_to_hs200(struct mmc_softc *sc, struct mmc_ivars *ivar,
uint32_t clock)
{
device_t dev;
int err;
uint16_t rca;
dev = sc->dev;
rca = ivar->rca;
/*
* Both clock and timing must initially be set as appropriate for
* DDR52 before eventually switching to HS200; mmc_set_timing()
* will issue mmcbr_update_ios().
*/
mmcbr_set_clock(dev, ivar->hs_tran_speed);
err = mmc_set_timing(sc, ivar, bus_timing_mmc_ddr52);
if (err != MMC_ERR_NONE)
return (err);
/*
* Next, switch to high speed. Thus, clear EXT_CSD_BUS_WIDTH_n_DDR
* in EXT_CSD_BUS_WIDTH and update bus width and timing in ios.
*/
err = mmc_set_card_bus_width(sc, ivar, bus_timing_hs);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_bus_width(dev, ivar->bus_width);
mmcbr_set_timing(sc->dev, bus_timing_hs);
mmcbr_update_ios(dev);
/* Finally, switch to HS200 mode. */
err = mmc_set_timing(sc, ivar, bus_timing_mmc_hs200);
if (err != MMC_ERR_NONE)
return (err);
mmcbr_set_clock(dev, clock);
mmcbr_update_ios(dev);
return (MMC_ERR_NONE);
}
static int
mmc_retune(device_t busdev, device_t dev, bool reset)
{
struct mmc_softc *sc;
struct mmc_ivars *ivar;
int err;
uint32_t clock;
enum mmc_bus_timing timing;
if (device_get_parent(dev) != busdev)
return (MMC_ERR_INVALID);
sc = device_get_softc(busdev);
if (sc->retune_needed != 1 && sc->retune_paused != 0)
return (MMC_ERR_INVALID);
timing = mmcbr_get_timing(busdev);
if (timing == bus_timing_mmc_hs400) {
/*
* Controllers use the data strobe line to latch data from
* the devices in HS400 mode so periodic re-tuning isn't
* expected to be required, i. e. only if a CRC or tuning
* error is signaled to the bridge. In these latter cases
* we are asked to reset the tuning circuit and need to do
* the switch timing dance.
*/
if (reset == false)
return (0);
ivar = device_get_ivars(dev);
clock = mmcbr_get_clock(busdev);
if (mmc_switch_to_hs200(sc, ivar, clock) != MMC_ERR_NONE)
return (MMC_ERR_BADCRC);
}
err = mmcbr_retune(busdev, reset);
if (err != 0 && timing == bus_timing_mmc_hs400)
return (MMC_ERR_BADCRC);
switch (err) {
case 0:
break;
case EIO:
return (MMC_ERR_FAILED);
default:
return (MMC_ERR_INVALID);
}
if (timing == bus_timing_mmc_hs400) {
if (mmc_switch_to_hs400(sc, ivar, clock, timing) !=
MMC_ERR_NONE)
return (MMC_ERR_BADCRC);
}
return (MMC_ERR_NONE);
}
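/*
* For HS400 the sequence above therefore is: step back down to HS200
* via mmc_switch_to_hs200(), have the bridge re-tune there, and then
* take the high-speed detour back up to HS400 via
* mmc_switch_to_hs400() at the previously used clock.
*/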
static void
mmc_retune_pause(device_t busdev, device_t dev, bool retune)
{
struct mmc_softc *sc;
sc = device_get_softc(busdev);
KASSERT(device_get_parent(dev) == busdev,
("%s: %s is not a child of %s", __func__, device_get_nameunit(dev),
device_get_nameunit(busdev)));
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
if (retune == true && sc->retune_paused == 0)
sc->retune_needed = 1;
sc->retune_paused++;
}
static void
mmc_retune_unpause(device_t busdev, device_t dev)
{
struct mmc_softc *sc;
sc = device_get_softc(busdev);
KASSERT(device_get_parent(dev) == busdev,
("%s: %s is not a child of %s", __func__, device_get_nameunit(dev),
device_get_nameunit(busdev)));
KASSERT(sc->owner != NULL,
("%s: Request from %s without bus being acquired.", __func__,
device_get_nameunit(dev)));
KASSERT(sc->retune_paused != 0,
("%s: Re-tune pause count already at 0", __func__));
sc->retune_paused--;
}
static void
mmc_scan(struct mmc_softc *sc)
{
device_t dev = sc->dev;
int err;
err = mmc_acquire_bus(dev, dev);
if (err != 0) {
device_printf(dev, "Failed to acquire bus for scanning\n");
return;
}
mmc_go_discovery(sc);
err = mmc_release_bus(dev, dev);
if (err != 0) {
device_printf(dev, "Failed to release bus after scanning\n");
return;
}
(void)bus_generic_attach(dev);
}
static int
mmc_read_ivar(device_t bus, device_t child, int which, uintptr_t *result)
{
struct mmc_ivars *ivar = device_get_ivars(child);
switch (which) {
default:
return (EINVAL);
case MMC_IVAR_SPEC_VERS:
*result = ivar->csd.spec_vers;
break;
case MMC_IVAR_DSR_IMP:
*result = ivar->csd.dsr_imp;
break;
case MMC_IVAR_MEDIA_SIZE:
*result = ivar->sec_count;
break;
case MMC_IVAR_RCA:
*result = ivar->rca;
break;
case MMC_IVAR_SECTOR_SIZE:
*result = MMC_SECTOR_SIZE;
break;
case MMC_IVAR_TRAN_SPEED:
*result = mmcbr_get_clock(bus);
break;
case MMC_IVAR_READ_ONLY:
*result = ivar->read_only;
break;
case MMC_IVAR_HIGH_CAP:
*result = ivar->high_cap;
break;
case MMC_IVAR_CARD_TYPE:
*result = ivar->mode;
break;
case MMC_IVAR_BUS_WIDTH:
*result = ivar->bus_width;
break;
case MMC_IVAR_ERASE_SECTOR:
*result = ivar->erase_sector;
break;
case MMC_IVAR_MAX_DATA:
*result = mmcbr_get_max_data(bus);
break;
case MMC_IVAR_CMD6_TIMEOUT:
*result = ivar->cmd6_time;
break;
case MMC_IVAR_QUIRKS:
*result = ivar->quirks;
break;
case MMC_IVAR_CARD_ID_STRING:
*(char **)result = ivar->card_id_string;
break;
case MMC_IVAR_CARD_SN_STRING:
*(char **)result = ivar->card_sn_string;
break;
}
return (0);
}
static int
mmc_write_ivar(device_t bus, device_t child, int which, uintptr_t value)
{
/*
* None are writable ATM
*/
return (EINVAL);
}
static void
mmc_delayed_attach(void *xsc)
{
struct mmc_softc *sc = xsc;
mmc_scan(sc);
config_intrhook_disestablish(&sc->config_intrhook);
}
static int
mmc_child_location_str(device_t dev, device_t child, char *buf,
size_t buflen)
{
snprintf(buf, buflen, "rca=0x%04x", mmc_get_rca(child));
return (0);
}
static device_method_t mmc_methods[] = {
/* device_if */
DEVMETHOD(device_probe, mmc_probe),
DEVMETHOD(device_attach, mmc_attach),
DEVMETHOD(device_detach, mmc_detach),
DEVMETHOD(device_suspend, mmc_suspend),
DEVMETHOD(device_resume, mmc_resume),
/* Bus interface */
DEVMETHOD(bus_read_ivar, mmc_read_ivar),
DEVMETHOD(bus_write_ivar, mmc_write_ivar),
DEVMETHOD(bus_child_location_str, mmc_child_location_str),
/* MMC Bus interface */
DEVMETHOD(mmcbus_retune_pause, mmc_retune_pause),
DEVMETHOD(mmcbus_retune_unpause, mmc_retune_unpause),
DEVMETHOD(mmcbus_wait_for_request, mmc_wait_for_request),
DEVMETHOD(mmcbus_acquire_bus, mmc_acquire_bus),
DEVMETHOD(mmcbus_release_bus, mmc_release_bus),
DEVMETHOD_END
};
driver_t mmc_driver = {
"mmc",
mmc_methods,
sizeof(struct mmc_softc),
};
devclass_t mmc_devclass;
MODULE_VERSION(mmc, MMC_VERSION);
Index: head/sys/dev/mmc/mmcsd.c
===================================================================
--- head/sys/dev/mmc/mmcsd.c (revision 327172)
+++ head/sys/dev/mmc/mmcsd.c (revision 327173)
@@ -1,1472 +1,1470 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006 Bernd Walter. All rights reserved.
* Copyright (c) 2006 M. Warner Losh. All rights reserved.
* Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Portions of this software may have been developed with reference to
* the SD Simplified Specification. The following disclaimer may apply:
*
* The following conditions apply to the release of the simplified
* specification ("Simplified Specification") by the SD Card Association and
* the SD Group. The Simplified Specification is a subset of the complete SD
* Specification which is owned by the SD Card Association and the SD
* Group. This Simplified Specification is provided on a non-confidential
* basis subject to the disclaimers below. Any implementation of the
* Simplified Specification may require a license from the SD Card
* Association, SD Group, SD-3C LLC or other third parties.
*
* Disclaimers:
*
* The information contained in the Simplified Specification is presented only
* as a standard specification for SD Cards and SD Host/Ancillary products and
* is provided "AS-IS" without any representations or warranties of any
* kind. No responsibility is assumed by the SD Group, SD-3C LLC or the SD
* Card Association for any damages, any infringements of patents or other
* right of the SD Group, SD-3C LLC, the SD Card Association or any third
* parties, which may result from its use. No license is granted by
* implication, estoppel or otherwise under any patent or other rights of the
* SD Group, SD-3C LLC, the SD Card Association or any third party. Nothing
* herein shall be construed as an obligation by the SD Group, the SD-3C LLC
* or the SD Card Association to disclose or distribute any technical
* information, know-how or other confidential information to any third party.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/slicer.h>
#include <sys/time.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <dev/mmc/bridge.h>
#include <dev/mmc/mmc_ioctl.h>
#include <dev/mmc/mmc_subr.h>
#include <dev/mmc/mmcbrvar.h>
#include <dev/mmc/mmcreg.h>
#include <dev/mmc/mmcvar.h>
#include "mmcbus_if.h"
#if __FreeBSD_version < 800002
#define kproc_create kthread_create
#define kproc_exit kthread_exit
#endif
#define MMCSD_CMD_RETRIES 5
#define MMCSD_FMT_BOOT "mmcsd%dboot"
#define MMCSD_FMT_GP "mmcsd%dgp"
#define MMCSD_FMT_RPMB "mmcsd%drpmb"
#define MMCSD_LABEL_ENH "enh"
#define MMCSD_PART_NAMELEN (16 + 1)
struct mmcsd_softc;
struct mmcsd_part {
struct mtx disk_mtx;
struct mtx ioctl_mtx;
struct mmcsd_softc *sc;
struct disk *disk;
struct proc *p;
struct bio_queue_head bio_queue;
daddr_t eblock, eend; /* Range remaining after the last erase. */
u_int cnt;
u_int type;
int running;
int suspend;
int ioctl;
bool ro;
char name[MMCSD_PART_NAMELEN];
};
struct mmcsd_softc {
device_t dev;
device_t mmcbus;
struct mmcsd_part *part[MMC_PART_MAX];
enum mmc_card_mode mode;
u_int max_data; /* Maximum data size [blocks] */
u_int erase_sector; /* Device native erase sector size [blocks] */
uint8_t high_cap; /* High Capacity device (block addressed) */
uint8_t part_curr; /* Partition currently switched to */
uint8_t ext_csd[MMC_EXTCSD_SIZE];
uint16_t rca;
uint32_t flags;
#define MMCSD_INAND_CMD38 0x0001
#define MMCSD_USE_TRIM 0x0002
uint32_t cmd6_time; /* Generic switch timeout [us] */
uint32_t part_time; /* Partition switch timeout [us] */
off_t enh_base; /* Enhanced user data area slice base ... */
off_t enh_size; /* ... and size [bytes] */
int log_count;
struct timeval log_time;
struct cdev *rpmb_dev;
};
static const char *errmsg[] =
{
"None",
"Timeout",
"Bad CRC",
"Fifo",
"Failed",
"Invalid",
"NO MEMORY"
};
#define LOG_PPS 5 /* Log no more than 5 errors per second. */
/* bus entry points */
static int mmcsd_attach(device_t dev);
static int mmcsd_detach(device_t dev);
static int mmcsd_probe(device_t dev);
/* disk routines */
static int mmcsd_close(struct disk *dp);
static int mmcsd_dump(void *arg, void *virtual, vm_offset_t physical,
off_t offset, size_t length);
static int mmcsd_getattr(struct bio *);
static int mmcsd_ioctl_disk(struct disk *disk, u_long cmd, void *data,
int fflag, struct thread *td);
static int mmcsd_open(struct disk *dp);
static void mmcsd_strategy(struct bio *bp);
static void mmcsd_task(void *arg);
/* RPMB cdev interface */
static int mmcsd_ioctl_rpmb(struct cdev *dev, u_long cmd, caddr_t data,
int fflag, struct thread *td);
static void mmcsd_add_part(struct mmcsd_softc *sc, u_int type,
const char *name, u_int cnt, off_t media_size, bool ro);
static int mmcsd_bus_bit_width(device_t dev);
static daddr_t mmcsd_delete(struct mmcsd_part *part, struct bio *bp);
static const char *mmcsd_errmsg(int e);
static int mmcsd_ioctl(struct mmcsd_part *part, u_long cmd, void *data,
int fflag);
static int mmcsd_ioctl_cmd(struct mmcsd_part *part, struct mmc_ioc_cmd *mic,
int fflag);
static uintmax_t mmcsd_pretty_size(off_t size, char *unit);
static daddr_t mmcsd_rw(struct mmcsd_part *part, struct bio *bp);
static int mmcsd_set_blockcount(struct mmcsd_softc *sc, u_int count, bool rel);
static int mmcsd_slicer(device_t dev, const char *provider,
struct flash_slice *slices, int *nslices);
static int mmcsd_switch_part(device_t bus, device_t dev, uint16_t rca,
u_int part);
#define MMCSD_DISK_LOCK(_part) mtx_lock(&(_part)->disk_mtx)
#define MMCSD_DISK_UNLOCK(_part) mtx_unlock(&(_part)->disk_mtx)
#define MMCSD_DISK_LOCK_INIT(_part) \
mtx_init(&(_part)->disk_mtx, (_part)->name, "mmcsd disk", MTX_DEF)
#define MMCSD_DISK_LOCK_DESTROY(_part) mtx_destroy(&(_part)->disk_mtx);
#define MMCSD_DISK_ASSERT_LOCKED(_part) \
mtx_assert(&(_part)->disk_mtx, MA_OWNED);
#define MMCSD_DISK_ASSERT_UNLOCKED(_part) \
mtx_assert(&(_part)->disk_mtx, MA_NOTOWNED);
#define MMCSD_IOCTL_LOCK(_part) mtx_lock(&(_part)->ioctl_mtx)
#define MMCSD_IOCTL_UNLOCK(_part) mtx_unlock(&(_part)->ioctl_mtx)
#define MMCSD_IOCTL_LOCK_INIT(_part) \
mtx_init(&(_part)->ioctl_mtx, (_part)->name, "mmcsd IOCTL", MTX_DEF)
#define MMCSD_IOCTL_LOCK_DESTROY(_part) mtx_destroy(&(_part)->ioctl_mtx);
#define MMCSD_IOCTL_ASSERT_LOCKED(_part) \
mtx_assert(&(_part)->ioctl_mtx, MA_OWNED);
#define MMCSD_IOCTL_ASSERT_UNLOCKED(_part) \
mtx_assert(&(_part)->ioctl_mtx, MA_NOTOWNED);
static int
mmcsd_probe(device_t dev)
{
device_quiet(dev);
device_set_desc(dev, "MMC/SD Memory Card");
return (0);
}
static int
mmcsd_attach(device_t dev)
{
device_t mmcbus;
struct mmcsd_softc *sc;
const uint8_t *ext_csd;
off_t erase_size, sector_size, size, wp_size;
uintmax_t bytes;
int err, i;
uint32_t quirks;
uint8_t rev;
bool comp, ro;
char unit[2];
sc = device_get_softc(dev);
sc->dev = dev;
sc->mmcbus = mmcbus = device_get_parent(dev);
sc->mode = mmcbr_get_mode(mmcbus);
/*
* Note that in principle with an SDHCI-like re-tuning implementation,
* the maximum data size can change at runtime due to a device removal/
* insertion that results in switches to/from a transfer mode involving
* re-tuning, iff there are multiple devices on a given bus. Until now
* mmc(4) lacks support for rescanning already attached buses, however,
* and sdhci(4) to date has no support for shared buses in the first
* place either.
*/
sc->max_data = mmc_get_max_data(dev);
sc->high_cap = mmc_get_high_cap(dev);
sc->rca = mmc_get_rca(dev);
sc->cmd6_time = mmc_get_cmd6_timeout(dev);
quirks = mmc_get_quirks(dev);
/* Only MMC >= 4.x devices support EXT_CSD. */
if (mmc_get_spec_vers(dev) >= 4) {
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmc_send_ext_csd(mmcbus, dev, sc->ext_csd);
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (err != MMC_ERR_NONE) {
device_printf(dev, "Error reading EXT_CSD %s\n",
mmcsd_errmsg(err));
return (ENXIO);
}
}
ext_csd = sc->ext_csd;
if ((quirks & MMC_QUIRK_INAND_CMD38) != 0) {
if (mmc_get_spec_vers(dev) < 4) {
device_printf(dev,
"MMC_QUIRK_INAND_CMD38 set but no EXT_CSD\n");
return (EINVAL);
}
sc->flags |= MMCSD_INAND_CMD38;
}
/*
* EXT_CSD_SEC_FEATURE_SUPPORT_GB_CL_EN denotes support for both
* insecure and secure TRIM.
*/
if ((ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] &
EXT_CSD_SEC_FEATURE_SUPPORT_GB_CL_EN) != 0 &&
(quirks & MMC_QUIRK_BROKEN_TRIM) == 0) {
if (bootverbose)
device_printf(dev, "taking advantage of TRIM\n");
sc->flags |= MMCSD_USE_TRIM;
sc->erase_sector = 1;
} else
sc->erase_sector = mmc_get_erase_sector(dev);
/*
* Enhanced user data area and general purpose partitions are only
* supported in revision 1.4 (EXT_CSD_REV == 4) and later, the RPMB
* partition in revision 1.5 (MMC v4.41, EXT_CSD_REV == 5) and later.
*/
rev = ext_csd[EXT_CSD_REV];
/*
* Ignore user-creatable enhanced user data area and general purpose
* partitions as long as partitioning hasn't been finished.
*/
comp = (ext_csd[EXT_CSD_PART_SET] & EXT_CSD_PART_SET_COMPLETED) != 0;
/*
* Add enhanced user data area slice, unless it spans the entirety of
* the user data area.  The enhanced area is a multiple of high
* capacity write protect groups ((ERASE_GRP_SIZE * HC_WP_GRP_SIZE) *
* 512 KB) in size, and its offset is given in either sectors or
* bytes, depending on whether it's a high capacity device or not.
* NB: The slicer and its slices need to be registered before adding
* the disk for the corresponding user data area as re-tasting is
* racy.
*/
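/*
 * For example (purely illustrative values): with ERASE_GRP_SIZE = 1
 * (512 KiB erase groups), HC_WP_GRP_SIZE = 16 and ENH_SIZE_MULT = 4,
 * the computation below yields 4 * (1 * 1024 * 512) * 16 = 33554432
 * bytes, i.e. a 32 MiB enhanced user data area slice.
 */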
sector_size = mmc_get_sector_size(dev);
size = ext_csd[EXT_CSD_ENH_SIZE_MULT] +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 1] << 8) +
(ext_csd[EXT_CSD_ENH_SIZE_MULT + 2] << 16);
if (rev >= 4 && comp == TRUE && size > 0 &&
(ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_ENH_ATTR_EN) != 0 &&
(ext_csd[EXT_CSD_PART_ATTR] & (EXT_CSD_PART_ATTR_ENH_USR)) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
size *= erase_size * wp_size;
if (size != mmc_get_media_size(dev) * sector_size) {
sc->enh_size = size;
sc->enh_base = (ext_csd[EXT_CSD_ENH_START_ADDR] +
(ext_csd[EXT_CSD_ENH_START_ADDR + 1] << 8) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 2] << 16) +
(ext_csd[EXT_CSD_ENH_START_ADDR + 3] << 24)) *
(sc->high_cap == 0 ? MMC_SECTOR_SIZE : 1);
} else if (bootverbose)
device_printf(dev,
"enhanced user data area spans entire device\n");
}
/*
* Add default partition. This may be the only one or the user
* data area in case partitions are supported.
*/
ro = mmc_get_read_only(dev);
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_DEFAULT, "mmcsd",
device_get_unit(dev), mmc_get_media_size(dev) * sector_size, ro);
if (mmc_get_spec_vers(dev) < 3)
return (0);
/* Belatedly announce enhanced user data slice. */
if (sc->enh_size != 0) {
bytes = mmcsd_pretty_size(size, unit);
printf(FLASH_SLICES_FMT ": %ju%sB enhanced user data area "
"slice offset 0x%jx at %s\n", device_get_nameunit(dev),
MMCSD_LABEL_ENH, bytes, unit, (uintmax_t)sc->enh_base,
device_get_nameunit(dev));
}
/*
* Determine partition switch timeout (provided in units of 10 ms)
* and ensure it's at least 300 ms as some eMMC chips lie.
*/
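/*
 * For example (illustrative value): an EXT_CSD_PART_SWITCH_TO of 10
 * means 10 * 10 ms = 100000 us, which the max() below bumps up to the
 * 300000 us floor; a value of 40 (400000 us) is kept as-is.
 */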
sc->part_time = max(ext_csd[EXT_CSD_PART_SWITCH_TO] * 10 * 1000,
300 * 1000);
/* Add boot partitions, which are of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_BOOT_SIZE_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (size > 0 && (mmcbr_get_caps(mmcbus) & MMC_CAP_BOOT_NOACC) == 0) {
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_BOOT0,
MMCSD_FMT_BOOT, 0, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT0_MASK) != 0));
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_BOOT1,
MMCSD_FMT_BOOT, 1, size,
ro | ((ext_csd[EXT_CSD_BOOT_WP_STATUS] &
EXT_CSD_BOOT_WP_STATUS_BOOT1_MASK) != 0));
}
/* Add RPMB partition, which also is of a fixed multiple of 128 KB. */
size = ext_csd[EXT_CSD_RPMB_MULT] * MMC_BOOT_RPMB_BLOCK_SIZE;
if (rev >= 5 && size > 0)
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_RPMB,
MMCSD_FMT_RPMB, 0, size, ro);
if (rev <= 3 || comp == FALSE)
return (0);
/*
* Add general purpose partitions, which are of a multiple of high
* capacity write protect groups, too.
*/
if ((ext_csd[EXT_CSD_PART_SUPPORT] & EXT_CSD_PART_SUPPORT_EN) != 0) {
erase_size = ext_csd[EXT_CSD_ERASE_GRP_SIZE] * 1024 *
MMC_SECTOR_SIZE;
wp_size = ext_csd[EXT_CSD_HC_WP_GRP_SIZE];
for (i = 0; i < MMC_PART_GP_MAX; i++) {
size = ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3] +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 1] << 8) +
(ext_csd[EXT_CSD_GP_SIZE_MULT + i * 3 + 2] << 16);
if (size == 0)
continue;
mmcsd_add_part(sc, EXT_CSD_PART_CONFIG_ACC_GP0 + i,
MMCSD_FMT_GP, i, size * erase_size * wp_size, ro);
}
}
return (0);
}
static uintmax_t
mmcsd_pretty_size(off_t size, char *unit)
{
uintmax_t bytes;
int i;
/*
* Display in the most natural units.  There's no card < 1 MB, although
* RPMB partitions occasionally are smaller than that.  The SD standard
* goes to 2 GiB due to its reliance on FAT, but the data format
* supports up to 4 GiB and some card makers push it up to this limit.
* The SDHC standard only goes to 32 GiB due to FAT32, but the data
* format supports up to 2 TiB (2048 GB isn't too ugly, so we note it
* in passing here and don't add the code to print TB).  Since these
* cards are sold in terms of MB and GB, not MiB and GiB, report them
* like that.  We also round to the nearest unit, since many cards are
* a few percent short, even of the power of 10 size.
*/
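/*
 * Worked example (illustrative): a card reporting 15931539456 bytes
 * passes through the loop as 15931539456 -> 15931539 (k) ->
 * 15932 (M) -> 16 (G) and is thus announced as "16GB".
 */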
bytes = size;
unit[0] = unit[1] = '\0';
for (i = 0; i <= 2 && bytes >= 1000; i++) {
bytes = (bytes + 1000 / 2 - 1) / 1000;
switch (i) {
case 0:
unit[0] = 'k';
break;
case 1:
unit[0] = 'M';
break;
case 2:
unit[0] = 'G';
break;
default:
break;
}
}
return (bytes);
}
static struct cdevsw mmcsd_rpmb_cdevsw = {
.d_version = D_VERSION,
.d_name = "mmcsdrpmb",
.d_ioctl = mmcsd_ioctl_rpmb
};
static void
mmcsd_add_part(struct mmcsd_softc *sc, u_int type, const char *name, u_int cnt,
off_t media_size, bool ro)
{
struct make_dev_args args;
device_t dev, mmcbus;
const char *ext;
const uint8_t *ext_csd;
struct mmcsd_part *part;
struct disk *d;
uintmax_t bytes;
u_int gp;
uint32_t speed;
uint8_t extattr;
bool enh;
char unit[2];
dev = sc->dev;
mmcbus = sc->mmcbus;
part = sc->part[type] = malloc(sizeof(*part), M_DEVBUF,
M_WAITOK | M_ZERO);
part->sc = sc;
part->cnt = cnt;
part->type = type;
part->ro = ro;
snprintf(part->name, sizeof(part->name), name, device_get_unit(dev));
MMCSD_IOCTL_LOCK_INIT(part);
/*
* For the RPMB partition, allow IOCTL access only.
* NB: If ever attaching RPMB partitions to disk(9), the re-tuning
* implementation and especially its pausing need to be revisited,
* because then re-tuning requests may be issued by the IOCTL half
* of this driver while re-tuning is already paused by the disk(9)
* one and vice versa.
*/
if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
make_dev_args_init(&args);
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
args.mda_devsw = &mmcsd_rpmb_cdevsw;
args.mda_uid = UID_ROOT;
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv1 = part;
if (make_dev_s(&args, &sc->rpmb_dev, "%s", part->name) != 0) {
device_printf(dev, "Failed to make RPMB device\n");
free(part, M_DEVBUF);
return;
}
} else {
MMCSD_DISK_LOCK_INIT(part);
d = part->disk = disk_alloc();
d->d_open = mmcsd_open;
d->d_close = mmcsd_close;
d->d_strategy = mmcsd_strategy;
d->d_ioctl = mmcsd_ioctl_disk;
d->d_dump = mmcsd_dump;
d->d_getattr = mmcsd_getattr;
d->d_name = part->name;
d->d_drv1 = part;
d->d_sectorsize = mmc_get_sector_size(dev);
d->d_maxsize = sc->max_data * d->d_sectorsize;
d->d_mediasize = media_size;
d->d_stripesize = sc->erase_sector * d->d_sectorsize;
d->d_unit = cnt;
d->d_flags = DISKFLAG_CANDELETE;
d->d_delmaxsize = mmc_get_erase_sector(dev) * d->d_sectorsize;
strlcpy(d->d_ident, mmc_get_card_sn_string(dev),
sizeof(d->d_ident));
strlcpy(d->d_descr, mmc_get_card_id_string(dev),
sizeof(d->d_descr));
d->d_rotation_rate = DISK_RR_NON_ROTATING;
disk_create(d, DISK_VERSION);
bioq_init(&part->bio_queue);
part->running = 1;
kproc_create(&mmcsd_task, part, &part->p, 0, 0,
"%s%d: mmc/sd card", part->name, cnt);
}
bytes = mmcsd_pretty_size(media_size, unit);
if (type == EXT_CSD_PART_CONFIG_ACC_DEFAULT) {
speed = mmcbr_get_clock(mmcbus);
printf("%s%d: %ju%sB <%s>%s at %s %d.%01dMHz/%dbit/%d-block\n",
part->name, cnt, bytes, unit, mmc_get_card_id_string(dev),
ro ? " (read-only)" : "", device_get_nameunit(mmcbus),
speed / 1000000, (speed / 100000) % 10,
mmcsd_bus_bit_width(dev), sc->max_data);
} else if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
printf("%s: %ju%sB partion %d%s at %s\n", part->name, bytes,
unit, type, ro ? " (read-only)" : "",
device_get_nameunit(dev));
} else {
enh = false;
ext = NULL;
extattr = 0;
if (type >= EXT_CSD_PART_CONFIG_ACC_GP0 &&
type <= EXT_CSD_PART_CONFIG_ACC_GP3) {
ext_csd = sc->ext_csd;
gp = type - EXT_CSD_PART_CONFIG_ACC_GP0;
if ((ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_ENH_ATTR_EN) != 0 &&
(ext_csd[EXT_CSD_PART_ATTR] &
(EXT_CSD_PART_ATTR_ENH_GP0 << gp)) != 0)
enh = true;
else if ((ext_csd[EXT_CSD_PART_SUPPORT] &
EXT_CSD_PART_SUPPORT_EXT_ATTR_EN) != 0) {
extattr = (ext_csd[EXT_CSD_EXT_PART_ATTR +
(gp / 2)] >> (4 * (gp % 2))) & 0xF;
switch (extattr) {
case EXT_CSD_EXT_PART_ATTR_DEFAULT:
break;
case EXT_CSD_EXT_PART_ATTR_SYSTEMCODE:
ext = "system code";
break;
case EXT_CSD_EXT_PART_ATTR_NPERSISTENT:
ext = "non-persistent";
break;
default:
ext = "reserved";
break;
}
}
}
if (ext == NULL)
printf("%s%d: %ju%sB partion %d%s%s at %s\n",
part->name, cnt, bytes, unit, type, enh ?
" enhanced" : "", ro ? " (read-only)" : "",
device_get_nameunit(dev));
else
printf("%s%d: %ju%sB partion %d extended 0x%x "
"(%s)%s at %s\n", part->name, cnt, bytes, unit,
type, extattr, ext, ro ? " (read-only)" : "",
device_get_nameunit(dev));
}
}
static int
mmcsd_slicer(device_t dev, const char *provider,
struct flash_slice *slices, int *nslices)
{
char name[MMCSD_PART_NAMELEN];
struct mmcsd_softc *sc;
struct mmcsd_part *part;
*nslices = 0;
if (slices == NULL)
return (ENOMEM);
sc = device_get_softc(dev);
if (sc->enh_size == 0)
return (ENXIO);
part = sc->part[EXT_CSD_PART_CONFIG_ACC_DEFAULT];
snprintf(name, sizeof(name), "%s%d", part->disk->d_name,
part->disk->d_unit);
if (strcmp(name, provider) != 0)
return (ENXIO);
*nslices = 1;
slices[0].base = sc->enh_base;
slices[0].size = sc->enh_size;
slices[0].label = MMCSD_LABEL_ENH;
return (0);
}
static int
mmcsd_detach(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 0;
if (part->running > 0) {
/* kill thread */
part->running = 0;
wakeup(part);
/* wait for thread to finish. */
while (part->running != -1)
msleep(part, &part->disk_mtx, 0,
"mmcsd disk detach", 0);
}
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
while (part->ioctl > 0)
msleep(part, &part->ioctl_mtx, 0,
"mmcsd IOCTL detach", 0);
part->ioctl = -1;
MMCSD_IOCTL_UNLOCK(part);
}
}
if (sc->rpmb_dev != NULL)
destroy_dev(sc->rpmb_dev);
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
/* Flush the request queue. */
bioq_flush(&part->bio_queue, NULL, ENXIO);
/* kill disk */
disk_destroy(part->disk);
MMCSD_DISK_LOCK_DESTROY(part);
}
MMCSD_IOCTL_LOCK_DESTROY(part);
free(part, M_DEVBUF);
}
}
return (0);
}
static int
mmcsd_suspend(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 1;
if (part->running > 0) {
/* kill thread */
part->running = 0;
wakeup(part);
/* wait for thread to finish. */
while (part->running != -1)
msleep(part, &part->disk_mtx, 0,
"mmcsd disk suspension", 0);
}
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
while (part->ioctl > 0)
msleep(part, &part->ioctl_mtx, 0,
"mmcsd IOCTL suspension", 0);
part->ioctl = -1;
MMCSD_IOCTL_UNLOCK(part);
}
}
return (0);
}
static int
mmcsd_resume(device_t dev)
{
struct mmcsd_softc *sc = device_get_softc(dev);
struct mmcsd_part *part;
int i;
for (i = 0; i < MMC_PART_MAX; i++) {
part = sc->part[i];
if (part != NULL) {
if (part->disk != NULL) {
MMCSD_DISK_LOCK(part);
part->suspend = 0;
if (part->running <= 0) {
part->running = 1;
MMCSD_DISK_UNLOCK(part);
kproc_create(&mmcsd_task, part,
&part->p, 0, 0, "%s%d: mmc/sd card",
part->name, part->cnt);
} else
MMCSD_DISK_UNLOCK(part);
}
MMCSD_IOCTL_LOCK(part);
part->ioctl = 0;
MMCSD_IOCTL_UNLOCK(part);
}
}
return (0);
}
static int
mmcsd_open(struct disk *dp __unused)
{
return (0);
}
static int
mmcsd_close(struct disk *dp __unused)
{
return (0);
}
static void
mmcsd_strategy(struct bio *bp)
{
- struct mmcsd_softc *sc;
struct mmcsd_part *part;
part = bp->bio_disk->d_drv1;
- sc = part->sc;
MMCSD_DISK_LOCK(part);
if (part->running > 0 || part->suspend > 0) {
bioq_disksort(&part->bio_queue, bp);
MMCSD_DISK_UNLOCK(part);
wakeup(part);
} else {
MMCSD_DISK_UNLOCK(part);
biofinish(bp, NULL, ENXIO);
}
}
static int
mmcsd_ioctl_rpmb(struct cdev *dev, u_long cmd, caddr_t data,
int fflag, struct thread *td __unused)
{
return (mmcsd_ioctl(dev->si_drv1, cmd, data, fflag));
}
static int
mmcsd_ioctl_disk(struct disk *disk, u_long cmd, void *data, int fflag,
struct thread *td __unused)
{
return (mmcsd_ioctl(disk->d_drv1, cmd, data, fflag));
}
static int
mmcsd_ioctl(struct mmcsd_part *part, u_long cmd, void *data, int fflag)
{
struct mmc_ioc_cmd *mic;
struct mmc_ioc_multi_cmd *mimc;
int i, err;
u_long cnt, size;
if ((fflag & FREAD) == 0)
return (EBADF);
err = 0;
switch (cmd) {
case MMC_IOC_CMD:
mic = data;
err = mmcsd_ioctl_cmd(part, mic, fflag);
break;
case MMC_IOC_MULTI_CMD:
mimc = data;
if (mimc->num_of_cmds == 0)
break;
if (mimc->num_of_cmds > MMC_IOC_MAX_CMDS)
return (EINVAL);
cnt = mimc->num_of_cmds;
size = sizeof(*mic) * cnt;
mic = malloc(size, M_TEMP, M_WAITOK);
err = copyin((const void *)mimc->cmds, mic, size);
if (err == 0) {
for (i = 0; i < cnt; i++) {
err = mmcsd_ioctl_cmd(part, &mic[i], fflag);
if (err != 0)
break;
}
}
free(mic, M_TEMP);
break;
default:
return (ENOIOCTL);
}
return (err);
}
static int
mmcsd_ioctl_cmd(struct mmcsd_part *part, struct mmc_ioc_cmd *mic, int fflag)
{
struct mmc_command cmd;
struct mmc_data data;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
void *dp;
u_long len;
int err, retries;
uint32_t status;
uint16_t rca;
if ((fflag & FWRITE) == 0 && mic->write_flag != 0)
return (EBADF);
if (part->ro == TRUE && mic->write_flag != 0)
return (EROFS);
/*
* We don't need to explicitly lock against the disk(9) half of this
* driver as MMCBUS_ACQUIRE_BUS() will serialize us. However, it's
* necessary to protect against races with detachment and suspension,
* especially since it's required to switch away from RPMB partitions
* again after an access (see mmcsd_switch_part()).
*/
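/*
 * part->ioctl is a small state indicator protected by ioctl_mtx:
 * 0 means idle, 1 means an IOCTL is in flight and -1 means the
 * partition has been detached or suspended; waiters sleep on part.
 */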
MMCSD_IOCTL_LOCK(part);
while (part->ioctl != 0) {
if (part->ioctl < 0) {
MMCSD_IOCTL_UNLOCK(part);
return (ENXIO);
}
msleep(part, &part->ioctl_mtx, 0, "mmcsd IOCTL", 0);
}
part->ioctl = 1;
MMCSD_IOCTL_UNLOCK(part);
err = 0;
dp = NULL;
len = mic->blksz * mic->blocks;
if (len > MMC_IOC_MAX_BYTES) {
err = EOVERFLOW;
goto out;
}
if (len != 0) {
dp = malloc(len, M_TEMP, M_WAITOK);
err = copyin((void *)(uintptr_t)mic->data_ptr, dp, len);
if (err != 0)
goto out;
}
memset(&cmd, 0, sizeof(cmd));
memset(&data, 0, sizeof(data));
cmd.opcode = mic->opcode;
cmd.arg = mic->arg;
cmd.flags = mic->flags;
if (len != 0) {
data.len = len;
data.data = dp;
data.flags = mic->write_flag != 0 ? MMC_DATA_WRITE :
MMC_DATA_READ;
cmd.data = &data;
}
sc = part->sc;
rca = sc->rca;
if (mic->is_acmd == 0) {
/* Enforce/patch/restrict RCA-based commands */
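/*
 * Commands that would alter the bus state as seen by the kernel
 * (CMD3/CMD7) are rejected with EPERM; for the listed addressed
 * commands the RCA field in bits [31:16] of the argument is
 * rewritten to the RCA the stack assigned to this device.
 */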
switch (cmd.opcode) {
case MMC_SET_RELATIVE_ADDR:
case MMC_SELECT_CARD:
err = EPERM;
goto out;
case MMC_STOP_TRANSMISSION:
if ((cmd.arg & 0x1) == 0)
break;
/* FALLTHROUGH */
case MMC_SLEEP_AWAKE:
case MMC_SEND_CSD:
case MMC_SEND_CID:
case MMC_SEND_STATUS:
case MMC_GO_INACTIVE_STATE:
case MMC_FAST_IO:
case MMC_APP_CMD:
cmd.arg = (cmd.arg & 0x0000FFFF) | (rca << 16);
break;
default:
break;
}
}
dev = sc->dev;
mmcbus = sc->mmcbus;
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmcsd_switch_part(mmcbus, dev, rca, part->type);
if (err != MMC_ERR_NONE)
goto release;
if (part->type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
err = mmcsd_set_blockcount(sc, mic->blocks,
mic->write_flag & (1 << 31));
if (err != MMC_ERR_NONE)
goto switch_back;
}
if (mic->is_acmd != 0)
(void)mmc_wait_for_app_cmd(mmcbus, dev, rca, &cmd, 0);
else
(void)mmc_wait_for_cmd(mmcbus, dev, &cmd, 0);
if (part->type == EXT_CSD_PART_CONFIG_ACC_RPMB) {
/*
* If the request went to the RPMB partition, try to ensure
* that the command actually has completed ...
*/
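/*
 * Poll CMD13 (SEND_STATUS), with a 1 ms delay between attempts,
 * until the device reports no error bits and has left the
 * programming state, giving up after MMCSD_CMD_RETRIES retries.
 */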
retries = MMCSD_CMD_RETRIES;
do {
err = mmc_send_status(mmcbus, dev, rca, &status);
if (err != MMC_ERR_NONE)
break;
if (R1_STATUS(status) == 0 &&
R1_CURRENT_STATE(status) != R1_STATE_PRG)
break;
DELAY(1000);
} while (retries-- > 0);
switch_back:
/* ... and always switch back to the default partition. */
err = mmcsd_switch_part(mmcbus, dev, rca,
EXT_CSD_PART_CONFIG_ACC_DEFAULT);
if (err != MMC_ERR_NONE)
goto release;
}
/*
* If EXT_CSD was changed, our cached copy is outdated now, in
* particular the bits of EXT_CSD_PART_CONFIG consulted by
* mmcsd_switch_part(), so retrieve EXT_CSD again.
*/
if (cmd.opcode == MMC_SWITCH_FUNC) {
err = mmc_send_ext_csd(mmcbus, dev, sc->ext_csd);
if (err != MMC_ERR_NONE)
goto release;
}
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (cmd.error != MMC_ERR_NONE) {
switch (cmd.error) {
case MMC_ERR_TIMEOUT:
err = ETIMEDOUT;
break;
case MMC_ERR_BADCRC:
err = EILSEQ;
break;
case MMC_ERR_INVALID:
err = EINVAL;
break;
case MMC_ERR_NO_MEMORY:
err = ENOMEM;
break;
default:
err = EIO;
break;
}
goto out;
}
memcpy(mic->response, cmd.resp, 4 * sizeof(uint32_t));
if (mic->write_flag == 0 && len != 0) {
err = copyout(dp, (void *)(uintptr_t)mic->data_ptr, len);
if (err != 0)
goto out;
}
goto out;
release:
MMCBUS_RELEASE_BUS(mmcbus, dev);
err = EIO;
out:
MMCSD_IOCTL_LOCK(part);
part->ioctl = 0;
MMCSD_IOCTL_UNLOCK(part);
wakeup(part);
if (dp != NULL)
free(dp, M_TEMP);
return (err);
}
static int
mmcsd_getattr(struct bio *bp)
{
struct mmcsd_part *part;
device_t dev;
if (strcmp(bp->bio_attribute, "MMC::device") == 0) {
if (bp->bio_length != sizeof(dev))
return (EFAULT);
part = bp->bio_disk->d_drv1;
dev = part->sc->dev;
bcopy(&dev, bp->bio_data, sizeof(dev));
bp->bio_completed = bp->bio_length;
return (0);
}
return (-1);
}
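/*
 * Issue CMD23 (SET_BLOCK_COUNT): the block count goes into bits
 * [15:0] of the argument and bit 31 requests a reliable write,
 * which the RPMB IOCTL path above passes through from userland
 * (bit 31 of mic->write_flag).
 */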
static int
mmcsd_set_blockcount(struct mmcsd_softc *sc, u_int count, bool reliable)
{
struct mmc_command cmd;
struct mmc_request req;
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
cmd.mrq = &req;
req.cmd = &cmd;
cmd.opcode = MMC_SET_BLOCK_COUNT;
cmd.arg = count & 0x0000FFFF;
if (reliable)
cmd.arg |= 1 << 31;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(sc->mmcbus, sc->dev, &req);
return (cmd.error);
}
static int
mmcsd_switch_part(device_t bus, device_t dev, uint16_t rca, u_int part)
{
struct mmcsd_softc *sc;
int err;
uint8_t value;
sc = device_get_softc(dev);
if (sc->mode == mode_sd)
return (MMC_ERR_NONE);
/*
* According to section "6.2.2 Command restrictions" of the eMMC
* specification v5.1, CMD19/CMD21 aren't allowed to be used with
* RPMB partitions. So we pause re-tuning along with triggering
* it up-front to decrease the likelihood of re-tuning becoming
* necessary while accessing an RPMB partition. Consequently, an
* RPMB partition should immediately be switched away from again
* after an access in order to allow for re-tuning to take place
* anew.
*/
if (part == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_PAUSE(sc->mmcbus, sc->dev, true);
if (sc->part_curr == part)
return (MMC_ERR_NONE);
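/*
 * Only the partition access bits of PART_CONFIG are replaced below;
 * all other bits of the register are carried over unchanged from our
 * cached EXT_CSD copy.
 */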
value = (sc->ext_csd[EXT_CSD_PART_CONFIG] &
~EXT_CSD_PART_CONFIG_ACC_MASK) | part;
/* Jump! */
err = mmc_switch(bus, dev, rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_PART_CONFIG, value, sc->part_time, true);
if (err != MMC_ERR_NONE) {
if (part == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_UNPAUSE(sc->mmcbus, sc->dev);
return (err);
}
sc->ext_csd[EXT_CSD_PART_CONFIG] = value;
if (sc->part_curr == EXT_CSD_PART_CONFIG_ACC_RPMB)
MMCBUS_RETUNE_UNPAUSE(sc->mmcbus, sc->dev);
sc->part_curr = part;
return (MMC_ERR_NONE);
}
static const char *
mmcsd_errmsg(int e)
{
if (e < 0 || e > MMC_ERR_MAX)
return "Bad error code";
return (errmsg[e]);
}
static daddr_t
mmcsd_rw(struct mmcsd_part *part, struct bio *bp)
{
daddr_t block, end;
struct mmc_command cmd;
struct mmc_command stop;
struct mmc_request req;
struct mmc_data data;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
u_int numblocks, sz;
char *vaddr;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
block = bp->bio_pblkno;
sz = part->disk->d_sectorsize;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
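/*
 * Carve the request up into chunks of at most sc->max_data sectors.
 * Multi-block chunks are issued as READ/WRITE_MULTIPLE_BLOCK with a
 * STOP_TRANSMISSION attached to the request; for byte-addressed
 * (non-high-capacity) devices the block number is converted to a
 * byte address by shifting left by 9.
 */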
while (block < end) {
vaddr = bp->bio_data + (block - bp->bio_pblkno) * sz;
numblocks = min(end - block, sc->max_data);
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
memset(&stop, 0, sizeof(stop));
memset(&data, 0, sizeof(data));
cmd.mrq = &req;
req.cmd = &cmd;
cmd.data = &data;
if (bp->bio_cmd == BIO_READ) {
if (numblocks > 1)
cmd.opcode = MMC_READ_MULTIPLE_BLOCK;
else
cmd.opcode = MMC_READ_SINGLE_BLOCK;
} else {
if (numblocks > 1)
cmd.opcode = MMC_WRITE_MULTIPLE_BLOCK;
else
cmd.opcode = MMC_WRITE_BLOCK;
}
cmd.arg = block;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
data.data = vaddr;
data.mrq = &req;
if (bp->bio_cmd == BIO_READ)
data.flags = MMC_DATA_READ;
else
data.flags = MMC_DATA_WRITE;
data.len = numblocks * sz;
if (numblocks > 1) {
data.flags |= MMC_DATA_MULTI;
stop.opcode = MMC_STOP_TRANSMISSION;
stop.arg = 0;
stop.flags = MMC_RSP_R1B | MMC_CMD_AC;
stop.mrq = &req;
req.stop = &stop;
}
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count,
LOG_PPS))
device_printf(dev, "Error indicated: %d %s\n",
req.cmd->error,
mmcsd_errmsg(req.cmd->error));
break;
}
block += numblocks;
}
return (block);
}
static daddr_t
mmcsd_delete(struct mmcsd_part *part, struct bio *bp)
{
daddr_t block, end, start, stop;
struct mmc_command cmd;
struct mmc_request req;
struct mmcsd_softc *sc;
device_t dev, mmcbus;
u_int erase_sector, sz;
int err;
bool use_trim;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
block = bp->bio_pblkno;
sz = part->disk->d_sectorsize;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
use_trim = sc->flags & MMCSD_USE_TRIM;
if (use_trim == true) {
start = block;
stop = end;
} else {
/* Coalesce with the remainder of the previous request. */
if (block > part->eblock && block <= part->eend)
block = part->eblock;
if (end >= part->eblock && end < part->eend)
end = part->eend;
/* Safely round to the erase sector boundaries. */
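/*
 * For example (illustrative numbers, no leftover from a previous
 * delete): with erase_sector = 1024 and a request covering blocks
 * 1000-4999 (end = 5000), start rounds up to 1024 and stop rounds
 * down to 4096, so blocks 1024-4095 get erased now while the tail
 * 4096-4999 is remembered in eblock/eend for coalescing with a
 * future adjacent delete; the head 1000-1023 is left alone.
 */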
erase_sector = sc->erase_sector;
start = block + erase_sector - 1; /* Round up. */
start -= start % erase_sector;
stop = end; /* Round down. */
stop -= end % erase_sector;
/*
* We can't erase an area smaller than an erase sector, so
* store it for later.
*/
if (start >= stop) {
part->eblock = block;
part->eend = end;
return (end);
}
}
if ((sc->flags & MMCSD_INAND_CMD38) != 0) {
err = mmc_switch(mmcbus, dev, sc->rca, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_INAND_CMD38, use_trim == true ?
EXT_CSD_INAND_CMD38_TRIM : EXT_CSD_INAND_CMD38_ERASE,
sc->cmd6_time, true);
if (err != MMC_ERR_NONE) {
device_printf(dev,
"Setting iNAND erase command failed %s\n",
mmcsd_errmsg(err));
return (block);
}
}
/*
* Pause re-tuning so it won't interfere with the sequence of erase
* commands.  Note that the latter don't use the data lines, so
* re-tuning shouldn't actually become necessary during the erase.
*/
MMCBUS_RETUNE_PAUSE(mmcbus, dev, false);
/* Set erase start position. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
cmd.mrq = &req;
req.cmd = &cmd;
if (mmc_get_card_type(dev) == mode_sd)
cmd.opcode = SD_ERASE_WR_BLK_START;
else
cmd.opcode = MMC_ERASE_GROUP_START;
cmd.arg = start;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Setting erase start position failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
/* Set erase stop position. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
req.cmd = &cmd;
if (mmc_get_card_type(dev) == mode_sd)
cmd.opcode = SD_ERASE_WR_BLK_END;
else
cmd.opcode = MMC_ERASE_GROUP_END;
cmd.arg = stop;
if (sc->high_cap == 0)
cmd.arg <<= 9;
cmd.arg--;
cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Setting erase stop position failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
/* Erase range. */
memset(&req, 0, sizeof(req));
memset(&cmd, 0, sizeof(cmd));
req.cmd = &cmd;
cmd.opcode = MMC_ERASE;
cmd.arg = use_trim == true ? MMC_ERASE_TRIM : MMC_ERASE_ERASE;
cmd.flags = MMC_RSP_R1B | MMC_CMD_AC;
MMCBUS_WAIT_FOR_REQUEST(mmcbus, dev, &req);
if (req.cmd->error != MMC_ERR_NONE) {
device_printf(dev, "Issuing erase command failed %s\n",
mmcsd_errmsg(req.cmd->error));
block = bp->bio_pblkno;
goto unpause;
}
if (use_trim == false) {
/* Store one of the remaining parts for the next call. */
if (bp->bio_pblkno >= part->eblock || block == start) {
part->eblock = stop; /* Predict next forward. */
part->eend = end;
} else {
part->eblock = block; /* Predict next backward. */
part->eend = start;
}
}
block = end;
unpause:
MMCBUS_RETUNE_UNPAUSE(mmcbus, dev);
return (block);
}
static int
mmcsd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
size_t length)
{
struct bio bp;
daddr_t block, end;
struct disk *disk;
struct mmcsd_softc *sc;
struct mmcsd_part *part;
device_t dev, mmcbus;
int err;
/* length zero is special and really means flush buffers to media */
if (!length)
return (0);
disk = arg;
part = disk->d_drv1;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
g_reset_bio(&bp);
bp.bio_disk = disk;
bp.bio_pblkno = offset / disk->d_sectorsize;
bp.bio_bcount = length;
bp.bio_data = virtual;
bp.bio_cmd = BIO_WRITE;
end = bp.bio_pblkno + bp.bio_bcount / disk->d_sectorsize;
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
err = mmcsd_switch_part(mmcbus, dev, sc->rca, part->type);
if (err != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count, LOG_PPS))
device_printf(dev, "Partition switch error\n");
MMCBUS_RELEASE_BUS(mmcbus, dev);
return (EIO);
}
block = mmcsd_rw(part, &bp);
MMCBUS_RELEASE_BUS(mmcbus, dev);
return ((end < block) ? EIO : 0);
}
static void
mmcsd_task(void *arg)
{
daddr_t block, end;
struct mmcsd_part *part;
struct mmcsd_softc *sc;
struct bio *bp;
device_t dev, mmcbus;
int err, sz;
part = arg;
sc = part->sc;
dev = sc->dev;
mmcbus = sc->mmcbus;
while (1) {
MMCSD_DISK_LOCK(part);
do {
if (part->running == 0)
goto out;
bp = bioq_takefirst(&part->bio_queue);
if (bp == NULL)
msleep(part, &part->disk_mtx, PRIBIO,
"mmcsd disk jobqueue", 0);
} while (bp == NULL);
MMCSD_DISK_UNLOCK(part);
if (bp->bio_cmd != BIO_READ && part->ro) {
bp->bio_error = EROFS;
bp->bio_resid = bp->bio_bcount;
bp->bio_flags |= BIO_ERROR;
biodone(bp);
continue;
}
MMCBUS_ACQUIRE_BUS(mmcbus, dev);
sz = part->disk->d_sectorsize;
block = bp->bio_pblkno;
end = bp->bio_pblkno + (bp->bio_bcount / sz);
err = mmcsd_switch_part(mmcbus, dev, sc->rca, part->type);
if (err != MMC_ERR_NONE) {
if (ppsratecheck(&sc->log_time, &sc->log_count,
LOG_PPS))
device_printf(dev, "Partition switch error\n");
goto release;
}
if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
/* Access to the remaining erase block obsoletes it. */
if (block < part->eend && end > part->eblock)
part->eblock = part->eend = 0;
block = mmcsd_rw(part, bp);
} else if (bp->bio_cmd == BIO_DELETE) {
block = mmcsd_delete(part, bp);
}
release:
MMCBUS_RELEASE_BUS(mmcbus, dev);
if (block < end) {
bp->bio_error = EIO;
bp->bio_resid = (end - block) * sz;
bp->bio_flags |= BIO_ERROR;
} else {
bp->bio_resid = 0;
}
biodone(bp);
}
out:
/* tell parent we're done */
part->running = -1;
MMCSD_DISK_UNLOCK(part);
wakeup(part);
kproc_exit(0);
}
static int
mmcsd_bus_bit_width(device_t dev)
{
if (mmc_get_bus_width(dev) == bus_width_1)
return (1);
if (mmc_get_bus_width(dev) == bus_width_4)
return (4);
return (8);
}
static device_method_t mmcsd_methods[] = {
DEVMETHOD(device_probe, mmcsd_probe),
DEVMETHOD(device_attach, mmcsd_attach),
DEVMETHOD(device_detach, mmcsd_detach),
DEVMETHOD(device_suspend, mmcsd_suspend),
DEVMETHOD(device_resume, mmcsd_resume),
DEVMETHOD_END
};
static driver_t mmcsd_driver = {
"mmcsd",
mmcsd_methods,
sizeof(struct mmcsd_softc),
};
static devclass_t mmcsd_devclass;
static int
mmcsd_handler(module_t mod __unused, int what, void *arg __unused)
{
switch (what) {
case MOD_LOAD:
flash_register_slicer(mmcsd_slicer, FLASH_SLICES_TYPE_MMC,
TRUE);
return (0);
case MOD_UNLOAD:
flash_register_slicer(NULL, FLASH_SLICES_TYPE_MMC, TRUE);
return (0);
}
return (0);
}
DRIVER_MODULE(mmcsd, mmc, mmcsd_driver, mmcsd_devclass, mmcsd_handler, NULL);
MODULE_DEPEND(mmcsd, g_flashmap, 0, 0, 0);
MMC_DEPEND(mmcsd);
Index: head/sys/dev/ofw/ofw_bus_subr.c
===================================================================
--- head/sys/dev/ofw/ofw_bus_subr.c (revision 327172)
+++ head/sys/dev/ofw/ofw_bus_subr.c (revision 327173)
@@ -1,964 +1,962 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2001 - 2003 by Thomas Moestl <tmm@FreeBSD.org>.
* Copyright (c) 2005 Marius Strobl <marius@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/errno.h>
#include <sys/libkern.h>
#include <machine/resource.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/openfirm.h>
#include "ofw_bus_if.h"
#define OFW_COMPAT_LEN 255
#define OFW_STATUS_LEN 16
int
ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *obd, phandle_t node)
{
if (obd == NULL)
return (ENOMEM);
/* The 'name' property is considered mandatory. */
if ((OF_getprop_alloc(node, "name", 1, (void **)&obd->obd_name)) == -1)
return (EINVAL);
OF_getprop_alloc(node, "compatible", 1, (void **)&obd->obd_compat);
OF_getprop_alloc(node, "device_type", 1, (void **)&obd->obd_type);
OF_getprop_alloc(node, "model", 1, (void **)&obd->obd_model);
OF_getprop_alloc(node, "status", 1, (void **)&obd->obd_status);
obd->obd_node = node;
return (0);
}
void
ofw_bus_gen_destroy_devinfo(struct ofw_bus_devinfo *obd)
{
if (obd == NULL)
return;
if (obd->obd_compat != NULL)
free(obd->obd_compat, M_OFWPROP);
if (obd->obd_model != NULL)
free(obd->obd_model, M_OFWPROP);
if (obd->obd_name != NULL)
free(obd->obd_name, M_OFWPROP);
if (obd->obd_type != NULL)
free(obd->obd_type, M_OFWPROP);
if (obd->obd_status != NULL)
free(obd->obd_status, M_OFWPROP);
}
int
ofw_bus_gen_child_pnpinfo_str(device_t cbdev, device_t child, char *buf,
size_t buflen)
{
if (ofw_bus_get_name(child) != NULL) {
strlcat(buf, "name=", buflen);
strlcat(buf, ofw_bus_get_name(child), buflen);
}
if (ofw_bus_get_compat(child) != NULL) {
strlcat(buf, " compat=", buflen);
strlcat(buf, ofw_bus_get_compat(child), buflen);
}
return (0);
};
const char *
ofw_bus_gen_get_compat(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_compat);
}
const char *
ofw_bus_gen_get_model(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_model);
}
const char *
ofw_bus_gen_get_name(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_name);
}
phandle_t
ofw_bus_gen_get_node(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (0);
return (obd->obd_node);
}
const char *
ofw_bus_gen_get_type(device_t bus, device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(bus, dev);
if (obd == NULL)
return (NULL);
return (obd->obd_type);
}
const char *
ofw_bus_get_status(device_t dev)
{
const struct ofw_bus_devinfo *obd;
obd = OFW_BUS_GET_DEVINFO(device_get_parent(dev), dev);
if (obd == NULL)
return (NULL);
return (obd->obd_status);
}
int
ofw_bus_status_okay(device_t dev)
{
const char *status;
status = ofw_bus_get_status(dev);
if (status == NULL || strcmp(status, "okay") == 0 ||
strcmp(status, "ok") == 0)
return (1);
return (0);
}
int
ofw_bus_node_status_okay(phandle_t node)
{
char status[OFW_STATUS_LEN];
int len;
len = OF_getproplen(node, "status");
if (len <= 0)
return (1);
OF_getprop(node, "status", status, OFW_STATUS_LEN);
if ((len == 5 && (bcmp(status, "okay", len) == 0)) ||
(len == 3 && (bcmp(status, "ok", len) == 0)))
return (1);
return (0);
}
static int
ofw_bus_node_is_compatible_int(const char *compat, int len,
const char *onecompat)
{
int onelen, l, ret;
onelen = strlen(onecompat);
ret = 0;
while (len > 0) {
if (strlen(compat) == onelen &&
strncasecmp(compat, onecompat, onelen) == 0) {
/* Found it. */
ret = 1;
break;
}
/* Slide to the next sub-string. */
l = strlen(compat) + 1;
compat += l;
len -= l;
}
return (ret);
}
int
ofw_bus_node_is_compatible(phandle_t node, const char *compatstr)
{
char compat[OFW_COMPAT_LEN];
int len, rv;
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
bzero(compat, OFW_COMPAT_LEN);
if (OF_getprop(node, "compatible", compat, OFW_COMPAT_LEN) < 0)
return (0);
rv = ofw_bus_node_is_compatible_int(compat, len, compatstr);
return (rv);
}
int
ofw_bus_is_compatible(device_t dev, const char *onecompat)
{
phandle_t node;
const char *compat;
int len;
if ((compat = ofw_bus_get_compat(dev)) == NULL)
return (0);
if ((node = ofw_bus_get_node(dev)) == -1)
return (0);
/* Get total 'compatible' prop len */
if ((len = OF_getproplen(node, "compatible")) <= 0)
return (0);
return (ofw_bus_node_is_compatible_int(compat, len, onecompat));
}
int
ofw_bus_is_compatible_strict(device_t dev, const char *compatible)
{
const char *compat;
size_t len;
if ((compat = ofw_bus_get_compat(dev)) == NULL)
return (0);
len = strlen(compatible);
if (strlen(compat) == len &&
strncasecmp(compat, compatible, len) == 0)
return (1);
return (0);
}
const struct ofw_compat_data *
ofw_bus_search_compatible(device_t dev, const struct ofw_compat_data *compat)
{
if (compat == NULL)
return NULL;
for (; compat->ocd_str != NULL; ++compat) {
if (ofw_bus_is_compatible(dev, compat->ocd_str))
break;
}
return (compat);
}
int
ofw_bus_has_prop(device_t dev, const char *propname)
{
phandle_t node;
if ((node = ofw_bus_get_node(dev)) == -1)
return (0);
return (OF_hasprop(node, propname));
}
void
ofw_bus_setup_iinfo(phandle_t node, struct ofw_bus_iinfo *ii, int intrsz)
{
pcell_t addrc;
int msksz;
if (OF_getencprop(node, "#address-cells", &addrc, sizeof(addrc)) == -1)
addrc = 2;
ii->opi_addrc = addrc * sizeof(pcell_t);
ii->opi_imapsz = OF_getencprop_alloc(node, "interrupt-map", 1,
(void **)&ii->opi_imap);
if (ii->opi_imapsz > 0) {
msksz = OF_getencprop_alloc(node, "interrupt-map-mask", 1,
(void **)&ii->opi_imapmsk);
/*
* Failure to get the mask is ignored; a full mask is used
* then. We barf on bad mask sizes, however.
*/
if (msksz != -1 && msksz != ii->opi_addrc + intrsz)
panic("ofw_bus_setup_iinfo: bad interrupt-map-mask "
"property!");
}
}
int
ofw_bus_lookup_imap(phandle_t node, struct ofw_bus_iinfo *ii, void *reg,
int regsz, void *pintr, int pintrsz, void *mintr, int mintrsz,
phandle_t *iparent)
{
uint8_t maskbuf[regsz + pintrsz];
int rv;
if (ii->opi_imapsz <= 0)
return (0);
KASSERT(regsz >= ii->opi_addrc,
("ofw_bus_lookup_imap: register size too small: %d < %d",
regsz, ii->opi_addrc));
if (node != -1) {
rv = OF_getencprop(node, "reg", reg, regsz);
if (rv < regsz)
panic("ofw_bus_lookup_imap: cannot get reg property");
}
return (ofw_bus_search_intrmap(pintr, pintrsz, reg, ii->opi_addrc,
ii->opi_imap, ii->opi_imapsz, ii->opi_imapmsk, maskbuf, mintr,
mintrsz, iparent));
}
/*
* Map an interrupt using the firmware reg, interrupt-map and
* interrupt-map-mask properties.
* The interrupt property to be mapped must be of size intrsz, and pointed to
* by intr. The regs property of the node for which the mapping is done must
* be passed as regs. This property is an array of register specifications;
* the size of the address part of such a specification must be passed as
* physsz. Only the first element of the property is used.
* imap and imapsz hold the interrupt map and its size.
* imapmsk is a pointer to the interrupt-map-mask property, which must have
* a size of physsz + intrsz; it may be NULL, in which case a full mask is
* assumed.
* maskbuf must point to a buffer of length physsz + intrsz.
* The interrupt is returned in result, which must point to a buffer of length
* rintrsz (which gives the expected size of the mapped interrupt).
* Returns number of cells in the interrupt if a mapping was found, 0 otherwise.
*/
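/*
 * Each interrupt-map entry processed below thus consists of: the
 * child unit address (physsz bytes), the child interrupt specifier
 * (intrsz bytes), the parent phandle, optionally the parent's
 * #address-cells worth of address data (paddrsz bytes) and finally
 * the parent interrupt specifier (pintrsz bytes); tsz is the sum of
 * these.
 */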
int
ofw_bus_search_intrmap(void *intr, int intrsz, void *regs, int physsz,
void *imap, int imapsz, void *imapmsk, void *maskbuf, void *result,
int rintrsz, phandle_t *iparent)
{
phandle_t parent;
uint8_t *ref = maskbuf;
uint8_t *uiintr = intr;
uint8_t *uiregs = regs;
uint8_t *uiimapmsk = imapmsk;
uint8_t *mptr;
pcell_t paddrsz;
pcell_t pintrsz;
- int i, rsz, tsz;
+ int i, tsz;
- rsz = -1;
if (imapmsk != NULL) {
for (i = 0; i < physsz; i++)
ref[i] = uiregs[i] & uiimapmsk[i];
for (i = 0; i < intrsz; i++)
ref[physsz + i] = uiintr[i] & uiimapmsk[physsz + i];
} else {
bcopy(regs, ref, physsz);
bcopy(intr, ref + physsz, intrsz);
}
mptr = imap;
i = imapsz;
paddrsz = 0;
while (i > 0) {
bcopy(mptr + physsz + intrsz, &parent, sizeof(parent));
#ifndef OFW_IMAP_NO_IPARENT_ADDR_CELLS
/*
* Find if we need to read the parent address data.
* CHRP-derived OF bindings, including ePAPR-compliant FDTs,
* use this as an optional part of the specifier.
*/
if (OF_getencprop(OF_node_from_xref(parent),
"#address-cells", &paddrsz, sizeof(paddrsz)) == -1)
paddrsz = 0; /* default */
paddrsz *= sizeof(pcell_t);
#endif
if (OF_searchencprop(OF_node_from_xref(parent),
"#interrupt-cells", &pintrsz, sizeof(pintrsz)) == -1)
pintrsz = 1; /* default */
pintrsz *= sizeof(pcell_t);
/* Compute the map stride size. */
tsz = physsz + intrsz + sizeof(phandle_t) + paddrsz + pintrsz;
KASSERT(i >= tsz, ("ofw_bus_search_intrmap: truncated map"));
if (bcmp(ref, mptr, physsz + intrsz) == 0) {
bcopy(mptr + physsz + intrsz + sizeof(parent) + paddrsz,
result, MIN(rintrsz, pintrsz));
if (iparent != NULL)
*iparent = parent;
return (pintrsz/sizeof(pcell_t));
}
mptr += tsz;
i -= tsz;
}
return (0);
}
int
ofw_bus_msimap(phandle_t node, uint16_t pci_rid, phandle_t *msi_parent,
uint32_t *msi_rid)
{
pcell_t *map, mask, msi_base, rid_base, rid_length;
ssize_t len;
- uint32_t masked_rid, rid;
+ uint32_t masked_rid;
int err, i;
/* TODO: This should be OF_searchprop_alloc if we had it */
len = OF_getencprop_alloc(node, "msi-map", sizeof(*map), (void **)&map);
if (len < 0) {
if (msi_parent != NULL) {
*msi_parent = 0;
OF_getencprop(node, "msi-parent", msi_parent,
sizeof(*msi_parent));
}
if (msi_rid != NULL)
*msi_rid = pci_rid;
return (0);
}
err = ENOENT;
- rid = 0;
mask = 0xffffffff;
OF_getencprop(node, "msi-map-mask", &mask, sizeof(mask));
masked_rid = pci_rid & mask;
for (i = 0; i < len; i += 4) {
rid_base = map[i + 0];
rid_length = map[i + 3];
if (masked_rid < rid_base ||
masked_rid >= (rid_base + rid_length))
continue;
msi_base = map[i + 2];
if (msi_parent != NULL)
*msi_parent = map[i + 1];
if (msi_rid != NULL)
*msi_rid = masked_rid - rid_base + msi_base;
err = 0;
break;
}
free(map, M_OFWPROP);
return (err);
}
int
ofw_bus_reg_to_rl(device_t dev, phandle_t node, pcell_t acells, pcell_t scells,
struct resource_list *rl)
{
uint64_t phys, size;
ssize_t i, j, rid, nreg, ret;
uint32_t *reg;
char *name;
/*
* This may be redundant when ofw_bus_devinfo is available, but it
* makes this routine independent of it.
*/
ret = OF_getprop_alloc(node, "name", sizeof(*name), (void **)&name);
if (ret == -1)
name = NULL;
ret = OF_getencprop_alloc(node, "reg", sizeof(*reg), (void **)&reg);
nreg = (ret == -1) ? 0 : ret;
if (nreg % (acells + scells) != 0) {
if (bootverbose)
device_printf(dev, "Malformed reg property on <%s>\n",
(name == NULL) ? "unknown" : name);
nreg = 0;
}
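/*
 * Decode each (address, size) tuple into a 64-bit base and length.
 * For example (illustrative): with acells = 2 and scells = 1, a reg
 * value of <0x0 0x10000000 0x1000> becomes the memory resource
 * 0x10000000-0x10000fff.
 */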
for (i = 0, rid = 0; i < nreg; i += acells + scells, rid++) {
phys = size = 0;
for (j = 0; j < acells; j++) {
phys <<= 32;
phys |= reg[i + j];
}
for (j = 0; j < scells; j++) {
size <<= 32;
size |= reg[i + acells + j];
}
/* Skip the dummy reg property of glue devices like ssm(4). */
if (size != 0)
resource_list_add(rl, SYS_RES_MEMORY, rid,
phys, phys + size - 1, size);
}
free(name, M_OFWPROP);
free(reg, M_OFWPROP);
return (0);
}
/*
* Get interrupt parent for given node.
* Returns 0 if interrupt parent doesn't exist.
*/
phandle_t
ofw_bus_find_iparent(phandle_t node)
{
phandle_t iparent;
if (OF_searchencprop(node, "interrupt-parent", &iparent,
sizeof(iparent)) == -1) {
for (iparent = node; iparent != 0;
iparent = OF_parent(iparent)) {
if (OF_hasprop(iparent, "interrupt-controller"))
break;
}
iparent = OF_xref_from_node(iparent);
}
return (iparent);
}
int
ofw_bus_intr_to_rl(device_t dev, phandle_t node,
struct resource_list *rl, int *rlen)
{
phandle_t iparent;
uint32_t icells, *intr;
int err, i, irqnum, nintr, rid;
boolean_t extended;
nintr = OF_getencprop_alloc(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
iparent = ofw_bus_find_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
}
if (icells < 1 || icells > nintr) {
device_printf(dev, "Invalid #interrupt-cells property "
"value <%d>, assuming <1>\n", icells);
icells = 1;
}
extended = false;
} else {
nintr = OF_getencprop_alloc(node, "interrupts-extended",
sizeof(*intr), (void **)&intr);
if (nintr <= 0)
return (0);
extended = true;
}
err = 0;
rid = 0;
for (i = 0; i < nintr; i += icells) {
if (extended) {
iparent = intr[i++];
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
err = ENOENT;
break;
}
if (icells < 1 || (i + icells) > nintr) {
device_printf(dev, "Invalid #interrupt-cells "
"property value <%d>\n", icells);
err = ERANGE;
break;
}
}
irqnum = ofw_bus_map_intr(dev, iparent, icells, &intr[i]);
resource_list_add(rl, SYS_RES_IRQ, rid++, irqnum, irqnum, 1);
}
if (rlen != NULL)
*rlen = rid;
free(intr, M_OFWPROP);
return (err);
}
int
ofw_bus_intr_by_rid(device_t dev, phandle_t node, int wanted_rid,
phandle_t *producer, int *ncells, pcell_t **cells)
{
phandle_t iparent;
uint32_t icells, *intr;
int err, i, nintr, rid;
boolean_t extended;
nintr = OF_getencprop_alloc(node, "interrupts", sizeof(*intr),
(void **)&intr);
if (nintr > 0) {
iparent = ofw_bus_find_iparent(node);
if (iparent == 0) {
device_printf(dev, "No interrupt-parent found, "
"assuming direct parent\n");
iparent = OF_parent(node);
iparent = OF_xref_from_node(iparent);
}
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property, assuming <1>\n");
icells = 1;
}
if (icells < 1 || icells > nintr) {
device_printf(dev, "Invalid #interrupt-cells property "
"value <%d>, assuming <1>\n", icells);
icells = 1;
}
extended = false;
} else {
nintr = OF_getencprop_alloc(node, "interrupts-extended",
sizeof(*intr), (void **)&intr);
if (nintr <= 0)
return (ESRCH);
extended = true;
}
err = ESRCH;
rid = 0;
for (i = 0; i < nintr; i += icells, rid++) {
if (extended) {
iparent = intr[i++];
if (OF_searchencprop(OF_node_from_xref(iparent),
"#interrupt-cells", &icells, sizeof(icells)) == -1) {
device_printf(dev, "Missing #interrupt-cells "
"property\n");
err = ENOENT;
break;
}
if (icells < 1 || (i + icells) > nintr) {
device_printf(dev, "Invalid #interrupt-cells "
"property value <%d>\n", icells);
err = ERANGE;
break;
}
}
if (rid == wanted_rid) {
*cells = malloc(icells * sizeof(**cells), M_OFWPROP,
M_WAITOK);
*producer = iparent;
*ncells= icells;
memcpy(*cells, intr + i, icells * sizeof(**cells));
err = 0;
break;
}
}
free(intr, M_OFWPROP);
return (err);
}

phandle_t
ofw_bus_find_child(phandle_t start, const char *child_name)
{
char *name;
int ret;
phandle_t child;
for (child = OF_child(start); child != 0; child = OF_peer(child)) {
ret = OF_getprop_alloc(child, "name", sizeof(*name), (void **)&name);
if (ret == -1)
continue;
if (strcmp(name, child_name) == 0) {
free(name, M_OFWPROP);
return (child);
}
free(name, M_OFWPROP);
}
return (0);
}
phandle_t
ofw_bus_find_compatible(phandle_t node, const char *onecompat)
{
phandle_t child, ret;
/*
* Traverse the subtree below 'node' and find the first node with a
* matching 'compatible' property.
*/
for (child = OF_child(node); child != 0; child = OF_peer(child)) {
if (ofw_bus_node_is_compatible(child, onecompat) != 0)
return (child);
ret = ofw_bus_find_compatible(child, onecompat);
if (ret != 0)
return (ret);
}
return (0);
}
/**
* @brief Return child of bus whose phandle is node
*
* A direct child of @p bus will be returned if its phandle in the
* OFW tree is @p node. Otherwise, NULL is returned.
*
* @param bus The bus to examine
* @param node The phandle_t to look for.
*/
device_t
ofw_bus_find_child_device_by_phandle(device_t bus, phandle_t node)
{
device_t *children, retval, child;
int nkid, i;
/*
* Nothing can match the flag value for no node.
*/
if (node == -1)
return (NULL);
/*
* Search the children for a match. We microoptimize
* a bit by not using ofw_bus_get since we already know
* the parent. We do not recurse.
*/
if (device_get_children(bus, &children, &nkid) != 0)
return (NULL);
retval = NULL;
for (i = 0; i < nkid; i++) {
child = children[i];
if (OFW_BUS_GET_NODE(bus, child) == node) {
retval = child;
break;
}
}
free(children, M_TEMP);
return (retval);
}
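/*
 * Usage sketch (illustrative only; "icbus" and "xref" are hypothetical): a
 * provider that received an xref can map it back to the device_t of the
 * matching child on its own bus.
 *
 *	phandle_t node = OF_node_from_xref(xref);
 *	device_t child = ofw_bus_find_child_device_by_phandle(icbus, node);
 *
 *	if (child != NULL)
 *		... dispatch to the child's driver ...
 */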
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties).
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* idx - the index of the requested list entry, or, if -1, an indication
* to return the number of entries in the parsed list.
* Output arguments:
* producer - handle of producer
* ncells - number of cells in result or the number of items in the list when
* idx == -1.
* cells - array of decoded cells
*/
static int
ofw_bus_parse_xref_list_internal(phandle_t node, const char *list_name,
const char *cells_name, int idx, phandle_t *producer, int *ncells,
pcell_t **cells)
{
phandle_t pnode;
phandle_t *elems;
uint32_t pcells;
int rv, i, j, nelems, cnt;
elems = NULL;
nelems = OF_getencprop_alloc(node, list_name, sizeof(*elems),
(void **)&elems);
if (nelems <= 0)
return (ENOENT);
rv = (idx == -1) ? 0 : ENOENT;
for (i = 0, cnt = 0; i < nelems; i += pcells, cnt++) {
pnode = elems[i++];
if (OF_getencprop(OF_node_from_xref(pnode),
cells_name, &pcells, sizeof(pcells)) == -1) {
printf("Missing %s property\n", cells_name);
rv = ENOENT;
break;
}
if ((i + pcells) > nelems) {
printf("Invalid %s property value <%d>\n", cells_name,
pcells);
rv = ERANGE;
break;
}
if (cnt == idx) {
*cells = malloc(pcells * sizeof(**cells), M_OFWPROP,
M_WAITOK);
*producer = pnode;
*ncells = pcells;
for (j = 0; j < pcells; j++)
(*cells)[j] = elems[i + j];
rv = 0;
break;
}
}
if (elems != NULL)
free(elems, M_OFWPROP);
if (idx == -1 && rv == 0)
*ncells = cnt;
return (rv);
}
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties).
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* idx - the index of the requested list entry (>= 0)
* Output arguments:
* producer - handle of producer
* ncells - number of cells in result
* cells - array of decoded cells
*/
int
ofw_bus_parse_xref_list_alloc(phandle_t node, const char *list_name,
const char *cells_name, int idx, phandle_t *producer, int *ncells,
pcell_t **cells)
{
KASSERT(idx >= 0,
("ofw_bus_parse_xref_list_alloc: negative index supplied"));
return (ofw_bus_parse_xref_list_internal(node, list_name, cells_name,
idx, producer, ncells, cells));
}
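/*
 * Usage sketch (illustrative only; error handling abbreviated and variable
 * names hypothetical): a consumer driver can resolve the first specifier of
 * its "resets" property like this.
 *
 *	phandle_t producer;
 *	int ncells;
 *	pcell_t *cells;
 *
 *	if (ofw_bus_parse_xref_list_alloc(node, "resets", "#reset-cells",
 *	    0, &producer, &ncells, &cells) == 0) {
 *		... hand producer/cells to the reset framework ...
 *		free(cells, M_OFWPROP);
 *	}
 */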
/*
* Parse a property that contains a list of xrefs and values
* (like the standard "clocks" and "resets" properties)
* and determine the number of items in the list.
* Input arguments:
* node - consumer's device node
* list_name - name of parsed list - "clocks"
* cells_name - name of size property - "#clock-cells"
* Output arguments:
* count - number of items in list
*/
int
ofw_bus_parse_xref_list_get_length(phandle_t node, const char *list_name,
const char *cells_name, int *count)
{
return (ofw_bus_parse_xref_list_internal(node, list_name, cells_name,
-1, NULL, count, NULL));
}
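/*
 * Usage sketch (illustrative only; "nclocks" is a hypothetical name): the
 * length variant is handy for sizing per-entry bookkeeping before walking
 * the list entry by entry with ofw_bus_parse_xref_list_alloc().
 *
 *	int nclocks;
 *
 *	if (ofw_bus_parse_xref_list_get_length(node, "clocks",
 *	    "#clock-cells", &nclocks) == 0)
 *		device_printf(dev, "%d clock(s) referenced\n", nclocks);
 */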
/*
* Find the index of a string in a string list property (case sensitive).
*/
int
ofw_bus_find_string_index(phandle_t node, const char *list_name,
const char *name, int *idx)
{
char *elems;
int rv, i, cnt, nelems;
elems = NULL;
nelems = OF_getprop_alloc(node, list_name, 1, (void **)&elems);
if (nelems <= 0)
return (ENOENT);
rv = ENOENT;
for (i = 0, cnt = 0; i < nelems; cnt++) {
if (strcmp(elems + i, name) == 0) {
*idx = cnt;
rv = 0;
break;
}
i += strlen(elems + i) + 1;
}
if (elems != NULL)
free(elems, M_OFWPROP);
return (rv);
}
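/*
 * Usage sketch (illustrative only): the usual pairing is a "*-names"
 * property selecting an index into the matching specifier list, e.g.
 * finding which "clocks" entry is named "bus".
 *
 *	int idx;
 *
 *	if (ofw_bus_find_string_index(node, "clock-names", "bus", &idx) == 0)
 *		... parse entry idx of the "clocks" property ...
 */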
/*
* Create a zero-terminated array of strings from a string list property.
*/
int
ofw_bus_string_list_to_array(phandle_t node, const char *list_name,
const char ***out_array)
{
char *elems, *tptr;
const char **array;
int i, cnt, nelems, len;
elems = NULL;
nelems = OF_getprop_alloc(node, list_name, 1, (void **)&elems);
if (nelems <= 0)
return (nelems);
/* Count number of strings. */
for (i = 0, cnt = 0; i < nelems; cnt++)
i += strlen(elems + i) + 1;
/* Allocate space for arrays and all strings. */
array = malloc((cnt + 1) * sizeof(char *) + nelems, M_OFWPROP,
M_WAITOK);
/* Get address of first string. */
tptr = (char *)(array + cnt + 1);
/* Copy strings. */
memcpy(tptr, elems, nelems);
free(elems, M_OFWPROP);
/* Fill string pointers. */
for (i = 0, cnt = 0; i < nelems; cnt++) {
len = strlen(tptr) + 1;
array[cnt] = tptr;
i += len;
tptr += len;
}
array[cnt] = NULL;
*out_array = array;
return (cnt);
}
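/*
 * Usage sketch (illustrative only; "names" is a hypothetical variable): the
 * returned pointer array shares one allocation with the copied strings, so
 * a single free() of the array releases everything.
 *
 *	const char **names;
 *	int i, n;
 *
 *	n = ofw_bus_string_list_to_array(node, "clock-output-names", &names);
 *	for (i = 0; i < n; i++)
 *		printf("output %d: %s\n", i, names[i]);
 *	if (n > 0)
 *		free(names, M_OFWPROP);
 */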
Index: head/sys/dev/ofw/ofwpci.c
===================================================================
--- head/sys/dev/ofw/ofwpci.c (revision 327172)
+++ head/sys/dev/ofw/ofwpci.c (revision 327173)
@@ -1,678 +1,675 @@
/*-
* Copyright (c) 2011 Nathan Whitehorn
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_pci.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/ofw/ofwpci.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcib_private.h>
#include <machine/bus.h>
#include <machine/md_var.h>
#include <machine/resource.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include "pcib_if.h"
/*
* If it is necessary to set another value of this for
* some platforms, it should be set in the fdt.h file.
*/
#ifndef PCI_MAP_INTR
#define PCI_MAP_INTR 4
#endif
#define PCI_INTR_PINS 4
/*
* bus interface.
*/
static struct resource * ofw_pci_alloc_resource(device_t, device_t,
int, int *, rman_res_t, rman_res_t, rman_res_t, u_int);
static int ofw_pci_release_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_activate_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_deactivate_resource(device_t, device_t, int, int,
struct resource *);
static int ofw_pci_adjust_resource(device_t, device_t, int,
struct resource *, rman_res_t, rman_res_t);
#ifdef __powerpc__
static bus_space_tag_t ofw_pci_bus_get_bus_tag(device_t, device_t);
#endif
/*
* pcib interface
*/
static int ofw_pci_maxslots(device_t);
/*
* ofw_bus interface
*/
static phandle_t ofw_pci_get_node(device_t, device_t);
/*
* local methods
*/
static int ofw_pci_fill_ranges(phandle_t, struct ofw_pci_range *);
static struct rman *ofw_pci_get_rman(struct ofw_pci_softc *, int, u_int);
/*
* Driver methods.
*/
static device_method_t ofw_pci_methods[] = {
/* Device interface */
DEVMETHOD(device_attach, ofw_pci_attach),
/* Bus interface */
DEVMETHOD(bus_print_child, bus_generic_print_child),
DEVMETHOD(bus_read_ivar, ofw_pci_read_ivar),
DEVMETHOD(bus_write_ivar, ofw_pci_write_ivar),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
DEVMETHOD(bus_alloc_resource, ofw_pci_alloc_resource),
DEVMETHOD(bus_release_resource, ofw_pci_release_resource),
DEVMETHOD(bus_activate_resource, ofw_pci_activate_resource),
DEVMETHOD(bus_deactivate_resource, ofw_pci_deactivate_resource),
DEVMETHOD(bus_adjust_resource, ofw_pci_adjust_resource),
#ifdef __powerpc__
DEVMETHOD(bus_get_bus_tag, ofw_pci_bus_get_bus_tag),
#endif
/* pcib interface */
DEVMETHOD(pcib_maxslots, ofw_pci_maxslots),
DEVMETHOD(pcib_route_interrupt, ofw_pci_route_interrupt),
DEVMETHOD(pcib_request_feature, pcib_request_feature_allow),
/* ofw_bus interface */
DEVMETHOD(ofw_bus_get_node, ofw_pci_get_node),
DEVMETHOD_END
};
DEFINE_CLASS_0(ofw_pci, ofw_pci_driver, ofw_pci_methods, 0);
int
ofw_pci_init(device_t dev)
{
struct ofw_pci_softc *sc;
phandle_t node;
u_int32_t busrange[2];
struct ofw_pci_range *rp;
int i, error;
struct ofw_pci_cell_info *cell_info;
node = ofw_bus_get_node(dev);
sc = device_get_softc(dev);
sc->sc_initialized = 1;
sc->sc_range = NULL;
sc->sc_pci_domain = device_get_unit(dev);
cell_info = (struct ofw_pci_cell_info *)malloc(sizeof(*cell_info),
M_DEVBUF, M_WAITOK | M_ZERO);
sc->sc_cell_info = cell_info;
if (OF_getencprop(node, "bus-range", busrange, sizeof(busrange)) != 8)
busrange[0] = 0;
sc->sc_dev = dev;
sc->sc_node = node;
sc->sc_bus = busrange[0];
if (sc->sc_quirks & OFW_PCI_QUIRK_RANGES_ON_CHILDREN) {
phandle_t c;
int n, i;
sc->sc_nrange = 0;
for (c = OF_child(node); c != 0; c = OF_peer(c)) {
n = ofw_pci_nranges(c, cell_info);
if (n > 0)
sc->sc_nrange += n;
}
if (sc->sc_nrange == 0) {
error = ENXIO;
goto out;
}
sc->sc_range = malloc(sc->sc_nrange * sizeof(sc->sc_range[0]),
M_DEVBUF, M_WAITOK);
i = 0;
for (c = OF_child(node); c != 0; c = OF_peer(c)) {
n = ofw_pci_fill_ranges(c, &sc->sc_range[i]);
if (n > 0)
i += n;
}
KASSERT(i == sc->sc_nrange, ("range count mismatch"));
} else {
sc->sc_nrange = ofw_pci_nranges(node, cell_info);
if (sc->sc_nrange <= 0) {
device_printf(dev, "could not getranges\n");
error = ENXIO;
goto out;
}
sc->sc_range = malloc(sc->sc_nrange * sizeof(sc->sc_range[0]),
M_DEVBUF, M_WAITOK);
ofw_pci_fill_ranges(node, sc->sc_range);
}
sc->sc_io_rman.rm_type = RMAN_ARRAY;
sc->sc_io_rman.rm_descr = "PCI I/O Ports";
error = rman_init(&sc->sc_io_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
sc->sc_mem_rman.rm_type = RMAN_ARRAY;
sc->sc_mem_rman.rm_descr = "PCI Non Prefetchable Memory";
error = rman_init(&sc->sc_mem_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
sc->sc_pmem_rman.rm_type = RMAN_ARRAY;
sc->sc_pmem_rman.rm_descr = "PCI Prefetchable Memory";
error = rman_init(&sc->sc_pmem_rman);
if (error != 0) {
device_printf(dev, "rman_init() failed. error = %d\n", error);
goto out;
}
for (i = 0; i < sc->sc_nrange; i++) {
error = 0;
rp = sc->sc_range + i;
if (sc->sc_range_mask & ((uint64_t)1 << i))
continue;
switch (rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) {
case OFW_PCI_PHYS_HI_SPACE_CONFIG:
break;
case OFW_PCI_PHYS_HI_SPACE_IO:
error = rman_manage_region(&sc->sc_io_rman, rp->pci,
rp->pci + rp->size - 1);
break;
case OFW_PCI_PHYS_HI_SPACE_MEM32:
case OFW_PCI_PHYS_HI_SPACE_MEM64:
if (rp->pci_hi & OFW_PCI_PHYS_HI_PREFETCHABLE) {
sc->sc_have_pmem = 1;
error = rman_manage_region(&sc->sc_pmem_rman,
rp->pci, rp->pci + rp->size - 1);
} else {
error = rman_manage_region(&sc->sc_mem_rman,
rp->pci, rp->pci + rp->size - 1);
}
break;
}
if (error != 0) {
device_printf(dev,
"rman_manage_region(%x, %#jx, %#jx) failed. "
"error = %d\n", rp->pci_hi &
OFW_PCI_PHYS_HI_SPACEMASK, rp->pci,
rp->pci + rp->size - 1, error);
goto out;
}
}
ofw_bus_setup_iinfo(node, &sc->sc_pci_iinfo, sizeof(cell_t));
return (0);
out:
free(cell_info, M_DEVBUF);
free(sc->sc_range, M_DEVBUF);
rman_fini(&sc->sc_io_rman);
rman_fini(&sc->sc_mem_rman);
rman_fini(&sc->sc_pmem_rman);
return (error);
}
int
ofw_pci_attach(device_t dev)
{
struct ofw_pci_softc *sc;
int error;
sc = device_get_softc(dev);
if (!sc->sc_initialized) {
error = ofw_pci_init(dev);
if (error != 0)
return (error);
}
device_add_child(dev, "pci", -1);
return (bus_generic_attach(dev));
}
static int
ofw_pci_maxslots(device_t dev)
{
return (PCI_SLOTMAX);
}
int
ofw_pci_route_interrupt(device_t bus, device_t dev, int pin)
{
struct ofw_pci_softc *sc;
struct ofw_pci_register reg;
uint32_t pintr, mintr[PCI_MAP_INTR];
int intrcells;
phandle_t iparent;
sc = device_get_softc(bus);
pintr = pin;
/* Fabricate imap information in case this isn't an OFW device */
bzero(&reg, sizeof(reg));
reg.phys_hi = (pci_get_bus(dev) << OFW_PCI_PHYS_HI_BUSSHIFT) |
(pci_get_slot(dev) << OFW_PCI_PHYS_HI_DEVICESHIFT) |
(pci_get_function(dev) << OFW_PCI_PHYS_HI_FUNCTIONSHIFT);
intrcells = ofw_bus_lookup_imap(ofw_bus_get_node(dev),
&sc->sc_pci_iinfo, &reg, sizeof(reg), &pintr, sizeof(pintr),
mintr, sizeof(mintr), &iparent);
if (intrcells != 0) {
pintr = ofw_bus_map_intr(dev, iparent, intrcells, mintr);
return (pintr);
}
/*
* Maybe it's a real interrupt, not an intpin
*/
if (pin > PCI_INTR_PINS)
return (pin);
device_printf(bus, "could not route pin %d for device %d.%d\n",
pin, pci_get_slot(dev), pci_get_function(dev));
return (PCI_INVALID_IRQ);
}
int
ofw_pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_DOMAIN:
*result = sc->sc_pci_domain;
return (0);
case PCIB_IVAR_BUS:
*result = sc->sc_bus;
return (0);
default:
break;
}
return (ENOENT);
}
int
ofw_pci_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_BUS:
sc->sc_bus = value;
return (0);
default:
break;
}
return (ENOENT);
}
int
ofw_pci_nranges(phandle_t node, struct ofw_pci_cell_info *info)
{
ssize_t nbase_ranges;
if (info == NULL)
return (-1);
info->host_address_cells = 1;
info->size_cells = 2;
info->pci_address_cell = 3;
OF_getencprop(OF_parent(node), "#address-cells",
&(info->host_address_cells), sizeof(info->host_address_cells));
OF_getencprop(node, "#address-cells",
&(info->pci_address_cell), sizeof(info->pci_address_cell));
OF_getencprop(node, "#size-cells", &(info->size_cells),
sizeof(info->size_cells));
nbase_ranges = OF_getproplen(node, "ranges");
if (nbase_ranges <= 0)
return (-1);
return (nbase_ranges / sizeof(cell_t) /
(info->pci_address_cell + info->host_address_cells +
info->size_cells));
}
static struct resource *
ofw_pci_alloc_resource(device_t bus, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct ofw_pci_softc *sc;
struct resource *rv;
struct rman *rm;
int needactivate;
needactivate = flags & RF_ACTIVE;
flags &= ~RF_ACTIVE;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS) {
return (pci_domain_alloc_bus(sc->sc_pci_domain, child, rid,
start, end, count, flags | needactivate));
}
#endif
rm = ofw_pci_get_rman(sc, type, flags);
if (rm == NULL) {
return (bus_generic_alloc_resource(bus, child, type, rid,
start, end, count, flags | needactivate));
}
rv = rman_reserve_resource(rm, start, end, count, flags, child);
if (rv == NULL) {
device_printf(bus, "failed to reserve resource for %s\n",
device_get_nameunit(child));
return (NULL);
}
rman_set_rid(rv, *rid);
if (needactivate) {
if (bus_activate_resource(child, type, *rid, rv) != 0) {
device_printf(bus,
"failed to activate resource for %s\n",
device_get_nameunit(child));
rman_release_resource(rv);
return (NULL);
}
}
return (rv);
}
static int
ofw_pci_release_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
struct ofw_pci_softc *sc;
struct rman *rm;
int error;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS)
return (pci_domain_release_bus(sc->sc_pci_domain, child, rid,
res));
#endif
rm = ofw_pci_get_rman(sc, type, rman_get_flags(res));
if (rm == NULL) {
return (bus_generic_release_resource(bus, child, type, rid,
res));
}
KASSERT(rman_is_region_manager(res, rm), ("rman mismatch"));
if (rman_get_flags(res) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type, rid, res);
if (error != 0)
return (error);
}
return (rman_release_resource(res));
}
static int
ofw_pci_activate_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
struct ofw_pci_softc *sc;
bus_space_handle_t handle;
bus_space_tag_t tag;
struct ofw_pci_range *rp;
vm_paddr_t start;
int space;
int rv;
sc = device_get_softc(bus);
if (type != SYS_RES_IOPORT && type != SYS_RES_MEMORY) {
return (bus_generic_activate_resource(bus, child, type, rid,
res));
}
start = (vm_paddr_t)rman_get_start(res);
/*
* Map this through the ranges list
*/
for (rp = sc->sc_range; rp < sc->sc_range + sc->sc_nrange &&
rp->pci_hi != 0; rp++) {
if (start < rp->pci || start >= rp->pci + rp->size)
continue;
switch (rp->pci_hi & OFW_PCI_PHYS_HI_SPACEMASK) {
case OFW_PCI_PHYS_HI_SPACE_IO:
space = SYS_RES_IOPORT;
break;
case OFW_PCI_PHYS_HI_SPACE_MEM32:
case OFW_PCI_PHYS_HI_SPACE_MEM64:
space = SYS_RES_MEMORY;
break;
default:
space = -1;
}
if (type == space) {
start += (rp->host - rp->pci);
break;
}
}
if (bootverbose)
printf("ofw_pci mapdev: start %jx, len %jd\n",
(rman_res_t)start, rman_get_size(res));
tag = BUS_GET_BUS_TAG(child, child);
if (tag == NULL)
return (ENOMEM);
rman_set_bustag(res, tag);
rv = bus_space_map(tag, start,
rman_get_size(res), 0, &handle);
if (rv != 0)
return (ENOMEM);
rman_set_bushandle(res, handle);
rman_set_virtual(res, (void *)handle); /* XXX for powerpc only ? */
return (rman_activate_resource(res));
}
#ifdef __powerpc__
static bus_space_tag_t
ofw_pci_bus_get_bus_tag(device_t bus, device_t child)
{
return (&bs_le_tag);
}
#endif
static int
ofw_pci_deactivate_resource(device_t bus, device_t child, int type, int rid,
struct resource *res)
{
- struct ofw_pci_softc *sc;
vm_size_t psize;
-
- sc = device_get_softc(bus);
if (type != SYS_RES_IOPORT && type != SYS_RES_MEMORY) {
return (bus_generic_deactivate_resource(bus, child, type, rid,
res));
}
psize = rman_get_size(res);
pmap_unmapdev((vm_offset_t)rman_get_virtual(res), psize);
return (rman_deactivate_resource(res));
}
static int
ofw_pci_adjust_resource(device_t bus, device_t child, int type,
struct resource *res, rman_res_t start, rman_res_t end)
{
struct rman *rm;
struct ofw_pci_softc *sc;
sc = device_get_softc(bus);
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
if (type == PCI_RES_BUS)
return (pci_domain_adjust_bus(sc->sc_pci_domain, child, res,
start, end));
#endif
rm = ofw_pci_get_rman(sc, type, rman_get_flags(res));
if (rm == NULL) {
return (bus_generic_adjust_resource(bus, child, type, res,
start, end));
}
KASSERT(rman_is_region_manager(res, rm), ("rman mismatch"));
KASSERT(!(rman_get_flags(res) & RF_ACTIVE),
("active resources cannot be adjusted"));
return (rman_adjust_resource(res, start, end));
}
static phandle_t
ofw_pci_get_node(device_t bus, device_t dev)
{
struct ofw_pci_softc *sc;
sc = device_get_softc(bus);
/* We only have one child, the PCI bus, which needs our own node. */
return (sc->sc_node);
}
static int
ofw_pci_fill_ranges(phandle_t node, struct ofw_pci_range *ranges)
{
int host_address_cells = 1, pci_address_cells = 3, size_cells = 2;
cell_t *base_ranges;
ssize_t nbase_ranges;
int nranges;
int i, j, k;
OF_getencprop(OF_parent(node), "#address-cells", &host_address_cells,
sizeof(host_address_cells));
OF_getencprop(node, "#address-cells", &pci_address_cells,
sizeof(pci_address_cells));
OF_getencprop(node, "#size-cells", &size_cells, sizeof(size_cells));
nbase_ranges = OF_getproplen(node, "ranges");
if (nbase_ranges <= 0)
return (-1);
nranges = nbase_ranges / sizeof(cell_t) /
(pci_address_cells + host_address_cells + size_cells);
base_ranges = malloc(nbase_ranges, M_DEVBUF, M_WAITOK);
OF_getencprop(node, "ranges", base_ranges, nbase_ranges);
for (i = 0, j = 0; i < nranges; i++) {
ranges[i].pci_hi = base_ranges[j++];
ranges[i].pci = 0;
for (k = 0; k < pci_address_cells - 1; k++) {
ranges[i].pci <<= 32;
ranges[i].pci |= base_ranges[j++];
}
ranges[i].host = 0;
for (k = 0; k < host_address_cells; k++) {
ranges[i].host <<= 32;
ranges[i].host |= base_ranges[j++];
}
ranges[i].size = 0;
for (k = 0; k < size_cells; k++) {
ranges[i].size <<= 32;
ranges[i].size |= base_ranges[j++];
}
}
free(base_ranges, M_DEVBUF);
return (nranges);
}
static struct rman *
ofw_pci_get_rman(struct ofw_pci_softc *sc, int type, u_int flags)
{
switch (type) {
case SYS_RES_IOPORT:
return (&sc->sc_io_rman);
case SYS_RES_MEMORY:
if (sc->sc_have_pmem && (flags & RF_PREFETCHABLE))
return (&sc->sc_pmem_rman);
else
return (&sc->sc_mem_rman);
default:
break;
}
return (NULL);
}
Index: head/sys/dev/pci/pci.c
===================================================================
--- head/sys/dev/pci/pci.c (revision 327172)
+++ head/sys/dev/pci/pci.c (revision 327173)
@@ -1,6147 +1,6143 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997, Stefan Esser <se@freebsd.org>
* Copyright (c) 2000, Michael Smith <msmith@freebsd.org>
* Copyright (c) 2000, BSDi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bus.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/endian.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/stdarg.h>
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
#include <machine/intr_machdep.h>
#endif
#include <sys/pciio.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h>
#ifdef PCI_IOV
#include <sys/nv.h>
#include <dev/pci/pci_iov_private.h>
#endif
#include <dev/usb/controller/xhcireg.h>
#include <dev/usb/controller/ehcireg.h>
#include <dev/usb/controller/ohcireg.h>
#include <dev/usb/controller/uhcireg.h>
#include "pcib_if.h"
#include "pci_if.h"
#define PCIR_IS_BIOS(cfg, reg) \
(((cfg)->hdrtype == PCIM_HDRTYPE_NORMAL && reg == PCIR_BIOS) || \
((cfg)->hdrtype == PCIM_HDRTYPE_BRIDGE && reg == PCIR_BIOS_1))
static int pci_has_quirk(uint32_t devid, int quirk);
static pci_addr_t pci_mapbase(uint64_t mapreg);
static const char *pci_maptype(uint64_t mapreg);
static int pci_maprange(uint64_t mapreg);
static pci_addr_t pci_rombase(uint64_t mapreg);
static int pci_romsize(uint64_t testval);
static void pci_fixancient(pcicfgregs *cfg);
static int pci_printf(pcicfgregs *cfg, const char *fmt, ...);
static int pci_porten(device_t dev);
static int pci_memen(device_t dev);
static void pci_assign_interrupt(device_t bus, device_t dev,
int force_route);
static int pci_add_map(device_t bus, device_t dev, int reg,
struct resource_list *rl, int force, int prefetch);
static int pci_probe(device_t dev);
static int pci_attach(device_t dev);
static int pci_detach(device_t dev);
static void pci_load_vendor_data(void);
static int pci_describe_parse_line(char **ptr, int *vendor,
int *device, char **desc);
static char *pci_describe_device(device_t dev);
static int pci_modevent(module_t mod, int what, void *arg);
static void pci_hdrtypedata(device_t pcib, int b, int s, int f,
pcicfgregs *cfg);
static void pci_read_cap(device_t pcib, pcicfgregs *cfg);
static int pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg,
int reg, uint32_t *data);
#if 0
static int pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg,
int reg, uint32_t data);
#endif
static void pci_read_vpd(device_t pcib, pcicfgregs *cfg);
static void pci_mask_msix(device_t dev, u_int index);
static void pci_unmask_msix(device_t dev, u_int index);
static int pci_msi_blacklisted(void);
static int pci_msix_blacklisted(void);
static void pci_resume_msi(device_t dev);
static void pci_resume_msix(device_t dev);
static int pci_remap_intr_method(device_t bus, device_t dev,
u_int irq);
static void pci_hint_device_unit(device_t acdev, device_t child,
const char *name, int *unitp);
static int pci_get_id_method(device_t dev, device_t child,
enum pci_id_type type, uintptr_t *rid);
static struct pci_devinfo * pci_fill_devinfo(device_t pcib, device_t bus, int d,
int b, int s, int f, uint16_t vid, uint16_t did);
static device_method_t pci_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, pci_probe),
DEVMETHOD(device_attach, pci_attach),
DEVMETHOD(device_detach, pci_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, pci_resume),
/* Bus interface */
DEVMETHOD(bus_print_child, pci_print_child),
DEVMETHOD(bus_probe_nomatch, pci_probe_nomatch),
DEVMETHOD(bus_read_ivar, pci_read_ivar),
DEVMETHOD(bus_write_ivar, pci_write_ivar),
DEVMETHOD(bus_driver_added, pci_driver_added),
DEVMETHOD(bus_setup_intr, pci_setup_intr),
DEVMETHOD(bus_teardown_intr, pci_teardown_intr),
DEVMETHOD(bus_get_dma_tag, pci_get_dma_tag),
DEVMETHOD(bus_get_resource_list,pci_get_resource_list),
DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource),
DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource),
DEVMETHOD(bus_delete_resource, pci_delete_resource),
DEVMETHOD(bus_alloc_resource, pci_alloc_resource),
DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource),
DEVMETHOD(bus_release_resource, pci_release_resource),
DEVMETHOD(bus_activate_resource, pci_activate_resource),
DEVMETHOD(bus_deactivate_resource, pci_deactivate_resource),
DEVMETHOD(bus_child_deleted, pci_child_deleted),
DEVMETHOD(bus_child_detached, pci_child_detached),
DEVMETHOD(bus_child_pnpinfo_str, pci_child_pnpinfo_str_method),
DEVMETHOD(bus_child_location_str, pci_child_location_str_method),
DEVMETHOD(bus_hint_device_unit, pci_hint_device_unit),
DEVMETHOD(bus_remap_intr, pci_remap_intr_method),
DEVMETHOD(bus_suspend_child, pci_suspend_child),
DEVMETHOD(bus_resume_child, pci_resume_child),
DEVMETHOD(bus_rescan, pci_rescan_method),
/* PCI interface */
DEVMETHOD(pci_read_config, pci_read_config_method),
DEVMETHOD(pci_write_config, pci_write_config_method),
DEVMETHOD(pci_enable_busmaster, pci_enable_busmaster_method),
DEVMETHOD(pci_disable_busmaster, pci_disable_busmaster_method),
DEVMETHOD(pci_enable_io, pci_enable_io_method),
DEVMETHOD(pci_disable_io, pci_disable_io_method),
DEVMETHOD(pci_get_vpd_ident, pci_get_vpd_ident_method),
DEVMETHOD(pci_get_vpd_readonly, pci_get_vpd_readonly_method),
DEVMETHOD(pci_get_powerstate, pci_get_powerstate_method),
DEVMETHOD(pci_set_powerstate, pci_set_powerstate_method),
DEVMETHOD(pci_assign_interrupt, pci_assign_interrupt_method),
DEVMETHOD(pci_find_cap, pci_find_cap_method),
DEVMETHOD(pci_find_extcap, pci_find_extcap_method),
DEVMETHOD(pci_find_htcap, pci_find_htcap_method),
DEVMETHOD(pci_alloc_msi, pci_alloc_msi_method),
DEVMETHOD(pci_alloc_msix, pci_alloc_msix_method),
DEVMETHOD(pci_enable_msi, pci_enable_msi_method),
DEVMETHOD(pci_enable_msix, pci_enable_msix_method),
DEVMETHOD(pci_disable_msi, pci_disable_msi_method),
DEVMETHOD(pci_remap_msix, pci_remap_msix_method),
DEVMETHOD(pci_release_msi, pci_release_msi_method),
DEVMETHOD(pci_msi_count, pci_msi_count_method),
DEVMETHOD(pci_msix_count, pci_msix_count_method),
DEVMETHOD(pci_msix_pba_bar, pci_msix_pba_bar_method),
DEVMETHOD(pci_msix_table_bar, pci_msix_table_bar_method),
DEVMETHOD(pci_get_id, pci_get_id_method),
DEVMETHOD(pci_alloc_devinfo, pci_alloc_devinfo_method),
DEVMETHOD(pci_child_added, pci_child_added_method),
#ifdef PCI_IOV
DEVMETHOD(pci_iov_attach, pci_iov_attach_method),
DEVMETHOD(pci_iov_detach, pci_iov_detach_method),
DEVMETHOD(pci_create_iov_child, pci_create_iov_child_method),
#endif
DEVMETHOD_END
};
DEFINE_CLASS_0(pci, pci_driver, pci_methods, sizeof(struct pci_softc));
static devclass_t pci_devclass;
DRIVER_MODULE(pci, pcib, pci_driver, pci_devclass, pci_modevent, NULL);
MODULE_VERSION(pci, 1);
static char *pci_vendordata;
static size_t pci_vendordata_size;
struct pci_quirk {
uint32_t devid; /* Vendor/device of the card */
int type;
#define PCI_QUIRK_MAP_REG 1 /* PCI map register in weird place */
#define PCI_QUIRK_DISABLE_MSI 2 /* Neither MSI nor MSI-X work */
#define PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI works */
#define PCI_QUIRK_UNMAP_REG 4 /* Ignore PCI map register */
#define PCI_QUIRK_DISABLE_MSIX 5 /* MSI-X doesn't work */
#define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */
int arg1;
int arg2;
};
static const struct pci_quirk pci_quirks[] = {
/* The Intel 82371AB and 82443MX have a map register at offset 0x90. */
{ 0x71138086, PCI_QUIRK_MAP_REG, 0x90, 0 },
{ 0x719b8086, PCI_QUIRK_MAP_REG, 0x90, 0 },
/* As does the Serverworks OSB4 (the SMBus mapping register) */
{ 0x02001166, PCI_QUIRK_MAP_REG, 0x90, 0 },
/*
* MSI doesn't work with the ServerWorks CNB20-HE Host Bridge
* or the CMIC-SL (AKA ServerWorks GC_LE).
*/
{ 0x00141166, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x00171166, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI doesn't work on earlier Intel chipsets including
* E7500, E7501, E7505, 845, 865, 875/E7210, and 855.
*/
{ 0x25408086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x254c8086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25508086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25608086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25708086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x25788086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
{ 0x35808086, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI doesn't work with devices behind the AMD 8131 HT-PCIX
* bridge.
*/
{ 0x74501022, PCI_QUIRK_DISABLE_MSI, 0, 0 },
/*
* MSI-X allocation doesn't work properly for devices passed through
* by VMware up to at least ESXi 5.1.
*/
{ 0x079015ad, PCI_QUIRK_DISABLE_MSIX, 0, 0 }, /* PCI/PCI-X */
{ 0x07a015ad, PCI_QUIRK_DISABLE_MSIX, 0, 0 }, /* PCIe */
/*
* Some virtualization environments emulate an older chipset
* but support MSI just fine. QEMU uses the Intel 82440.
*/
{ 0x12378086, PCI_QUIRK_ENABLE_MSI_VM, 0, 0 },
/*
* HPET MMIO base address may appear in Bar1 for AMD SB600 SMBus
* controller depending on SoftPciRst register (PM_IO 0x55 [7]).
* It prevents us from attaching hpet(4) when the bit is unset.
* Note this quirk only affects SB600 revision A13 and earlier.
* For SB600 A21 and later, firmware must set the bit to hide it.
* For SB700 and later, it is unused and hardcoded to zero.
*/
{ 0x43851002, PCI_QUIRK_UNMAP_REG, 0x14, 0 },
/*
* Atheros AR8161/AR8162/E2200/E2400/E2500 Ethernet controllers have
* a bug that MSI interrupt does not assert if PCIM_CMD_INTxDIS bit
* of the command register is set.
*/
{ 0x10911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0A11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0xE0B11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
{ 0x10901969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 },
/*
* Broadcom BCM5714(S)/BCM5715(S)/BCM5780(S) Ethernet MACs don't
* issue MSI interrupts with PCIM_CMD_INTxDIS set either.
*/
{ 0x166814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714 */
{ 0x166914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714S */
{ 0x166a14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780 */
{ 0x166b14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780S */
{ 0x167814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715 */
{ 0x167914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715S */
{ 0 }
};
/* map register information */
#define PCI_MAPMEM 0x01 /* memory map */
#define PCI_MAPMEMP 0x02 /* prefetchable memory map */
#define PCI_MAPPORT 0x04 /* port map */
struct devlist pci_devq;
uint32_t pci_generation;
uint32_t pci_numdevs = 0;
static int pcie_chipset, pcix_chipset;
/* sysctl vars */
SYSCTL_NODE(_hw, OID_AUTO, pci, CTLFLAG_RD, 0, "PCI bus tuning parameters");
static int pci_enable_io_modes = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_io_modes, CTLFLAG_RWTUN,
&pci_enable_io_modes, 1,
"Enable I/O and memory bits in the config register. Some BIOSes do not"
" enable these bits correctly. We'd like to do this all the time, but"
" there are some peripherals that this causes problems with.");
static int pci_do_realloc_bars = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, realloc_bars, CTLFLAG_RWTUN,
&pci_do_realloc_bars, 0,
"Attempt to allocate a new range for any BARs whose original "
"firmware-assigned ranges fail to allocate during the initial device scan.");
static int pci_do_power_nodriver = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_nodriver, CTLFLAG_RWTUN,
&pci_do_power_nodriver, 0,
"Place a function into D3 state when no driver attaches to it. 0 means"
" disable. 1 means conservatively place devices into D3 state. 2 means"
" aggressively place devices into D3 state. 3 means put absolutely"
" everything in D3 state.");
int pci_do_power_resume = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RWTUN,
&pci_do_power_resume, 1,
"Transition from D3 -> D0 on resume.");
int pci_do_power_suspend = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RWTUN,
&pci_do_power_suspend, 1,
"Transition from D0 -> D3 on suspend.");
static int pci_do_msi = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RWTUN, &pci_do_msi, 1,
"Enable support for MSI interrupts");
static int pci_do_msix = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_msix, CTLFLAG_RWTUN, &pci_do_msix, 1,
"Enable support for MSI-X interrupts");
static int pci_msix_rewrite_table = 0;
SYSCTL_INT(_hw_pci, OID_AUTO, msix_rewrite_table, CTLFLAG_RWTUN,
&pci_msix_rewrite_table, 0,
"Rewrite entire MSI-X table when updating MSI-X entries");
static int pci_honor_msi_blacklist = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, honor_msi_blacklist, CTLFLAG_RDTUN,
&pci_honor_msi_blacklist, 1, "Honor chipset blacklist for MSI/MSI-X");
#if defined(__i386__) || defined(__amd64__)
static int pci_usb_takeover = 1;
#else
static int pci_usb_takeover = 0;
#endif
SYSCTL_INT(_hw_pci, OID_AUTO, usb_early_takeover, CTLFLAG_RDTUN,
&pci_usb_takeover, 1,
"Enable early takeover of USB controllers. Disable this if you depend on"
" BIOS emulation of USB devices, that is you use USB devices (like"
" keyboard or mouse) but do not load USB drivers");
static int pci_clear_bars;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_bars, CTLFLAG_RDTUN, &pci_clear_bars, 0,
"Ignore firmware-assigned resources for BARs.");
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
static int pci_clear_buses;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_buses, CTLFLAG_RDTUN, &pci_clear_buses, 0,
"Ignore firmware-assigned bus numbers.");
#endif
static int pci_enable_ari = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_ari, CTLFLAG_RDTUN, &pci_enable_ari,
0, "Enable support for PCIe Alternative RID Interpretation");
static int
pci_has_quirk(uint32_t devid, int quirk)
{
const struct pci_quirk *q;
for (q = &pci_quirks[0]; q->devid; q++) {
if (q->devid == devid && q->type == quirk)
return (1);
}
return (0);
}
/* Find a device_t by bus/slot/function in domain 0 */
device_t
pci_find_bsf(uint8_t bus, uint8_t slot, uint8_t func)
{
return (pci_find_dbsf(0, bus, slot, func));
}
/* Find a device_t by domain/bus/slot/function */
device_t
pci_find_dbsf(uint32_t domain, uint8_t bus, uint8_t slot, uint8_t func)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if ((dinfo->cfg.domain == domain) &&
(dinfo->cfg.bus == bus) &&
(dinfo->cfg.slot == slot) &&
(dinfo->cfg.func == func)) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
/* Find a device_t by vendor/device ID */
device_t
pci_find_device(uint16_t vendor, uint16_t device)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if ((dinfo->cfg.vendor == vendor) &&
(dinfo->cfg.device == device)) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
device_t
pci_find_class(uint8_t class, uint8_t subclass)
{
struct pci_devinfo *dinfo;
STAILQ_FOREACH(dinfo, &pci_devq, pci_links) {
if (dinfo->cfg.baseclass == class &&
dinfo->cfg.subclass == subclass) {
return (dinfo->cfg.dev);
}
}
return (NULL);
}
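/*
 * Usage sketch (illustrative only): these lookup helpers are convenient in
 * machine-dependent or diagnostic code, e.g. locating the first attached
 * SATA-class device; PCIC_STORAGE and PCIS_STORAGE_SATA come from pcireg.h.
 *
 *	device_t dev;
 *
 *	dev = pci_find_class(PCIC_STORAGE, PCIS_STORAGE_SATA);
 *	if (dev != NULL)
 *		device_printf(dev, "found SATA controller\n");
 */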
static int
pci_printf(pcicfgregs *cfg, const char *fmt, ...)
{
va_list ap;
int retval;
retval = printf("pci%d:%d:%d:%d: ", cfg->domain, cfg->bus, cfg->slot,
cfg->func);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
/* return base address of memory or port map */
static pci_addr_t
pci_mapbase(uint64_t mapreg)
{
if (PCI_BAR_MEM(mapreg))
return (mapreg & PCIM_BAR_MEM_BASE);
else
return (mapreg & PCIM_BAR_IO_BASE);
}
/* return map type of memory or port map */
static const char *
pci_maptype(uint64_t mapreg)
{
if (PCI_BAR_IO(mapreg))
return ("I/O Port");
if (mapreg & PCIM_BAR_MEM_PREFETCH)
return ("Prefetchable Memory");
return ("Memory");
}
/* return log2 of map size decoded for memory or port map */
int
pci_mapsize(uint64_t testval)
{
int ln2size;
testval = pci_mapbase(testval);
ln2size = 0;
if (testval != 0) {
while ((testval & 1) == 0)
{
ln2size++;
testval >>= 1;
}
}
return (ln2size);
}
/* return base address of device ROM */
static pci_addr_t
pci_rombase(uint64_t mapreg)
{
return (mapreg & PCIM_BIOS_ADDR_MASK);
}
/* return log2 of map size decoded for device ROM */
static int
pci_romsize(uint64_t testval)
{
int ln2size;
testval = pci_rombase(testval);
ln2size = 0;
if (testval != 0) {
while ((testval & 1) == 0)
{
ln2size++;
testval >>= 1;
}
}
return (ln2size);
}
/* return log2 of address range supported by map register */
static int
pci_maprange(uint64_t mapreg)
{
int ln2range = 0;
if (PCI_BAR_IO(mapreg))
ln2range = 32;
else
switch (mapreg & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_32:
ln2range = 32;
break;
case PCIM_BAR_MEM_1MB:
ln2range = 20;
break;
case PCIM_BAR_MEM_64:
ln2range = 64;
break;
}
return (ln2range);
}
/* adjust some values from PCI 1.0 devices to match 2.0 standards ... */
static void
pci_fixancient(pcicfgregs *cfg)
{
if ((cfg->hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
return;
/* PCI to PCI bridges use header type 1 */
if (cfg->baseclass == PCIC_BRIDGE && cfg->subclass == PCIS_BRIDGE_PCI)
cfg->hdrtype = PCIM_HDRTYPE_BRIDGE;
}
/* extract header type specific config data */
static void
pci_hdrtypedata(device_t pcib, int b, int s, int f, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w)
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
cfg->subvendor = REG(PCIR_SUBVEND_0, 2);
cfg->subdevice = REG(PCIR_SUBDEV_0, 2);
cfg->mingnt = REG(PCIR_MINGNT, 1);
cfg->maxlat = REG(PCIR_MAXLAT, 1);
cfg->nummaps = PCI_MAXMAPS_0;
break;
case PCIM_HDRTYPE_BRIDGE:
cfg->bridge.br_seclat = REG(PCIR_SECLAT_1, 1);
cfg->bridge.br_subbus = REG(PCIR_SUBBUS_1, 1);
cfg->bridge.br_secbus = REG(PCIR_SECBUS_1, 1);
cfg->bridge.br_pribus = REG(PCIR_PRIBUS_1, 1);
cfg->bridge.br_control = REG(PCIR_BRIDGECTL_1, 2);
cfg->nummaps = PCI_MAXMAPS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
cfg->bridge.br_seclat = REG(PCIR_SECLAT_2, 1);
cfg->bridge.br_subbus = REG(PCIR_SUBBUS_2, 1);
cfg->bridge.br_secbus = REG(PCIR_SECBUS_2, 1);
cfg->bridge.br_pribus = REG(PCIR_PRIBUS_2, 1);
cfg->bridge.br_control = REG(PCIR_BRIDGECTL_2, 2);
cfg->subvendor = REG(PCIR_SUBVEND_2, 2);
cfg->subdevice = REG(PCIR_SUBDEV_2, 2);
cfg->nummaps = PCI_MAXMAPS_2;
break;
}
#undef REG
}
/* read configuration header into pcicfgregs structure */
struct pci_devinfo *
pci_read_device(device_t pcib, device_t bus, int d, int b, int s, int f)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w)
uint16_t vid, did;
vid = REG(PCIR_VENDOR, 2);
did = REG(PCIR_DEVICE, 2);
if (vid != 0xffff)
return (pci_fill_devinfo(pcib, bus, d, b, s, f, vid, did));
return (NULL);
}
struct pci_devinfo *
pci_alloc_devinfo_method(device_t dev)
{
return (malloc(sizeof(struct pci_devinfo), M_DEVBUF,
M_WAITOK | M_ZERO));
}
static struct pci_devinfo *
pci_fill_devinfo(device_t pcib, device_t bus, int d, int b, int s, int f,
uint16_t vid, uint16_t did)
{
struct pci_devinfo *devlist_entry;
pcicfgregs *cfg;
devlist_entry = PCI_ALLOC_DEVINFO(bus);
cfg = &devlist_entry->cfg;
cfg->domain = d;
cfg->bus = b;
cfg->slot = s;
cfg->func = f;
cfg->vendor = vid;
cfg->device = did;
cfg->cmdreg = REG(PCIR_COMMAND, 2);
cfg->statreg = REG(PCIR_STATUS, 2);
cfg->baseclass = REG(PCIR_CLASS, 1);
cfg->subclass = REG(PCIR_SUBCLASS, 1);
cfg->progif = REG(PCIR_PROGIF, 1);
cfg->revid = REG(PCIR_REVID, 1);
cfg->hdrtype = REG(PCIR_HDRTYPE, 1);
cfg->cachelnsz = REG(PCIR_CACHELNSZ, 1);
cfg->lattimer = REG(PCIR_LATTIMER, 1);
cfg->intpin = REG(PCIR_INTPIN, 1);
cfg->intline = REG(PCIR_INTLINE, 1);
cfg->mfdev = (cfg->hdrtype & PCIM_MFDEV) != 0;
cfg->hdrtype &= ~PCIM_MFDEV;
STAILQ_INIT(&cfg->maps);
cfg->iov = NULL;
pci_fixancient(cfg);
pci_hdrtypedata(pcib, b, s, f, cfg);
if (REG(PCIR_STATUS, 2) & PCIM_STATUS_CAPPRESENT)
pci_read_cap(pcib, cfg);
STAILQ_INSERT_TAIL(&pci_devq, devlist_entry, pci_links);
devlist_entry->conf.pc_sel.pc_domain = cfg->domain;
devlist_entry->conf.pc_sel.pc_bus = cfg->bus;
devlist_entry->conf.pc_sel.pc_dev = cfg->slot;
devlist_entry->conf.pc_sel.pc_func = cfg->func;
devlist_entry->conf.pc_hdr = cfg->hdrtype;
devlist_entry->conf.pc_subvendor = cfg->subvendor;
devlist_entry->conf.pc_subdevice = cfg->subdevice;
devlist_entry->conf.pc_vendor = cfg->vendor;
devlist_entry->conf.pc_device = cfg->device;
devlist_entry->conf.pc_class = cfg->baseclass;
devlist_entry->conf.pc_subclass = cfg->subclass;
devlist_entry->conf.pc_progif = cfg->progif;
devlist_entry->conf.pc_revid = cfg->revid;
pci_numdevs++;
pci_generation++;
return (devlist_entry);
}
#undef REG
static void
pci_ea_fill_info(device_t pcib, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, \
cfg->ea.ea_location + (n), w)
int num_ent;
int ptr;
int a, b;
uint32_t val;
int ent_size;
uint32_t dw[4];
uint64_t base, max_offset;
struct pci_ea_entry *eae;
if (cfg->ea.ea_location == 0)
return;
STAILQ_INIT(&cfg->ea.ea_entries);
/* Determine the number of entries */
num_ent = REG(PCIR_EA_NUM_ENT, 2);
num_ent &= PCIM_EA_NUM_ENT_MASK;
/* Find the first entry to take care of */
ptr = PCIR_EA_FIRST_ENT;
/* Skip DWORD 2 for type 1 functions */
if ((cfg->hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_BRIDGE)
ptr += 4;
for (a = 0; a < num_ent; a++) {
eae = malloc(sizeof(*eae), M_DEVBUF, M_WAITOK | M_ZERO);
eae->eae_cfg_offset = cfg->ea.ea_location + ptr;
/* Read a number of dwords in the entry */
val = REG(ptr, 4);
ptr += 4;
ent_size = (val & PCIM_EA_ES);
for (b = 0; b < ent_size; b++) {
dw[b] = REG(ptr, 4);
ptr += 4;
}
eae->eae_flags = val;
eae->eae_bei = (PCIM_EA_BEI & val) >> PCIM_EA_BEI_OFFSET;
base = dw[0] & PCIM_EA_FIELD_MASK;
max_offset = dw[1] | ~PCIM_EA_FIELD_MASK;
b = 2;
if (((dw[0] & PCIM_EA_IS_64) != 0) && (b < ent_size)) {
base |= (uint64_t)dw[b] << 32UL;
b++;
}
if (((dw[1] & PCIM_EA_IS_64) != 0)
&& (b < ent_size)) {
max_offset |= (uint64_t)dw[b] << 32UL;
b++;
}
eae->eae_base = base;
eae->eae_max_offset = max_offset;
STAILQ_INSERT_TAIL(&cfg->ea.ea_entries, eae, eae_link);
if (bootverbose) {
printf("PCI(EA) dev %04x:%04x, bei %d, flags #%x, base #%jx, max_offset #%jx\n",
cfg->vendor, cfg->device, eae->eae_bei, eae->eae_flags,
(uintmax_t)eae->eae_base, (uintmax_t)eae->eae_max_offset);
}
}
}
#undef REG
static void
pci_read_cap(device_t pcib, pcicfgregs *cfg)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, w)
#define WREG(n, v, w) PCIB_WRITE_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, v, w)
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
uint64_t addr;
#endif
uint32_t val;
int ptr, nextptr, ptrptr;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
case PCIM_HDRTYPE_BRIDGE:
ptrptr = PCIR_CAP_PTR;
break;
case PCIM_HDRTYPE_CARDBUS:
ptrptr = PCIR_CAP_PTR_2; /* cardbus capabilities ptr */
break;
default:
return; /* no extended capabilities support */
}
nextptr = REG(ptrptr, 1); /* sanity check? */
/*
* Read capability entries.
*/
while (nextptr != 0) {
/* Sanity check */
if (nextptr > 255) {
printf("illegal PCI extended capability offset %d\n",
nextptr);
return;
}
/* Find the next entry */
ptr = nextptr;
nextptr = REG(ptr + PCICAP_NEXTPTR, 1);
/* Process this entry */
switch (REG(ptr + PCICAP_ID, 1)) {
case PCIY_PMG: /* PCI power management */
if (cfg->pp.pp_cap == 0) {
cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2);
cfg->pp.pp_status = ptr + PCIR_POWER_STATUS;
cfg->pp.pp_bse = ptr + PCIR_POWER_BSE;
if ((nextptr - ptr) > PCIR_POWER_DATA)
cfg->pp.pp_data = ptr + PCIR_POWER_DATA;
}
break;
case PCIY_HT: /* HyperTransport */
/* Determine HT-specific capability type. */
val = REG(ptr + PCIR_HT_COMMAND, 2);
if ((val & 0xe000) == PCIM_HTCAP_SLAVE)
cfg->ht.ht_slave = ptr;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__)
switch (val & PCIM_HTCMD_CAP_MASK) {
case PCIM_HTCAP_MSI_MAPPING:
if (!(val & PCIM_HTCMD_MSI_FIXED)) {
/* Sanity check the mapping window. */
addr = REG(ptr + PCIR_HTMSI_ADDRESS_HI,
4);
addr <<= 32;
addr |= REG(ptr + PCIR_HTMSI_ADDRESS_LO,
4);
if (addr != MSI_INTEL_ADDR_BASE)
device_printf(pcib,
"HT device at pci%d:%d:%d:%d has non-default MSI window 0x%llx\n",
cfg->domain, cfg->bus,
cfg->slot, cfg->func,
(long long)addr);
} else
addr = MSI_INTEL_ADDR_BASE;
cfg->ht.ht_msimap = ptr;
cfg->ht.ht_msictrl = val;
cfg->ht.ht_msiaddr = addr;
break;
}
#endif
break;
case PCIY_MSI: /* PCI MSI */
cfg->msi.msi_location = ptr;
cfg->msi.msi_ctrl = REG(ptr + PCIR_MSI_CTRL, 2);
cfg->msi.msi_msgnum = 1 << ((cfg->msi.msi_ctrl &
PCIM_MSICTRL_MMC_MASK)>>1);
break;
case PCIY_MSIX: /* PCI MSI-X */
cfg->msix.msix_location = ptr;
cfg->msix.msix_ctrl = REG(ptr + PCIR_MSIX_CTRL, 2);
cfg->msix.msix_msgnum = (cfg->msix.msix_ctrl &
PCIM_MSIXCTRL_TABLE_SIZE) + 1;
val = REG(ptr + PCIR_MSIX_TABLE, 4);
cfg->msix.msix_table_bar = PCIR_BAR(val &
PCIM_MSIX_BIR_MASK);
cfg->msix.msix_table_offset = val & ~PCIM_MSIX_BIR_MASK;
val = REG(ptr + PCIR_MSIX_PBA, 4);
cfg->msix.msix_pba_bar = PCIR_BAR(val &
PCIM_MSIX_BIR_MASK);
cfg->msix.msix_pba_offset = val & ~PCIM_MSIX_BIR_MASK;
break;
case PCIY_VPD: /* PCI Vital Product Data */
cfg->vpd.vpd_reg = ptr;
break;
case PCIY_SUBVENDOR:
/* Should always be true. */
if ((cfg->hdrtype & PCIM_HDRTYPE) ==
PCIM_HDRTYPE_BRIDGE) {
val = REG(ptr + PCIR_SUBVENDCAP_ID, 4);
cfg->subvendor = val & 0xffff;
cfg->subdevice = val >> 16;
}
break;
case PCIY_PCIX: /* PCI-X */
/*
* Assume we have a PCI-X chipset if we have
* at least one PCI-PCI bridge with a PCI-X
* capability. Note that some systems with
* PCI-express or HT chipsets might match on
* this check as well.
*/
if ((cfg->hdrtype & PCIM_HDRTYPE) ==
PCIM_HDRTYPE_BRIDGE)
pcix_chipset = 1;
cfg->pcix.pcix_location = ptr;
break;
case PCIY_EXPRESS: /* PCI-express */
/*
* Assume we have a PCI-express chipset if we have
* at least one PCI-express device.
*/
pcie_chipset = 1;
cfg->pcie.pcie_location = ptr;
val = REG(ptr + PCIER_FLAGS, 2);
cfg->pcie.pcie_type = val & PCIEM_FLAGS_TYPE;
break;
case PCIY_EA: /* Enhanced Allocation */
cfg->ea.ea_location = ptr;
pci_ea_fill_info(pcib, cfg);
break;
default:
break;
}
}
#if defined(__powerpc__)
/*
* Enable the MSI mapping window for all HyperTransport
* slaves. PCI-PCI bridges have their windows enabled via
* PCIB_MAP_MSI().
*/
if (cfg->ht.ht_slave != 0 && cfg->ht.ht_msimap != 0 &&
!(cfg->ht.ht_msictrl & PCIM_HTCMD_MSI_ENABLE)) {
device_printf(pcib,
"Enabling MSI window for HyperTransport slave at pci%d:%d:%d:%d\n",
cfg->domain, cfg->bus, cfg->slot, cfg->func);
cfg->ht.ht_msictrl |= PCIM_HTCMD_MSI_ENABLE;
WREG(cfg->ht.ht_msimap + PCIR_HT_COMMAND, cfg->ht.ht_msictrl,
2);
}
#endif
/* REG and WREG remain defined for use by the following functions */
}
/*
* PCI Vital Product Data
*/
#define PCI_VPD_TIMEOUT 1000000
static int
pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t *data)
{
int count = PCI_VPD_TIMEOUT;
KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned"));
WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg, 2);
while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) != 0x8000) {
if (--count < 0)
return (ENXIO);
DELAY(1); /* limit looping */
}
*data = (REG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, 4));
return (0);
}
#if 0
static int
pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t data)
{
int count = PCI_VPD_TIMEOUT;
KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned"));
WREG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, data, 4);
WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg | 0x8000, 2);
while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) == 0x8000) {
if (--count < 0)
return (ENXIO);
DELAY(1); /* limit looping */
}
return (0);
}
#endif
#undef PCI_VPD_TIMEOUT
struct vpd_readstate {
device_t pcib;
pcicfgregs *cfg;
uint32_t val;
int bytesinval;
int off;
uint8_t cksum;
};
static int
vpd_nextbyte(struct vpd_readstate *vrs, uint8_t *data)
{
uint32_t reg;
uint8_t byte;
if (vrs->bytesinval == 0) {
if (pci_read_vpd_reg(vrs->pcib, vrs->cfg, vrs->off, &reg))
return (ENXIO);
vrs->val = le32toh(reg);
vrs->off += 4;
byte = vrs->val & 0xff;
vrs->bytesinval = 3;
} else {
vrs->val = vrs->val >> 8;
byte = vrs->val & 0xff;
vrs->bytesinval--;
}
vrs->cksum += byte;
*data = byte;
return (0);
}
static void
pci_read_vpd(device_t pcib, pcicfgregs *cfg)
{
struct vpd_readstate vrs;
int state;
int name;
int remain;
int i;
int alloc, off; /* alloc/off for RO/W arrays */
int cksumvalid;
int dflen;
uint8_t byte;
uint8_t byte2;
/* init vpd reader */
vrs.bytesinval = 0;
vrs.off = 0;
vrs.pcib = pcib;
vrs.cfg = cfg;
vrs.cksum = 0;
state = 0;
name = remain = i = 0; /* shut up stupid gcc */
alloc = off = 0; /* shut up stupid gcc */
dflen = 0; /* shut up stupid gcc */
cksumvalid = -1;
while (state >= 0) {
if (vpd_nextbyte(&vrs, &byte)) {
state = -2;
break;
}
#if 0
printf("vpd: val: %#x, off: %d, bytesinval: %d, byte: %#hhx, " \
"state: %d, remain: %d, name: %#x, i: %d\n", vrs.val,
vrs.off, vrs.bytesinval, byte, state, remain, name, i);
#endif
switch (state) {
case 0: /* item name */
if (byte & 0x80) {
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
remain = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
remain |= byte2 << 8;
if (remain > (0x7f*4 - vrs.off)) {
state = -1;
pci_printf(cfg,
"invalid VPD data, remain %#x\n",
remain);
}
name = byte & 0x7f;
} else {
remain = byte & 0x7;
name = (byte >> 3) & 0xf;
}
switch (name) {
case 0x2: /* String */
cfg->vpd.vpd_ident = malloc(remain + 1,
M_DEVBUF, M_WAITOK);
i = 0;
state = 1;
break;
case 0xf: /* End */
state = -1;
break;
case 0x10: /* VPD-R */
alloc = 8;
off = 0;
cfg->vpd.vpd_ros = malloc(alloc *
sizeof(*cfg->vpd.vpd_ros), M_DEVBUF,
M_WAITOK | M_ZERO);
state = 2;
break;
case 0x11: /* VPD-W */
alloc = 8;
off = 0;
cfg->vpd.vpd_w = malloc(alloc *
sizeof(*cfg->vpd.vpd_w), M_DEVBUF,
M_WAITOK | M_ZERO);
state = 5;
break;
default: /* Invalid data, abort */
state = -1;
break;
}
break;
case 1: /* Identifier String */
cfg->vpd.vpd_ident[i++] = byte;
remain--;
if (remain == 0) {
cfg->vpd.vpd_ident[i] = '\0';
state = 0;
}
break;
case 2: /* VPD-R Keyword Header */
if (off == alloc) {
cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros,
(alloc *= 2) * sizeof(*cfg->vpd.vpd_ros),
M_DEVBUF, M_WAITOK | M_ZERO);
}
cfg->vpd.vpd_ros[off].keyword[0] = byte;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_ros[off].keyword[1] = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_ros[off].len = dflen = byte2;
if (dflen == 0 &&
strncmp(cfg->vpd.vpd_ros[off].keyword, "RV",
2) == 0) {
/*
* if this happens, we can't trust the rest
* of the VPD.
*/
pci_printf(cfg, "bad keyword length: %d\n",
dflen);
cksumvalid = 0;
state = -1;
break;
} else if (dflen == 0) {
cfg->vpd.vpd_ros[off].value = malloc(1 *
sizeof(*cfg->vpd.vpd_ros[off].value),
M_DEVBUF, M_WAITOK);
cfg->vpd.vpd_ros[off].value[0] = '\x00';
} else
cfg->vpd.vpd_ros[off].value = malloc(
(dflen + 1) *
sizeof(*cfg->vpd.vpd_ros[off].value),
M_DEVBUF, M_WAITOK);
remain -= 3;
i = 0;
/* keep in sync w/ state 3's transitions */
if (dflen == 0 && remain == 0)
state = 0;
else if (dflen == 0)
state = 2;
else
state = 3;
break;
case 3: /* VPD-R Keyword Value */
cfg->vpd.vpd_ros[off].value[i++] = byte;
if (strncmp(cfg->vpd.vpd_ros[off].keyword,
"RV", 2) == 0 && cksumvalid == -1) {
if (vrs.cksum == 0)
cksumvalid = 1;
else {
if (bootverbose)
pci_printf(cfg,
"bad VPD cksum, remain %hhu\n",
vrs.cksum);
cksumvalid = 0;
state = -1;
break;
}
}
dflen--;
remain--;
/* keep in sync w/ state 2's transitions */
if (dflen == 0)
cfg->vpd.vpd_ros[off++].value[i++] = '\0';
if (dflen == 0 && remain == 0) {
cfg->vpd.vpd_rocnt = off;
cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros,
off * sizeof(*cfg->vpd.vpd_ros),
M_DEVBUF, M_WAITOK | M_ZERO);
state = 0;
} else if (dflen == 0)
state = 2;
break;
case 4:
remain--;
if (remain == 0)
state = 0;
break;
case 5: /* VPD-W Keyword Header */
if (off == alloc) {
cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w,
(alloc *= 2) * sizeof(*cfg->vpd.vpd_w),
M_DEVBUF, M_WAITOK | M_ZERO);
}
cfg->vpd.vpd_w[off].keyword[0] = byte;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_w[off].keyword[1] = byte2;
if (vpd_nextbyte(&vrs, &byte2)) {
state = -2;
break;
}
cfg->vpd.vpd_w[off].len = dflen = byte2;
cfg->vpd.vpd_w[off].start = vrs.off - vrs.bytesinval;
cfg->vpd.vpd_w[off].value = malloc((dflen + 1) *
sizeof(*cfg->vpd.vpd_w[off].value),
M_DEVBUF, M_WAITOK);
remain -= 3;
i = 0;
/* keep in sync w/ state 6's transitions */
if (dflen == 0 && remain == 0)
state = 0;
else if (dflen == 0)
state = 5;
else
state = 6;
break;
case 6: /* VPD-W Keyword Value */
cfg->vpd.vpd_w[off].value[i++] = byte;
dflen--;
remain--;
/* keep in sync w/ state 5's transitions */
if (dflen == 0)
cfg->vpd.vpd_w[off++].value[i++] = '\0';
if (dflen == 0 && remain == 0) {
cfg->vpd.vpd_wcnt = off;
cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w,
off * sizeof(*cfg->vpd.vpd_w),
M_DEVBUF, M_WAITOK | M_ZERO);
state = 0;
} else if (dflen == 0)
state = 5;
break;
default:
pci_printf(cfg, "invalid state: %d\n", state);
state = -1;
break;
}
}
if (cksumvalid == 0 || state < -1) {
/* read-only data bad, clean up */
if (cfg->vpd.vpd_ros != NULL) {
for (off = 0; cfg->vpd.vpd_ros[off].value; off++)
free(cfg->vpd.vpd_ros[off].value, M_DEVBUF);
free(cfg->vpd.vpd_ros, M_DEVBUF);
cfg->vpd.vpd_ros = NULL;
}
}
if (state < -1) {
/* I/O error, clean up */
pci_printf(cfg, "failed to read VPD data.\n");
if (cfg->vpd.vpd_ident != NULL) {
free(cfg->vpd.vpd_ident, M_DEVBUF);
cfg->vpd.vpd_ident = NULL;
}
if (cfg->vpd.vpd_w != NULL) {
for (off = 0; cfg->vpd.vpd_w[off].value; off++)
free(cfg->vpd.vpd_w[off].value, M_DEVBUF);
free(cfg->vpd.vpd_w, M_DEVBUF);
cfg->vpd.vpd_w = NULL;
}
}
cfg->vpd.vpd_cached = 1;
#undef REG
#undef WREG
}
int
pci_get_vpd_ident_method(device_t dev, device_t child, const char **identptr)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(dev), cfg);
*identptr = cfg->vpd.vpd_ident;
if (*identptr == NULL)
return (ENXIO);
return (0);
}
int
pci_get_vpd_readonly_method(device_t dev, device_t child, const char *kw,
const char **vptr)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
int i;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(dev), cfg);
for (i = 0; i < cfg->vpd.vpd_rocnt; i++)
if (memcmp(kw, cfg->vpd.vpd_ros[i].keyword,
sizeof(cfg->vpd.vpd_ros[i].keyword)) == 0) {
*vptr = cfg->vpd.vpd_ros[i].value;
return (0);
}
*vptr = NULL;
return (ENXIO);
}
struct pcicfg_vpd *
pci_fetch_vpd_list(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0)
pci_read_vpd(device_get_parent(device_get_parent(dev)), cfg);
return (&cfg->vpd);
}
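/*
 * Illustrative sketch (not part of this file): how a device driver
 * might consume the VPD accessors above through the pci(9) wrappers
 * pci_get_vpd_ident() and pci_get_vpd_readonly().  The function name
 * is hypothetical; "SN" is just an example of a standard VPD
 * read-only keyword.
 */
#if 0
static void
mydrv_print_vpd(device_t dev)
{
	const char *ident, *serial;

	if (pci_get_vpd_ident(dev, &ident) == 0)
		device_printf(dev, "VPD ident: %s\n", ident);
	if (pci_get_vpd_readonly(dev, "SN", &serial) == 0)
		device_printf(dev, "VPD serial number: %s\n", serial);
}
#endif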
/*
* Find the requested HyperTransport capability and return the offset
* in configuration space via the pointer provided. The function
* returns 0 on success and an error code otherwise.
*/
int
pci_find_htcap_method(device_t dev, device_t child, int capability, int *capreg)
{
int ptr, error;
uint16_t val;
error = pci_find_cap(child, PCIY_HT, &ptr);
if (error)
return (error);
/*
* Traverse the capabilities list checking each HT capability
* to see if it matches the requested HT capability.
*/
while (ptr != 0) {
val = pci_read_config(child, ptr + PCIR_HT_COMMAND, 2);
if (capability == PCIM_HTCAP_SLAVE ||
capability == PCIM_HTCAP_HOST)
val &= 0xe000;
else
val &= PCIM_HTCMD_CAP_MASK;
if (val == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
/* Skip to the next HT capability. */
while (ptr != 0) {
ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1);
if (pci_read_config(child, ptr + PCICAP_ID, 1) ==
PCIY_HT)
break;
}
}
return (ENOENT);
}
/*
* Find the requested capability and return the offset in
* configuration space via the pointer provided. The function returns
* 0 on success and an error code otherwise.
*/
int
pci_find_cap_method(device_t dev, device_t child, int capability,
int *capreg)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
u_int32_t status;
u_int8_t ptr;
/*
* Check the CAP_LIST bit of the PCI status register first.
*/
status = pci_read_config(child, PCIR_STATUS, 2);
if (!(status & PCIM_STATUS_CAPPRESENT))
return (ENXIO);
/*
* Determine the start pointer of the capabilities list.
*/
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
case PCIM_HDRTYPE_BRIDGE:
ptr = PCIR_CAP_PTR;
break;
case PCIM_HDRTYPE_CARDBUS:
ptr = PCIR_CAP_PTR_2;
break;
default:
/* XXX: panic? */
return (ENXIO); /* no extended capabilities support */
}
ptr = pci_read_config(child, ptr, 1);
/*
* Traverse the capabilities list.
*/
while (ptr != 0) {
if (pci_read_config(child, ptr + PCICAP_ID, 1) == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1);
}
return (ENOENT);
}
/*
* Find the requested extended capability and return the offset in
* configuration space via the pointer provided. The function returns
* 0 on success and an error code otherwise.
*/
int
pci_find_extcap_method(device_t dev, device_t child, int capability,
int *capreg)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint32_t ecap;
uint16_t ptr;
/* Only supported for PCI-express devices. */
if (cfg->pcie.pcie_location == 0)
return (ENXIO);
ptr = PCIR_EXTCAP;
ecap = pci_read_config(child, ptr, 4);
if (ecap == 0xffffffff || ecap == 0)
return (ENOENT);
for (;;) {
if (PCI_EXTCAP_ID(ecap) == capability) {
if (capreg != NULL)
*capreg = ptr;
return (0);
}
ptr = PCI_EXTCAP_NEXTPTR(ecap);
if (ptr == 0)
break;
ecap = pci_read_config(child, ptr, 4);
}
return (ENOENT);
}
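/*
 * Illustrative sketch (not part of this file): locating a standard
 * capability and an extended capability from a driver via the pci(9)
 * wrappers for the methods above.  PCIY_PMG and PCIZ_AER are example
 * capability IDs from <dev/pci/pcireg.h>; the function name is
 * hypothetical.
 */
#if 0
static void
mydrv_probe_caps(device_t dev)
{
	int pmc, aer;

	if (pci_find_cap(dev, PCIY_PMG, &pmc) == 0)
		device_printf(dev, "power management cap at 0x%x\n", pmc);
	if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0)
		device_printf(dev, "AER extended cap at 0x%x\n", aer);
}
#endif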
/*
* Support for MSI-X message interrupts.
*/
static void
pci_write_msix_entry(device_t dev, u_int index, uint64_t address, uint32_t data)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16;
bus_write_4(msix->msix_table_res, offset, address & 0xffffffff);
bus_write_4(msix->msix_table_res, offset + 4, address >> 32);
bus_write_4(msix->msix_table_res, offset + 8, data);
}
void
pci_enable_msix_method(device_t dev, device_t child, u_int index,
uint64_t address, uint32_t data)
{
if (pci_msix_rewrite_table) {
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
/*
* Some VM hosts require MSI-X to be disabled in the
* control register before any of the MSI-X table
* entries may be updated.  It is not enough to
* disable MSI-X only while updating a single entry;
* MSI-X must remain disabled while updating all
* entries in the table.
*/
pci_write_config(child,
msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl & ~PCIM_MSIXCTRL_MSIX_ENABLE, 2);
pci_resume_msix(child);
} else
pci_write_msix_entry(child, index, address, data);
/* Enable MSI -> HT mapping. */
pci_ht_map_msi(child, address);
}
void
pci_mask_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, val;
KASSERT(msix->msix_msgnum > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16 + 12;
val = bus_read_4(msix->msix_table_res, offset);
if (!(val & PCIM_MSIX_VCTRL_MASK)) {
val |= PCIM_MSIX_VCTRL_MASK;
bus_write_4(msix->msix_table_res, offset, val);
}
}
void
pci_unmask_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, val;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_table_offset + index * 16 + 12;
val = bus_read_4(msix->msix_table_res, offset);
if (val & PCIM_MSIX_VCTRL_MASK) {
val &= ~PCIM_MSIX_VCTRL_MASK;
bus_write_4(msix->msix_table_res, offset, val);
}
}
int
pci_pending_msix(device_t dev, u_int index)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
uint32_t offset, bit;
KASSERT(msix->msix_table_len > index, ("bogus index"));
offset = msix->msix_pba_offset + (index / 32) * 4;
bit = 1 << index % 32;
return (bus_read_4(msix->msix_pba_res, offset) & bit);
}
/*
* Restore MSI-X registers and table during resume. If MSI-X is
* enabled then walk the virtual table to restore the actual MSI-X
* table.
*/
static void
pci_resume_msix(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct msix_table_entry *mte;
struct msix_vector *mv;
int i;
if (msix->msix_alloc > 0) {
/* First, mask all vectors. */
for (i = 0; i < msix->msix_msgnum; i++)
pci_mask_msix(dev, i);
/* Second, program any messages with at least one handler. */
for (i = 0; i < msix->msix_table_len; i++) {
mte = &msix->msix_table[i];
if (mte->mte_vector == 0 || mte->mte_handlers == 0)
continue;
mv = &msix->msix_vectors[mte->mte_vector - 1];
pci_write_msix_entry(dev, i, mv->mv_address,
mv->mv_data);
pci_unmask_msix(dev, i);
}
}
pci_write_config(dev, msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl, 2);
}
/*
* Attempt to allocate *count MSI-X messages. The actual number allocated is
* returned in *count. After this function returns, each message will be
* available to the driver as SYS_RES_IRQ resources starting at rid 1.
*/
int
pci_alloc_msix_method(device_t dev, device_t child, int *count)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
int actual, error, i, irq, max;
/* Don't let count == 0 get us into trouble. */
if (*count == 0)
return (EINVAL);
/* If rid 0 is allocated, then fail. */
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0);
if (rle != NULL && rle->res != NULL)
return (ENXIO);
/* Already have allocated messages? */
if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0)
return (ENXIO);
/* If MSI-X is blacklisted for this system, fail. */
if (pci_msix_blacklisted())
return (ENXIO);
/* MSI-X capability present? */
if (cfg->msix.msix_location == 0 || !pci_do_msix)
return (ENODEV);
/* Make sure the appropriate BARs are mapped. */
rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY,
cfg->msix.msix_table_bar);
if (rle == NULL || rle->res == NULL ||
!(rman_get_flags(rle->res) & RF_ACTIVE))
return (ENXIO);
cfg->msix.msix_table_res = rle->res;
if (cfg->msix.msix_pba_bar != cfg->msix.msix_table_bar) {
rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY,
cfg->msix.msix_pba_bar);
if (rle == NULL || rle->res == NULL ||
!(rman_get_flags(rle->res) & RF_ACTIVE))
return (ENXIO);
}
cfg->msix.msix_pba_res = rle->res;
if (bootverbose)
device_printf(child,
"attempting to allocate %d MSI-X vectors (%d supported)\n",
*count, cfg->msix.msix_msgnum);
max = min(*count, cfg->msix.msix_msgnum);
for (i = 0; i < max; i++) {
/* Allocate a message. */
error = PCIB_ALLOC_MSIX(device_get_parent(dev), child, &irq);
if (error) {
if (i == 0)
return (error);
break;
}
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
actual = i;
if (bootverbose) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 1);
if (actual == 1)
device_printf(child, "using IRQ %ju for MSI-X\n",
rle->start);
else {
int run;
/*
* Be fancy and try to print contiguous runs of
* IRQ values as ranges. 'irq' is the previous IRQ.
* 'run' is true if we are in a range.
*/
device_printf(child, "using IRQs %ju", rle->start);
irq = rle->start;
run = 0;
for (i = 1; i < actual; i++) {
rle = resource_list_find(&dinfo->resources,
SYS_RES_IRQ, i + 1);
/* Still in a run? */
if (rle->start == irq + 1) {
run = 1;
irq++;
continue;
}
/* Finish previous range. */
if (run) {
printf("-%d", irq);
run = 0;
}
/* Start new range. */
printf(",%ju", rle->start);
irq = rle->start;
}
/* Unfinished range? */
if (run)
printf("-%d", irq);
printf(" for MSI-X\n");
}
}
/* Mask all vectors. */
for (i = 0; i < cfg->msix.msix_msgnum; i++)
pci_mask_msix(child, i);
/* Allocate and initialize vector data and virtual table. */
cfg->msix.msix_vectors = malloc(sizeof(struct msix_vector) * actual,
M_DEVBUF, M_WAITOK | M_ZERO);
cfg->msix.msix_table = malloc(sizeof(struct msix_table_entry) * actual,
M_DEVBUF, M_WAITOK | M_ZERO);
for (i = 0; i < actual; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
cfg->msix.msix_vectors[i].mv_irq = rle->start;
cfg->msix.msix_table[i].mte_vector = i + 1;
}
/* Update control register to enable MSI-X. */
cfg->msix.msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
pci_write_config(child, cfg->msix.msix_location + PCIR_MSIX_CTRL,
cfg->msix.msix_ctrl, 2);
/* Update counts of alloc'd messages. */
cfg->msix.msix_alloc = actual;
cfg->msix.msix_table_len = actual;
*count = actual;
return (0);
}
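/*
 * Illustrative sketch (not part of this file, error handling elided):
 * typical driver-side MSI-X allocation against the method above.  The
 * BAR holding the MSI-X table must already be allocated and active,
 * as checked above.  The softc layout and function name are
 * hypothetical.
 */
#if 0
static int
mydrv_setup_msix(device_t dev, struct mydrv_softc *sc)
{
	int count, error, rid;

	rid = PCIR_BAR(0);
	sc->sc_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
	    RF_ACTIVE);
	if (sc->sc_mem == NULL)
		return (ENXIO);

	count = pci_msix_count(dev);
	if (count == 0)
		return (ENODEV);
	error = pci_alloc_msix(dev, &count);
	if (error != 0)
		return (error);
	/* 'count' now holds the number of messages actually granted. */
	sc->sc_nirq = count;

	rid = 1;	/* MSI-X vectors start at SYS_RES_IRQ rid 1. */
	sc->sc_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
	    RF_ACTIVE);
	return (sc->sc_irq != NULL ? 0 : ENXIO);
}
#endif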
/*
* By default, pci_alloc_msix() will assign the allocated IRQ
* resources consecutively to the first N messages in the MSI-X table.
* However, device drivers may want to use different layouts if they
* either receive fewer messages than they asked for, or they wish to
* populate the MSI-X table sparsely. This method allows the driver
* to specify what layout it wants. It must be called after a
* successful pci_alloc_msix() but before any of the associated
* SYS_RES_IRQ resources are allocated via bus_alloc_resource().
*
* The 'vectors' array contains 'count' message vectors. The array
* maps directly to the MSI-X table in that index 0 in the array
* specifies the vector for the first message in the MSI-X table, etc.
* The vector value in each array index can either be 0 to indicate
* that no vector should be assigned to a message slot, or it can be a
* number from 1 to N (where N is the count returned from a
* successful call to pci_alloc_msix()) to indicate which message
* vector (IRQ) should be used for the corresponding message.
*
* On successful return, each message with a non-zero vector will have
* an associated SYS_RES_IRQ whose rid is equal to the array index +
* 1. Additionally, if any of the IRQs allocated via the previous
* call to pci_alloc_msix() are not used in the mapping, those IRQs
* will be freed back to the system automatically.
*
* For example, suppose a driver has an MSI-X table with 6 messages and
* asks for 6 messages, but pci_alloc_msix() only returns a count of
* 3. Call the three vectors allocated by pci_alloc_msix() A, B, and
* C. After the call to pci_alloc_msix(), the device will be set up to
* have an MSI-X table of ABC--- (where - means no vector assigned).
* If the driver then passes a vector array of { 1, 0, 1, 2, 0, 2 },
* then the MSI-X table will look like A-AB-B, and the 'C' vector will
* be freed back to the system. This device will also have valid
* SYS_RES_IRQ rids of 1, 3, 4, and 6.
*
* In any case, the SYS_RES_IRQ rid X will always map to the message
* at MSI-X table index X - 1 and will only be valid if a vector is
* assigned to that table entry.
*/
int
pci_remap_msix_method(device_t dev, device_t child, int count,
const u_int *vectors)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct resource_list_entry *rle;
int i, irq, j, *used;
/*
* Have to have at least one message in the table but the
* table can't be bigger than the actual MSI-X table in the
* device.
*/
if (count == 0 || count > msix->msix_msgnum)
return (EINVAL);
/* Sanity check the vectors. */
for (i = 0; i < count; i++)
if (vectors[i] > msix->msix_alloc)
return (EINVAL);
/*
* Make sure there aren't any holes in the vectors to be used.
* It's a big pain to support it, and it doesn't really make
* sense anyway. Also, at least one vector must be used.
*/
used = malloc(sizeof(int) * msix->msix_alloc, M_DEVBUF, M_WAITOK |
M_ZERO);
for (i = 0; i < count; i++)
if (vectors[i] != 0)
used[vectors[i] - 1] = 1;
for (i = 0; i < msix->msix_alloc - 1; i++)
if (used[i] == 0 && used[i + 1] == 1) {
free(used, M_DEVBUF);
return (EINVAL);
}
if (used[0] != 1) {
free(used, M_DEVBUF);
return (EINVAL);
}
/* Make sure none of the resources are allocated. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
if (msix->msix_table[i].mte_handlers > 0) {
free(used, M_DEVBUF);
return (EBUSY);
}
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing resource"));
if (rle->res != NULL) {
free(used, M_DEVBUF);
return (EBUSY);
}
}
/* Free the existing resource list entries. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
}
/*
* Build the new virtual table keeping track of which vectors are
* used.
*/
free(msix->msix_table, M_DEVBUF);
msix->msix_table = malloc(sizeof(struct msix_table_entry) * count,
M_DEVBUF, M_WAITOK | M_ZERO);
for (i = 0; i < count; i++)
msix->msix_table[i].mte_vector = vectors[i];
msix->msix_table_len = count;
/* Free any unused IRQs and resize the vectors array if necessary. */
j = msix->msix_alloc - 1;
if (used[j] == 0) {
struct msix_vector *vec;
while (used[j] == 0) {
PCIB_RELEASE_MSIX(device_get_parent(dev), child,
msix->msix_vectors[j].mv_irq);
j--;
}
vec = malloc(sizeof(struct msix_vector) * (j + 1), M_DEVBUF,
M_WAITOK);
bcopy(msix->msix_vectors, vec, sizeof(struct msix_vector) *
(j + 1));
free(msix->msix_vectors, M_DEVBUF);
msix->msix_vectors = vec;
msix->msix_alloc = j + 1;
}
free(used, M_DEVBUF);
/* Map the IRQs onto the rids. */
for (i = 0; i < count; i++) {
if (vectors[i] == 0)
continue;
irq = msix->msix_vectors[vectors[i] - 1].mv_irq;
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq,
irq, 1);
}
if (bootverbose) {
device_printf(child, "Remapped MSI-X IRQs as: ");
for (i = 0; i < count; i++) {
if (i != 0)
printf(", ");
if (vectors[i] == 0)
printf("---");
else
printf("%d",
msix->msix_vectors[vectors[i] - 1].mv_irq);
}
printf("\n");
}
return (0);
}
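/*
 * Illustrative sketch (not part of this file): the sparse layout from
 * the comment above expressed as a call.  With 3 messages allocated
 * and a 6-entry MSI-X table, this produces the A-AB-B layout and
 * releases the third IRQ back to the system.  The function name is
 * hypothetical.
 */
#if 0
static int
mydrv_remap_msix(device_t dev)
{
	static const u_int vectors[] = { 1, 0, 1, 2, 0, 2 };

	return (pci_remap_msix(dev, nitems(vectors), vectors));
}
#endif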
static int
pci_release_msix(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
struct resource_list_entry *rle;
int i;
/* Do we have any messages to release? */
if (msix->msix_alloc == 0)
return (ENODEV);
/* Make sure none of the resources are allocated. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
if (msix->msix_table[i].mte_handlers > 0)
return (EBUSY);
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing resource"));
if (rle->res != NULL)
return (EBUSY);
}
/* Update control register to disable MSI-X. */
msix->msix_ctrl &= ~PCIM_MSIXCTRL_MSIX_ENABLE;
pci_write_config(child, msix->msix_location + PCIR_MSIX_CTRL,
msix->msix_ctrl, 2);
/* Free the resource list entries. */
for (i = 0; i < msix->msix_table_len; i++) {
if (msix->msix_table[i].mte_vector == 0)
continue;
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
}
free(msix->msix_table, M_DEVBUF);
msix->msix_table_len = 0;
/* Release the IRQs. */
for (i = 0; i < msix->msix_alloc; i++)
PCIB_RELEASE_MSIX(device_get_parent(dev), child,
msix->msix_vectors[i].mv_irq);
free(msix->msix_vectors, M_DEVBUF);
msix->msix_alloc = 0;
return (0);
}
/*
* Return the max supported MSI-X messages this device supports.
* Basically, assuming the MD code can alloc messages, this function
* should return the maximum value that pci_alloc_msix() can return.
* Thus, it is subject to the tunables, etc.
*/
int
pci_msix_count_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_msgnum);
return (0);
}
int
pci_msix_pba_bar_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_pba_bar);
return (-1);
}
int
pci_msix_table_bar_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msix *msix = &dinfo->cfg.msix;
if (pci_do_msix && msix->msix_location != 0)
return (msix->msix_table_bar);
return (-1);
}
/*
* HyperTransport MSI mapping control
*/
void
pci_ht_map_msi(device_t dev, uint64_t addr)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_ht *ht = &dinfo->cfg.ht;
if (!ht->ht_msimap)
return;
if (addr && !(ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) &&
ht->ht_msiaddr >> 20 == addr >> 20) {
/* Enable MSI -> HT mapping. */
ht->ht_msictrl |= PCIM_HTCMD_MSI_ENABLE;
pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND,
ht->ht_msictrl, 2);
}
if (!addr && ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) {
/* Disable MSI -> HT mapping. */
ht->ht_msictrl &= ~PCIM_HTCMD_MSI_ENABLE;
pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND,
ht->ht_msictrl, 2);
}
}
int
pci_get_max_payload(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= PCIEM_CTL_MAX_PAYLOAD;
val >>= 5;
return (1 << (val + 7));
}
int
pci_get_max_read_req(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= PCIEM_CTL_MAX_READ_REQUEST;
val >>= 12;
return (1 << (val + 7));
}
int
pci_set_max_read_req(device_t dev, int size)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
uint16_t val;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
if (size < 128)
size = 128;
if (size > 4096)
size = 4096;
size = (1 << (fls(size) - 1));
val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
val &= ~PCIEM_CTL_MAX_READ_REQUEST;
val |= (fls(size) - 8) << 12;
pci_write_config(dev, cap + PCIER_DEVICE_CTL, val, 2);
return (size);
}
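/*
 * Illustrative sketch (not part of this file): a driver tuning its
 * DMA read requests using the accessors above.  The 4096-byte target
 * is an arbitrary example; pci_set_max_read_req() rounds to a valid
 * power of two and returns the value actually programmed.  The
 * function name is hypothetical.
 */
#if 0
static void
mydrv_tune_pcie(device_t dev)
{
	int mps, mrrs;

	mps = pci_get_max_payload(dev);
	mrrs = pci_set_max_read_req(dev, 4096);
	device_printf(dev, "max payload %d bytes, max read request %d bytes\n",
	    mps, mrrs);
}
#endif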
uint32_t
pcie_read_config(device_t dev, int reg, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0) {
if (width == 2)
return (0xffff);
return (0xffffffff);
}
return (pci_read_config(dev, cap + reg, width));
}
void
pcie_write_config(device_t dev, int reg, uint32_t value, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return;
pci_write_config(dev, cap + reg, value, width);
}
/*
* Adjusts a PCI-e capability register by clearing the bits in mask
* and setting the bits in (value & mask). Bits not set in mask are
* not adjusted.
*
* Returns the old value on success or all ones on failure.
*/
uint32_t
pcie_adjust_config(device_t dev, int reg, uint32_t mask, uint32_t value,
int width)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint32_t old, new;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0) {
if (width == 2)
return (0xffff);
return (0xffffffff);
}
old = pci_read_config(dev, cap + reg, width);
new = old & ~mask;
new |= (value & mask);
pci_write_config(dev, cap + reg, new, width);
return (old);
}
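/*
 * Illustrative sketch (not part of this file): a read-modify-write of
 * the PCIe device control register through pcie_adjust_config().
 * Here the relaxed-ordering enable bit (PCIEM_CTL_RELAXED_ORD_ENABLE
 * from <dev/pci/pcireg.h>) is set while all other bits are left
 * untouched; the previous register value is returned.  The function
 * name is hypothetical.
 */
#if 0
static uint32_t
mydrv_enable_relaxed_ordering(device_t dev)
{
	return (pcie_adjust_config(dev, PCIER_DEVICE_CTL,
	    PCIEM_CTL_RELAXED_ORD_ENABLE, PCIEM_CTL_RELAXED_ORD_ENABLE, 2));
}
#endif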
/*
* Support for MSI message signalled interrupts.
*/
void
pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
uint16_t data)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
/* Write data and address values. */
pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR,
address & 0xffffffff, 4);
if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) {
pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR_HIGH,
address >> 32, 4);
pci_write_config(child, msi->msi_location + PCIR_MSI_DATA_64BIT,
data, 2);
} else
pci_write_config(child, msi->msi_location + PCIR_MSI_DATA, data,
2);
/* Enable MSI in the control register. */
msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
/* Enable MSI -> HT mapping. */
pci_ht_map_msi(child, address);
}
void
pci_disable_msi_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
/* Disable MSI -> HT mapping. */
pci_ht_map_msi(child, 0);
/* Disable MSI in the control register. */
msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
}
/*
* Restore MSI registers during resume. If MSI is enabled then
* restore the data and address registers in addition to the control
* register.
*/
static void
pci_resume_msi(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
uint64_t address;
uint16_t data;
if (msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE) {
address = msi->msi_addr;
data = msi->msi_data;
pci_write_config(dev, msi->msi_location + PCIR_MSI_ADDR,
address & 0xffffffff, 4);
if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) {
pci_write_config(dev, msi->msi_location +
PCIR_MSI_ADDR_HIGH, address >> 32, 4);
pci_write_config(dev, msi->msi_location +
PCIR_MSI_DATA_64BIT, data, 2);
} else
pci_write_config(dev, msi->msi_location + PCIR_MSI_DATA,
data, 2);
}
pci_write_config(dev, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl,
2);
}
static int
pci_remap_intr_method(device_t bus, device_t dev, u_int irq)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
struct msix_table_entry *mte;
struct msix_vector *mv;
uint64_t addr;
uint32_t data;
int error, i, j;
/*
* Handle MSI first. We try to find this IRQ among our list
* of MSI IRQs. If we find it, we request updated address and
* data registers and apply the results.
*/
if (cfg->msi.msi_alloc > 0) {
/* If we don't have any active handlers, nothing to do. */
if (cfg->msi.msi_handlers == 0)
return (0);
for (i = 0; i < cfg->msi.msi_alloc; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ,
i + 1);
if (rle->start == irq) {
error = PCIB_MAP_MSI(device_get_parent(bus),
dev, irq, &addr, &data);
if (error)
return (error);
pci_disable_msi(dev);
dinfo->cfg.msi.msi_addr = addr;
dinfo->cfg.msi.msi_data = data;
pci_enable_msi(dev, addr, data);
return (0);
}
}
return (ENOENT);
}
/*
* For MSI-X, we check to see if we have this IRQ. If we do,
* we request the updated mapping info. If that works, we go
* through all the slots that use this IRQ and update them.
*/
if (cfg->msix.msix_alloc > 0) {
for (i = 0; i < cfg->msix.msix_alloc; i++) {
mv = &cfg->msix.msix_vectors[i];
if (mv->mv_irq == irq) {
error = PCIB_MAP_MSI(device_get_parent(bus),
dev, irq, &addr, &data);
if (error)
return (error);
mv->mv_address = addr;
mv->mv_data = data;
for (j = 0; j < cfg->msix.msix_table_len; j++) {
mte = &cfg->msix.msix_table[j];
if (mte->mte_vector != i + 1)
continue;
if (mte->mte_handlers == 0)
continue;
pci_mask_msix(dev, j);
pci_enable_msix(dev, j, addr, data);
pci_unmask_msix(dev, j);
}
}
}
return (ENOENT);
}
return (ENOENT);
}
/*
* Returns true if the specified device is blacklisted because MSI
* doesn't work.
*/
int
pci_msi_device_blacklisted(device_t dev)
{
if (!pci_honor_msi_blacklist)
return (0);
return (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSI));
}
/*
* Determine if MSI is blacklisted globally on this system. Currently,
* we just check for blacklisted chipsets as represented by the
* host-PCI bridge at device 0:0:0. In the future, it may become
* necessary to check other system attributes, such as the kenv values
* that give the motherboard manufacturer and model number.
*/
static int
pci_msi_blacklisted(void)
{
device_t dev;
if (!pci_honor_msi_blacklist)
return (0);
/* Blacklist all non-PCI-express and non-PCI-X chipsets. */
if (!(pcie_chipset || pcix_chipset)) {
if (vm_guest != VM_GUEST_NO) {
/*
* Whitelist older chipsets in virtual
* machines known to support MSI.
*/
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL)
return (!pci_has_quirk(pci_get_devid(dev),
PCI_QUIRK_ENABLE_MSI_VM));
}
return (1);
}
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL)
return (pci_msi_device_blacklisted(dev));
return (0);
}
/*
* Returns true if the specified device is blacklisted because MSI-X
* doesn't work. Note that this assumes that if MSI doesn't work,
* MSI-X doesn't either.
*/
int
pci_msix_device_blacklisted(device_t dev)
{
if (!pci_honor_msi_blacklist)
return (0);
if (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSIX))
return (1);
return (pci_msi_device_blacklisted(dev));
}
/*
* Determine if MSI-X is blacklisted globally on this system. If MSI
* is blacklisted, assume that MSI-X is as well. Check for additional
* chipsets where MSI works but MSI-X does not.
*/
static int
pci_msix_blacklisted(void)
{
device_t dev;
if (!pci_honor_msi_blacklist)
return (0);
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL && pci_has_quirk(pci_get_devid(dev),
PCI_QUIRK_DISABLE_MSIX))
return (1);
return (pci_msi_blacklisted());
}
/*
* Attempt to allocate *count MSI messages. The actual number allocated is
* returned in *count. After this function returns, each message will be
* available to the driver as SYS_RES_IRQ resources starting at rid 1.
*/
int
pci_alloc_msi_method(device_t dev, device_t child, int *count)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
struct resource_list_entry *rle;
int actual, error, i, irqs[32];
uint16_t ctrl;
/* Don't let count == 0 get us into trouble. */
if (*count == 0)
return (EINVAL);
/* If rid 0 is allocated, then fail. */
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0);
if (rle != NULL && rle->res != NULL)
return (ENXIO);
/* Already have allocated messages? */
if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0)
return (ENXIO);
/* If MSI is blacklisted for this system, fail. */
if (pci_msi_blacklisted())
return (ENXIO);
/* MSI capability present? */
if (cfg->msi.msi_location == 0 || !pci_do_msi)
return (ENODEV);
if (bootverbose)
device_printf(child,
"attempting to allocate %d MSI vectors (%d supported)\n",
*count, cfg->msi.msi_msgnum);
/* Don't ask for more than the device supports. */
actual = min(*count, cfg->msi.msi_msgnum);
/* Don't ask for more than 32 messages. */
actual = min(actual, 32);
/* MSI requires power of 2 number of messages. */
if (!powerof2(actual))
return (EINVAL);
for (;;) {
/* Try to allocate N messages. */
error = PCIB_ALLOC_MSI(device_get_parent(dev), child, actual,
actual, irqs);
if (error == 0)
break;
if (actual == 1)
return (error);
/* Try N / 2. */
actual >>= 1;
}
/*
* We now have N actual messages mapped onto SYS_RES_IRQ
* resources in the irqs[] array, so add new resources
* starting at rid 1.
*/
for (i = 0; i < actual; i++)
resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1,
irqs[i], irqs[i], 1);
if (bootverbose) {
if (actual == 1)
device_printf(child, "using IRQ %d for MSI\n", irqs[0]);
else {
int run;
/*
* Be fancy and try to print contiguous runs
* of IRQ values as ranges. 'run' is true if
* we are in a range.
*/
device_printf(child, "using IRQs %d", irqs[0]);
run = 0;
for (i = 1; i < actual; i++) {
/* Still in a run? */
if (irqs[i] == irqs[i - 1] + 1) {
run = 1;
continue;
}
/* Finish previous range. */
if (run) {
printf("-%d", irqs[i - 1]);
run = 0;
}
/* Start new range. */
printf(",%d", irqs[i]);
}
/* Unfinished range? */
if (run)
printf("-%d", irqs[actual - 1]);
printf(" for MSI\n");
}
}
/* Update control register with actual count. */
ctrl = cfg->msi.msi_ctrl;
ctrl &= ~PCIM_MSICTRL_MME_MASK;
ctrl |= (ffs(actual) - 1) << 4;
cfg->msi.msi_ctrl = ctrl;
pci_write_config(child, cfg->msi.msi_location + PCIR_MSI_CTRL, ctrl, 2);
/* Update counts of alloc'd messages. */
cfg->msi.msi_alloc = actual;
cfg->msi.msi_handlers = 0;
*count = actual;
return (0);
}
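/*
 * Illustrative sketch (not part of this file): single-message MSI
 * setup from a driver's attach path using the method above, falling
 * back to the legacy INTx interrupt at rid 0 if MSI is unavailable.
 * The softc layout, mydrv_intr() handler, and function name are
 * hypothetical.
 */
#if 0
static int
mydrv_setup_intr(device_t dev, struct mydrv_softc *sc)
{
	int count, rid;

	count = 1;
	if (pci_alloc_msi(dev, &count) == 0)
		rid = 1;	/* MSI messages start at SYS_RES_IRQ rid 1. */
	else
		rid = 0;	/* Fall back to the legacy INTx interrupt. */
	sc->sc_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
	    RF_ACTIVE | (rid == 0 ? RF_SHAREABLE : 0));
	if (sc->sc_irq == NULL)
		return (ENXIO);
	return (bus_setup_intr(dev, sc->sc_irq, INTR_TYPE_MISC | INTR_MPSAFE,
	    NULL, mydrv_intr, sc, &sc->sc_intrhand));
}
#endif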
/* Release the MSI messages associated with this device. */
int
pci_release_msi_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
struct resource_list_entry *rle;
int error, i, irqs[32];
/* Try MSI-X first. */
error = pci_release_msix(dev, child);
if (error != ENODEV)
return (error);
/* Do we have any messages to release? */
if (msi->msi_alloc == 0)
return (ENODEV);
KASSERT(msi->msi_alloc <= 32, ("more than 32 alloc'd messages"));
/* Make sure none of the resources are allocated. */
if (msi->msi_handlers > 0)
return (EBUSY);
for (i = 0; i < msi->msi_alloc; i++) {
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1);
KASSERT(rle != NULL, ("missing MSI resource"));
if (rle->res != NULL)
return (EBUSY);
irqs[i] = rle->start;
}
/* Update control register with 0 count. */
KASSERT(!(msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE),
("%s: MSI still enabled", __func__));
msi->msi_ctrl &= ~PCIM_MSICTRL_MME_MASK;
pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
msi->msi_ctrl, 2);
/* Release the messages. */
PCIB_RELEASE_MSI(device_get_parent(dev), child, msi->msi_alloc, irqs);
for (i = 0; i < msi->msi_alloc; i++)
resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1);
/* Update alloc count. */
msi->msi_alloc = 0;
msi->msi_addr = 0;
msi->msi_data = 0;
return (0);
}
/*
* Return the max supported MSI messages this device supports.
* Basically, assuming the MD code can alloc messages, this function
* should return the maximum value that pci_alloc_msi() can return.
* Thus, it is subject to the tunables, etc.
*/
int
pci_msi_count_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct pcicfg_msi *msi = &dinfo->cfg.msi;
if (pci_do_msi && msi->msi_location != 0)
return (msi->msi_msgnum);
return (0);
}
/* Free the pcicfgregs structure and all dependent data structures. */
int
pci_freecfg(struct pci_devinfo *dinfo)
{
struct devlist *devlist_head;
struct pci_map *pm, *next;
int i;
devlist_head = &pci_devq;
if (dinfo->cfg.vpd.vpd_reg) {
free(dinfo->cfg.vpd.vpd_ident, M_DEVBUF);
for (i = 0; i < dinfo->cfg.vpd.vpd_rocnt; i++)
free(dinfo->cfg.vpd.vpd_ros[i].value, M_DEVBUF);
free(dinfo->cfg.vpd.vpd_ros, M_DEVBUF);
for (i = 0; i < dinfo->cfg.vpd.vpd_wcnt; i++)
free(dinfo->cfg.vpd.vpd_w[i].value, M_DEVBUF);
free(dinfo->cfg.vpd.vpd_w, M_DEVBUF);
}
STAILQ_FOREACH_SAFE(pm, &dinfo->cfg.maps, pm_link, next) {
free(pm, M_DEVBUF);
}
STAILQ_REMOVE(devlist_head, dinfo, pci_devinfo, pci_links);
free(dinfo, M_DEVBUF);
/* increment the generation count */
pci_generation++;
/* we're losing one device */
pci_numdevs--;
return (0);
}
/*
* PCI power management
*/
int
pci_set_powerstate_method(device_t dev, device_t child, int state)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint16_t status;
int oldstate, highest, delay;
if (cfg->pp.pp_cap == 0)
return (EOPNOTSUPP);
/*
* Optimize away a request for no state change. While it would be OK to
* write to the hardware in theory, some devices have shown odd
* behavior when going from D3 -> D3.
*/
oldstate = pci_get_powerstate(child);
if (oldstate == state)
return (0);
/*
* The PCI power management specification states that after a state
* transition between PCI power states, system software must
* guarantee a minimal delay before the function accesses the device.
* Compute the worst case delay that we need to guarantee before we
* access the device. Many devices will be responsive much more
* quickly than this delay, but there are some that don't respond
* instantly to state changes. Transitions to/from D3 state require
* 10ms, while D2 requires 200us, and D0/1 require none. The delay
* is done below with DELAY rather than a sleeper function because
* this function can be called from contexts where we cannot sleep.
*/
highest = (oldstate > state) ? oldstate : state;
if (highest == PCI_POWERSTATE_D3)
delay = 10000;
else if (highest == PCI_POWERSTATE_D2)
delay = 200;
else
delay = 0;
status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2)
& ~PCIM_PSTAT_DMASK;
switch (state) {
case PCI_POWERSTATE_D0:
status |= PCIM_PSTAT_D0;
break;
case PCI_POWERSTATE_D1:
if ((cfg->pp.pp_cap & PCIM_PCAP_D1SUPP) == 0)
return (EOPNOTSUPP);
status |= PCIM_PSTAT_D1;
break;
case PCI_POWERSTATE_D2:
if ((cfg->pp.pp_cap & PCIM_PCAP_D2SUPP) == 0)
return (EOPNOTSUPP);
status |= PCIM_PSTAT_D2;
break;
case PCI_POWERSTATE_D3:
status |= PCIM_PSTAT_D3;
break;
default:
return (EINVAL);
}
if (bootverbose)
pci_printf(cfg, "Transition from D%d to D%d\n", oldstate,
state);
PCI_WRITE_CONFIG(dev, child, cfg->pp.pp_status, status, 2);
if (delay)
DELAY(delay);
return (0);
}
int
pci_get_powerstate_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
uint16_t status;
int result;
if (cfg->pp.pp_cap != 0) {
status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2);
switch (status & PCIM_PSTAT_DMASK) {
case PCIM_PSTAT_D0:
result = PCI_POWERSTATE_D0;
break;
case PCIM_PSTAT_D1:
result = PCI_POWERSTATE_D1;
break;
case PCIM_PSTAT_D2:
result = PCI_POWERSTATE_D2;
break;
case PCIM_PSTAT_D3:
result = PCI_POWERSTATE_D3;
break;
default:
result = PCI_POWERSTATE_UNKNOWN;
break;
}
} else {
/* No support, device is always at D0 */
result = PCI_POWERSTATE_D0;
}
return (result);
}
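/*
 * Illustrative sketch (not part of this file): how a driver's suspend
 * and resume methods might use the power-state accessors above.  The
 * mandated settle delay after a transition is handled inside
 * pci_set_powerstate_method(), so callers need not DELAY() themselves.
 * The function names are hypothetical.
 */
#if 0
static int
mydrv_suspend(device_t dev)
{
	if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3)
		pci_set_powerstate(dev, PCI_POWERSTATE_D3);
	return (0);
}

static int
mydrv_resume(device_t dev)
{
	pci_set_powerstate(dev, PCI_POWERSTATE_D0);
	return (0);
}
#endif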
/*
* Some convenience functions for PCI device drivers.
*/
static __inline void
pci_set_command_bit(device_t dev, device_t child, uint16_t bit)
{
uint16_t command;
command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2);
command |= bit;
PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2);
}
static __inline void
pci_clear_command_bit(device_t dev, device_t child, uint16_t bit)
{
uint16_t command;
command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2);
command &= ~bit;
PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2);
}
int
pci_enable_busmaster_method(device_t dev, device_t child)
{
pci_set_command_bit(dev, child, PCIM_CMD_BUSMASTEREN);
return (0);
}
int
pci_disable_busmaster_method(device_t dev, device_t child)
{
pci_clear_command_bit(dev, child, PCIM_CMD_BUSMASTEREN);
return (0);
}
int
pci_enable_io_method(device_t dev, device_t child, int space)
{
uint16_t bit;
switch(space) {
case SYS_RES_IOPORT:
bit = PCIM_CMD_PORTEN;
break;
case SYS_RES_MEMORY:
bit = PCIM_CMD_MEMEN;
break;
default:
return (EINVAL);
}
pci_set_command_bit(dev, child, bit);
return (0);
}
int
pci_disable_io_method(device_t dev, device_t child, int space)
{
uint16_t bit;
switch(space) {
case SYS_RES_IOPORT:
bit = PCIM_CMD_PORTEN;
break;
case SYS_RES_MEMORY:
bit = PCIM_CMD_MEMEN;
break;
default:
return (EINVAL);
}
pci_clear_command_bit(dev, child, bit);
return (0);
}
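/*
 * Illustrative sketch (not part of this file): the usual attach-time
 * use of the command-register helpers above, enabling bus mastering
 * and memory-space decoding before touching the device.  The function
 * name is hypothetical.
 */
#if 0
static int
mydrv_attach(device_t dev)
{
	pci_enable_busmaster(dev);
	pci_enable_io(dev, SYS_RES_MEMORY);
	/* ... BAR and interrupt setup would follow here ... */
	return (0);
}
#endif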
/*
* New style pci driver. Parent device is either a pci-host-bridge or a
* pci-pci-bridge. Both kinds are represented by instances of pcib.
*/
void
pci_print_verbose(struct pci_devinfo *dinfo)
{
if (bootverbose) {
pcicfgregs *cfg = &dinfo->cfg;
printf("found->\tvendor=0x%04x, dev=0x%04x, revid=0x%02x\n",
cfg->vendor, cfg->device, cfg->revid);
printf("\tdomain=%d, bus=%d, slot=%d, func=%d\n",
cfg->domain, cfg->bus, cfg->slot, cfg->func);
printf("\tclass=%02x-%02x-%02x, hdrtype=0x%02x, mfdev=%d\n",
cfg->baseclass, cfg->subclass, cfg->progif, cfg->hdrtype,
cfg->mfdev);
printf("\tcmdreg=0x%04x, statreg=0x%04x, cachelnsz=%d (dwords)\n",
cfg->cmdreg, cfg->statreg, cfg->cachelnsz);
printf("\tlattimer=0x%02x (%d ns), mingnt=0x%02x (%d ns), maxlat=0x%02x (%d ns)\n",
cfg->lattimer, cfg->lattimer * 30, cfg->mingnt,
cfg->mingnt * 250, cfg->maxlat, cfg->maxlat * 250);
if (cfg->intpin > 0)
printf("\tintpin=%c, irq=%d\n",
cfg->intpin +'a' -1, cfg->intline);
if (cfg->pp.pp_cap) {
uint16_t status;
status = pci_read_config(cfg->dev, cfg->pp.pp_status, 2);
printf("\tpowerspec %d supports D0%s%s D3 current D%d\n",
cfg->pp.pp_cap & PCIM_PCAP_SPEC,
cfg->pp.pp_cap & PCIM_PCAP_D1SUPP ? " D1" : "",
cfg->pp.pp_cap & PCIM_PCAP_D2SUPP ? " D2" : "",
status & PCIM_PSTAT_DMASK);
}
if (cfg->msi.msi_location) {
int ctrl;
ctrl = cfg->msi.msi_ctrl;
printf("\tMSI supports %d message%s%s%s\n",
cfg->msi.msi_msgnum,
(cfg->msi.msi_msgnum == 1) ? "" : "s",
(ctrl & PCIM_MSICTRL_64BIT) ? ", 64 bit" : "",
(ctrl & PCIM_MSICTRL_VECTOR) ? ", vector masks":"");
}
if (cfg->msix.msix_location) {
printf("\tMSI-X supports %d message%s ",
cfg->msix.msix_msgnum,
(cfg->msix.msix_msgnum == 1) ? "" : "s");
if (cfg->msix.msix_table_bar == cfg->msix.msix_pba_bar)
printf("in map 0x%x\n",
cfg->msix.msix_table_bar);
else
printf("in maps 0x%x and 0x%x\n",
cfg->msix.msix_table_bar,
cfg->msix.msix_pba_bar);
}
}
}
static int
pci_porten(device_t dev)
{
return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_PORTEN) != 0;
}
static int
pci_memen(device_t dev)
{
return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_MEMEN) != 0;
}
void
pci_read_bar(device_t dev, int reg, pci_addr_t *mapp, pci_addr_t *testvalp,
int *bar64)
{
struct pci_devinfo *dinfo;
pci_addr_t map, testval;
int ln2range;
uint16_t cmd;
/*
* The device ROM BAR is special. It is always a 32-bit
* memory BAR. Bit 0 is special and should not be set when
* sizing the BAR.
*/
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, reg)) {
map = pci_read_config(dev, reg, 4);
pci_write_config(dev, reg, 0xfffffffe, 4);
testval = pci_read_config(dev, reg, 4);
pci_write_config(dev, reg, map, 4);
*mapp = map;
*testvalp = testval;
if (bar64 != NULL)
*bar64 = 0;
return;
}
map = pci_read_config(dev, reg, 4);
ln2range = pci_maprange(map);
if (ln2range == 64)
map |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32;
/*
* Disable decoding via the command register before
* determining the BAR's length since we will be placing it in
* a weird state.
*/
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
pci_write_config(dev, PCIR_COMMAND,
cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2);
/*
* Determine the BAR's length by writing all 1's. The bottom
* log_2(size) bits of the BAR will stick as 0 when we read
* the value back.
*
* NB: according to the PCI Local Bus Specification, rev. 3.0:
* "Software writes 0FFFFFFFFh to both registers, reads them back,
* and combines the result into a 64-bit value." (section 6.2.5.1)
*
* Writes to both registers must be performed before attempting to
* read back the size value.
*/
testval = 0;
pci_write_config(dev, reg, 0xffffffff, 4);
if (ln2range == 64) {
pci_write_config(dev, reg + 4, 0xffffffff, 4);
testval |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32;
}
testval |= pci_read_config(dev, reg, 4);
/*
* Restore the original value of the BAR. We may have reprogrammed
* the BAR of the low-level console device and when booting verbose,
* we need the console device addressable.
*/
pci_write_config(dev, reg, map, 4);
if (ln2range == 64)
pci_write_config(dev, reg + 4, map >> 32, 4);
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
*mapp = map;
*testvalp = testval;
if (bar64 != NULL)
*bar64 = (ln2range == 64);
}
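/*
 * Illustrative sketch (not part of this file): the standard BAR-size
 * arithmetic implied by the comment above, for a 32-bit memory BAR
 * only.  A test value of 0xfffff000 read back after writing all 1's
 * means the low 12 bits are hard-wired to zero, i.e. a 4 KB BAR.
 */
#if 0
static uint32_t
bar_size_from_testval(uint32_t testval)
{
	/* Clear the low flag bits; the size is the two's complement. */
	return (~(testval & PCIM_BAR_MEM_BASE) + 1);
}
#endif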
static void
pci_write_bar(device_t dev, struct pci_map *pm, pci_addr_t base)
{
struct pci_devinfo *dinfo;
int ln2range;
/* The device ROM BAR is always a 32-bit memory BAR. */
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg))
ln2range = 32;
else
ln2range = pci_maprange(pm->pm_value);
pci_write_config(dev, pm->pm_reg, base, 4);
if (ln2range == 64)
pci_write_config(dev, pm->pm_reg + 4, base >> 32, 4);
pm->pm_value = pci_read_config(dev, pm->pm_reg, 4);
if (ln2range == 64)
pm->pm_value |= (pci_addr_t)pci_read_config(dev,
pm->pm_reg + 4, 4) << 32;
}
struct pci_map *
pci_find_bar(device_t dev, int reg)
{
struct pci_devinfo *dinfo;
struct pci_map *pm;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) {
if (pm->pm_reg == reg)
return (pm);
}
return (NULL);
}
int
pci_bar_enabled(device_t dev, struct pci_map *pm)
{
struct pci_devinfo *dinfo;
uint16_t cmd;
dinfo = device_get_ivars(dev);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) &&
!(pm->pm_value & PCIM_BIOS_ENABLE))
return (0);
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) || PCI_BAR_MEM(pm->pm_value))
return ((cmd & PCIM_CMD_MEMEN) != 0);
else
return ((cmd & PCIM_CMD_PORTEN) != 0);
}
struct pci_map *
pci_add_bar(device_t dev, int reg, pci_addr_t value, pci_addr_t size)
{
struct pci_devinfo *dinfo;
struct pci_map *pm, *prev;
dinfo = device_get_ivars(dev);
pm = malloc(sizeof(*pm), M_DEVBUF, M_WAITOK | M_ZERO);
pm->pm_reg = reg;
pm->pm_value = value;
pm->pm_size = size;
STAILQ_FOREACH(prev, &dinfo->cfg.maps, pm_link) {
KASSERT(prev->pm_reg != pm->pm_reg, ("duplicate map %02x",
reg));
if (STAILQ_NEXT(prev, pm_link) == NULL ||
STAILQ_NEXT(prev, pm_link)->pm_reg > pm->pm_reg)
break;
}
if (prev != NULL)
STAILQ_INSERT_AFTER(&dinfo->cfg.maps, prev, pm, pm_link);
else
STAILQ_INSERT_TAIL(&dinfo->cfg.maps, pm, pm_link);
return (pm);
}
static void
pci_restore_bars(device_t dev)
{
struct pci_devinfo *dinfo;
struct pci_map *pm;
int ln2range;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) {
if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg))
ln2range = 32;
else
ln2range = pci_maprange(pm->pm_value);
pci_write_config(dev, pm->pm_reg, pm->pm_value, 4);
if (ln2range == 64)
pci_write_config(dev, pm->pm_reg + 4,
pm->pm_value >> 32, 4);
}
}
/*
* Add a resource based on a PCI map register. Return 1 if the map
* register is a 32-bit map register or 2 if it is a 64-bit register.
*/
static int
pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl,
int force, int prefetch)
{
struct pci_map *pm;
pci_addr_t base, map, testval;
pci_addr_t start, end, count;
int barlen, basezero, flags, maprange, mapsize, type;
uint16_t cmd;
struct resource *res;
/*
* The BAR may already exist if the device is a CardBus card
* whose CIS is stored in this BAR.
*/
pm = pci_find_bar(dev, reg);
if (pm != NULL) {
maprange = pci_maprange(pm->pm_value);
barlen = maprange == 64 ? 2 : 1;
return (barlen);
}
pci_read_bar(dev, reg, &map, &testval, NULL);
if (PCI_BAR_MEM(map)) {
type = SYS_RES_MEMORY;
if (map & PCIM_BAR_MEM_PREFETCH)
prefetch = 1;
} else
type = SYS_RES_IOPORT;
mapsize = pci_mapsize(testval);
base = pci_mapbase(map);
#ifdef __PCI_BAR_ZERO_VALID
basezero = 0;
#else
basezero = base == 0;
#endif
maprange = pci_maprange(map);
barlen = maprange == 64 ? 2 : 1;
/*
* For I/O registers, if bottom bit is set, and the next bit up
* isn't clear, we know we have a BAR that doesn't conform to the
* spec, so ignore it. Also, sanity check the size of the data
* areas against the type of resource involved: memory must be at
* least 16 bytes in size, while I/O ranges must be at least 4.
*/
if (PCI_BAR_IO(testval) && (testval & PCIM_BAR_IO_RESERVED) != 0)
return (barlen);
if ((type == SYS_RES_MEMORY && mapsize < 4) ||
(type == SYS_RES_IOPORT && mapsize < 2))
return (barlen);
/* Save a record of this BAR. */
pm = pci_add_bar(dev, reg, map, mapsize);
if (bootverbose) {
printf("\tmap[%02x]: type %s, range %2d, base %#jx, size %2d",
reg, pci_maptype(map), maprange, (uintmax_t)base, mapsize);
if (type == SYS_RES_IOPORT && !pci_porten(dev))
printf(", port disabled\n");
else if (type == SYS_RES_MEMORY && !pci_memen(dev))
printf(", memory disabled\n");
else
printf(", enabled\n");
}
/*
* If base is 0, then we have problems if this architecture does
* not allow that. It is best to ignore such entries for the
* moment. These will be allocated later if the driver specifically
* requests them. However, some removable buses look better when
* all resources are allocated, so allow '0' to be overridden.
*
* Similarly treat maps whose value is the same as the test value
* read back. These maps have had all f's written to them by the
* BIOS in an attempt to disable the resources.
*/
if (!force && (basezero || map == testval))
return (barlen);
if ((u_long)base != base) {
device_printf(bus,
"pci%d:%d:%d:%d bar %#x too many address bits",
pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev),
pci_get_function(dev), reg);
return (barlen);
}
/*
* This code theoretically does the right thing, but has
* undesirable side effects in some cases where peripherals
* respond oddly to having these bits enabled. Allow the user
* to turn them off (since pci_enable_io_modes is 1 by
* default).
*/
if (pci_enable_io_modes) {
/* Turn on resources that have been left off by a lazy BIOS */
if (type == SYS_RES_IOPORT && !pci_porten(dev)) {
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
cmd |= PCIM_CMD_PORTEN;
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
if (type == SYS_RES_MEMORY && !pci_memen(dev)) {
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
cmd |= PCIM_CMD_MEMEN;
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
} else {
if (type == SYS_RES_IOPORT && !pci_porten(dev))
return (barlen);
if (type == SYS_RES_MEMORY && !pci_memen(dev))
return (barlen);
}
count = (pci_addr_t)1 << mapsize;
flags = RF_ALIGNMENT_LOG2(mapsize);
if (prefetch)
flags |= RF_PREFETCHABLE;
if (basezero || base == pci_mapbase(testval) || pci_clear_bars) {
start = 0; /* Let the parent decide. */
end = ~0;
} else {
start = base;
end = base + count - 1;
}
resource_list_add(rl, type, reg, start, end, count);
/*
* Try to allocate the resource for this BAR from our parent
* so that this resource range is already reserved. The
* driver for this device will later inherit this resource in
* pci_alloc_resource().
*/
res = resource_list_reserve(rl, bus, dev, type, &reg, start, end, count,
flags);
if (pci_do_realloc_bars && res == NULL && (start != 0 || end != ~0)) {
/*
* If the allocation fails, try to allocate a resource for
* this BAR using any available range. The firmware felt
* it was important enough to assign a resource, so don't
* disable decoding if we can help it.
*/
resource_list_delete(rl, type, reg);
resource_list_add(rl, type, reg, 0, ~0, count);
res = resource_list_reserve(rl, bus, dev, type, &reg, 0, ~0,
count, flags);
}
if (res == NULL) {
/*
* If the allocation fails, delete the resource list entry
* and disable decoding for this device.
*
* If the driver requests this resource in the future,
* pci_reserve_map() will try to allocate a fresh
* resource range.
*/
resource_list_delete(rl, type, reg);
pci_disable_io(dev, type);
if (bootverbose)
device_printf(bus,
"pci%d:%d:%d:%d bar %#x failed to allocate\n",
pci_get_domain(dev), pci_get_bus(dev),
pci_get_slot(dev), pci_get_function(dev), reg);
} else {
start = rman_get_start(res);
pci_write_bar(dev, pm, start);
}
return (barlen);
}
/*
* For ATA devices we need to decide early what addressing mode to use.
* Legacy mode demands that the primary and secondary ATA ports sit at the
* same addresses that old ISA hardware did. This dictates that we use
* those addresses and ignore the BARs if we cannot set PCI native
* addressing mode.
*/
static void
pci_ata_maps(device_t bus, device_t dev, struct resource_list *rl, int force,
uint32_t prefetchmask)
{
int rid, type, progif;
#if 0
/* if this device supports PCI native addressing use it */
progif = pci_read_config(dev, PCIR_PROGIF, 1);
if ((progif & 0x8a) == 0x8a) {
if (pci_mapbase(pci_read_config(dev, PCIR_BAR(0), 4)) &&
pci_mapbase(pci_read_config(dev, PCIR_BAR(2), 4))) {
printf("Trying ATA native PCI addressing mode\n");
pci_write_config(dev, PCIR_PROGIF, progif | 0x05, 1);
}
}
#endif
progif = pci_read_config(dev, PCIR_PROGIF, 1);
type = SYS_RES_IOPORT;
if (progif & PCIP_STORAGE_IDE_MODEPRIM) {
pci_add_map(bus, dev, PCIR_BAR(0), rl, force,
prefetchmask & (1 << 0));
pci_add_map(bus, dev, PCIR_BAR(1), rl, force,
prefetchmask & (1 << 1));
} else {
rid = PCIR_BAR(0);
resource_list_add(rl, type, rid, 0x1f0, 0x1f7, 8);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x1f0,
0x1f7, 8, 0);
rid = PCIR_BAR(1);
resource_list_add(rl, type, rid, 0x3f6, 0x3f6, 1);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x3f6,
0x3f6, 1, 0);
}
if (progif & PCIP_STORAGE_IDE_MODESEC) {
pci_add_map(bus, dev, PCIR_BAR(2), rl, force,
prefetchmask & (1 << 2));
pci_add_map(bus, dev, PCIR_BAR(3), rl, force,
prefetchmask & (1 << 3));
} else {
rid = PCIR_BAR(2);
resource_list_add(rl, type, rid, 0x170, 0x177, 8);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x170,
0x177, 8, 0);
rid = PCIR_BAR(3);
resource_list_add(rl, type, rid, 0x376, 0x376, 1);
(void)resource_list_reserve(rl, bus, dev, type, &rid, 0x376,
0x376, 1, 0);
}
pci_add_map(bus, dev, PCIR_BAR(4), rl, force,
prefetchmask & (1 << 4));
pci_add_map(bus, dev, PCIR_BAR(5), rl, force,
prefetchmask & (1 << 5));
}
static void
pci_assign_interrupt(device_t bus, device_t dev, int force_route)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
pcicfgregs *cfg = &dinfo->cfg;
char tunable_name[64];
int irq;
/* Has to have an intpin to have an interrupt. */
if (cfg->intpin == 0)
return;
/* Let the user override the IRQ with a tunable. */
irq = PCI_INVALID_IRQ;
snprintf(tunable_name, sizeof(tunable_name),
"hw.pci%d.%d.%d.INT%c.irq",
cfg->domain, cfg->bus, cfg->slot, cfg->intpin + 'A' - 1);
if (TUNABLE_INT_FETCH(tunable_name, &irq) && (irq >= 255 || irq <= 0))
irq = PCI_INVALID_IRQ;
/*
* If we didn't get an IRQ via the tunable, then we either use the
* IRQ value in the intline register or we ask the bus to route an
* interrupt for us. If force_route is true, then we only use the
* value in the intline register if the bus was unable to assign an
* IRQ.
*/
if (!PCI_INTERRUPT_VALID(irq)) {
if (!PCI_INTERRUPT_VALID(cfg->intline) || force_route)
irq = PCI_ASSIGN_INTERRUPT(bus, dev);
if (!PCI_INTERRUPT_VALID(irq))
irq = cfg->intline;
}
/* If after all that we don't have an IRQ, just bail. */
if (!PCI_INTERRUPT_VALID(irq))
return;
/* Update the config register if it changed. */
if (irq != cfg->intline) {
cfg->intline = irq;
pci_write_config(dev, PCIR_INTLINE, irq, 1);
}
/* Add this IRQ as rid 0 interrupt resource. */
resource_list_add(&dinfo->resources, SYS_RES_IRQ, 0, irq, irq, 1);
}
/* Perform early OHCI takeover from SMM. */
static void
ohci_early_takeover(device_t self)
{
struct resource *res;
uint32_t ctl;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
ctl = bus_read_4(res, OHCI_CONTROL);
if (ctl & OHCI_IR) {
if (bootverbose)
printf("ohci early: "
"SMM active, request owner change\n");
bus_write_4(res, OHCI_COMMAND_STATUS, OHCI_OCR);
for (i = 0; (i < 100) && (ctl & OHCI_IR); i++) {
DELAY(1000);
ctl = bus_read_4(res, OHCI_CONTROL);
}
if (ctl & OHCI_IR) {
if (bootverbose)
printf("ohci early: "
"SMM does not respond, resetting\n");
bus_write_4(res, OHCI_CONTROL, OHCI_HCFS_RESET);
}
/* Disable interrupts */
bus_write_4(res, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
/* Perform early UHCI takeover from SMM. */
static void
uhci_early_takeover(device_t self)
{
struct resource *res;
int rid;
/*
* Set the PIRQD enable bit and switch off all the others. We don't
* want legacy support to interfere with us. XXX: Does this also mean
* that the BIOS won't touch the keyboard anymore if it is connected
* to the ports of the root hub?
*/
pci_write_config(self, PCI_LEGSUP, PCI_LEGSUP_USBPIRQDEN, 2);
/* Disable interrupts */
rid = PCI_UHCI_BASE_REG;
res = bus_alloc_resource_any(self, SYS_RES_IOPORT, &rid, RF_ACTIVE);
if (res != NULL) {
bus_write_2(res, UHCI_INTR, 0);
bus_release_resource(self, SYS_RES_IOPORT, rid, res);
}
}
/* Perform early EHCI takeover from SMM. */
static void
ehci_early_takeover(device_t self)
{
struct resource *res;
uint32_t cparams;
uint32_t eec;
uint8_t eecp;
uint8_t bios_sem;
uint8_t offs;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
cparams = bus_read_4(res, EHCI_HCCPARAMS);
/* Synchronise with the BIOS if it owns the controller. */
for (eecp = EHCI_HCC_EECP(cparams); eecp != 0;
eecp = EHCI_EECP_NEXT(eec)) {
eec = pci_read_config(self, eecp, 4);
if (EHCI_EECP_ID(eec) != EHCI_EC_LEGSUP) {
continue;
}
bios_sem = pci_read_config(self, eecp +
EHCI_LEGSUP_BIOS_SEM, 1);
if (bios_sem == 0) {
continue;
}
if (bootverbose)
printf("ehci early: "
"SMM active, request owner change\n");
pci_write_config(self, eecp + EHCI_LEGSUP_OS_SEM, 1, 1);
for (i = 0; (i < 100) && (bios_sem != 0); i++) {
DELAY(1000);
bios_sem = pci_read_config(self, eecp +
EHCI_LEGSUP_BIOS_SEM, 1);
}
if (bios_sem != 0) {
if (bootverbose)
printf("ehci early: "
"SMM does not respond\n");
}
/* Disable interrupts */
offs = EHCI_CAPLENGTH(bus_read_4(res, EHCI_CAPLEN_HCIVERSION));
bus_write_4(res, offs + EHCI_USBINTR, 0);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
/* Perform early XHCI takeover from SMM. */
static void
xhci_early_takeover(device_t self)
{
struct resource *res;
uint32_t cparams;
uint32_t eec;
uint8_t eecp;
uint8_t bios_sem;
uint8_t offs;
int rid;
int i;
rid = PCIR_BAR(0);
res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (res == NULL)
return;
cparams = bus_read_4(res, XHCI_HCSPARAMS0);
eec = -1;
/* Synchronise with the BIOS if it owns the controller. */
for (eecp = XHCI_HCS0_XECP(cparams) << 2; eecp != 0 && XHCI_XECP_NEXT(eec);
eecp += XHCI_XECP_NEXT(eec) << 2) {
eec = bus_read_4(res, eecp);
if (XHCI_XECP_ID(eec) != XHCI_ID_USB_LEGACY)
continue;
bios_sem = bus_read_1(res, eecp + XHCI_XECP_BIOS_SEM);
if (bios_sem == 0)
continue;
if (bootverbose)
printf("xhci early: "
"SMM active, request owner change\n");
bus_write_1(res, eecp + XHCI_XECP_OS_SEM, 1);
/* Wait a maximum of 5 seconds. */
for (i = 0; (i < 5000) && (bios_sem != 0); i++) {
DELAY(1000);
bios_sem = bus_read_1(res, eecp +
XHCI_XECP_BIOS_SEM);
}
if (bios_sem != 0) {
if (bootverbose)
printf("xhci early: "
"SMM does not respond\n");
}
/* Disable interrupts */
offs = bus_read_1(res, XHCI_CAPLENGTH);
bus_write_4(res, offs + XHCI_USBCMD, 0);
bus_read_4(res, offs + XHCI_USBSTS);
}
bus_release_resource(self, SYS_RES_MEMORY, rid, res);
}
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
static void
pci_reserve_secbus(device_t bus, device_t dev, pcicfgregs *cfg,
struct resource_list *rl)
{
struct resource *res;
char *cp;
rman_res_t start, end, count;
int rid, sec_bus, sec_reg, sub_bus, sub_reg, sup_bus;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
sub_reg = PCIR_SUBBUS_2;
break;
default:
return;
}
/*
* If the existing bus range is valid, attempt to reserve it
* from our parent. If this fails for any reason, clear the
* secbus and subbus registers.
*
* XXX: Should we reset sub_bus to sec_bus if it is < sec_bus?
* This would at least preserve the existing sec_bus if it is
* valid.
*/
sec_bus = PCI_READ_CONFIG(bus, dev, sec_reg, 1);
sub_bus = PCI_READ_CONFIG(bus, dev, sub_reg, 1);
/* Quirk handling. */
switch (pci_get_devid(dev)) {
case 0x12258086: /* Intel 82454KX/GX (Orion) */
sup_bus = pci_read_config(dev, 0x41, 1);
if (sup_bus != 0xff) {
sec_bus = sup_bus + 1;
sub_bus = sup_bus + 1;
PCI_WRITE_CONFIG(bus, dev, sec_reg, sec_bus, 1);
PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1);
}
break;
case 0x00dd10de:
/* Compaq R3000 BIOS sets wrong subordinate bus number. */
if ((cp = kern_getenv("smbios.planar.maker")) == NULL)
break;
if (strncmp(cp, "Compal", 6) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if ((cp = kern_getenv("smbios.planar.product")) == NULL)
break;
if (strncmp(cp, "08A0", 4) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if (sub_bus < 0xa) {
sub_bus = 0xa;
PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1);
}
break;
}
if (bootverbose)
printf("\tsecbus=%d, subbus=%d\n", sec_bus, sub_bus);
if (sec_bus > 0 && sub_bus >= sec_bus) {
start = sec_bus;
end = sub_bus;
count = end - start + 1;
resource_list_add(rl, PCI_RES_BUS, 0, 0, ~0, count);
/*
* If requested, clear secondary bus registers in
* bridge devices to force a complete renumbering
* rather than reserving the existing range. However,
* preserve the existing size.
*/
if (pci_clear_buses)
goto clear;
rid = 0;
res = resource_list_reserve(rl, bus, dev, PCI_RES_BUS, &rid,
start, end, count, 0);
if (res != NULL)
return;
if (bootverbose)
device_printf(bus,
"pci%d:%d:%d:%d secbus failed to allocate\n",
pci_get_domain(dev), pci_get_bus(dev),
pci_get_slot(dev), pci_get_function(dev));
}
clear:
PCI_WRITE_CONFIG(bus, dev, sec_reg, 0, 1);
PCI_WRITE_CONFIG(bus, dev, sub_reg, 0, 1);
}
static struct resource *
pci_alloc_secbus(device_t dev, device_t child, int *rid, rman_res_t start,
rman_res_t end, rman_res_t count, u_int flags)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
struct resource_list *rl;
struct resource *res;
int sec_reg, sub_reg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
rl = &dinfo->resources;
switch (cfg->hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
sub_reg = PCIR_SUBBUS_2;
break;
default:
return (NULL);
}
if (*rid != 0)
return (NULL);
if (resource_list_find(rl, PCI_RES_BUS, *rid) == NULL)
resource_list_add(rl, PCI_RES_BUS, *rid, start, end, count);
if (!resource_list_reserved(rl, PCI_RES_BUS, *rid)) {
res = resource_list_reserve(rl, dev, child, PCI_RES_BUS, rid,
start, end, count, flags & ~RF_ACTIVE);
if (res == NULL) {
resource_list_delete(rl, PCI_RES_BUS, *rid);
device_printf(child, "allocating %ju bus%s failed\n",
count, count == 1 ? "" : "es");
return (NULL);
}
if (bootverbose)
device_printf(child,
"Lazy allocation of %ju bus%s at %ju\n", count,
count == 1 ? "" : "es", rman_get_start(res));
PCI_WRITE_CONFIG(dev, child, sec_reg, rman_get_start(res), 1);
PCI_WRITE_CONFIG(dev, child, sub_reg, rman_get_end(res), 1);
}
return (resource_list_alloc(rl, dev, child, PCI_RES_BUS, rid, start,
end, count, flags));
}
#endif
static int
pci_ea_bei_to_rid(device_t dev, int bei)
{
#ifdef PCI_IOV
struct pci_devinfo *dinfo;
int iov_pos;
struct pcicfg_iov *iov;
dinfo = device_get_ivars(dev);
iov = dinfo->cfg.iov;
if (iov != NULL)
iov_pos = iov->iov_pos;
else
iov_pos = 0;
#endif
/* Check if matches BAR */
if ((bei >= PCIM_EA_BEI_BAR_0) &&
(bei <= PCIM_EA_BEI_BAR_5))
return (PCIR_BAR(bei));
/* Check ROM */
if (bei == PCIM_EA_BEI_ROM)
return (PCIR_BIOS);
#ifdef PCI_IOV
/* Check if matches VF_BAR */
if ((iov != NULL) && (bei >= PCIM_EA_BEI_VF_BAR_0) &&
(bei <= PCIM_EA_BEI_VF_BAR_5))
return (PCIR_SRIOV_BAR(bei - PCIM_EA_BEI_VF_BAR_0) +
iov_pos);
#endif
return (-1);
}
int
pci_ea_is_enabled(device_t dev, int rid)
{
struct pci_ea_entry *ea;
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) {
if (pci_ea_bei_to_rid(dev, ea->eae_bei) == rid)
return ((ea->eae_flags & PCIM_EA_ENABLE) > 0);
}
return (0);
}
void
pci_add_resources_ea(device_t bus, device_t dev, int alloc_iov)
{
struct pci_ea_entry *ea;
struct pci_devinfo *dinfo;
pci_addr_t start, end, count;
struct resource_list *rl;
int type, flags, rid;
struct resource *res;
uint32_t tmp;
#ifdef PCI_IOV
struct pcicfg_iov *iov;
#endif
dinfo = device_get_ivars(dev);
rl = &dinfo->resources;
flags = 0;
#ifdef PCI_IOV
iov = dinfo->cfg.iov;
#endif
if (dinfo->cfg.ea.ea_location == 0)
return;
STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) {
/*
* TODO: Ignore the EA-BAR if it is not enabled.
* Currently the EA implementation only supports the
* situation where the EA structure contains
* predefined entries. If they are not enabled,
* leave them unallocated and proceed with the
* legacy-BAR mechanism.
*/
if ((ea->eae_flags & PCIM_EA_ENABLE) == 0)
continue;
switch ((ea->eae_flags & PCIM_EA_PP) >> PCIM_EA_PP_OFFSET) {
case PCIM_EA_P_MEM_PREFETCH:
case PCIM_EA_P_VF_MEM_PREFETCH:
flags = RF_PREFETCHABLE;
/* FALLTHROUGH */
case PCIM_EA_P_VF_MEM:
case PCIM_EA_P_MEM:
type = SYS_RES_MEMORY;
break;
case PCIM_EA_P_IO:
type = SYS_RES_IOPORT;
break;
default:
continue;
}
if (alloc_iov != 0) {
#ifdef PCI_IOV
/* Allocating IOV, confirm BEI matches */
if ((ea->eae_bei < PCIM_EA_BEI_VF_BAR_0) ||
(ea->eae_bei > PCIM_EA_BEI_VF_BAR_5))
continue;
#else
continue;
#endif
} else {
/* Allocating BAR, confirm BEI matches */
if (((ea->eae_bei < PCIM_EA_BEI_BAR_0) ||
(ea->eae_bei > PCIM_EA_BEI_BAR_5)) &&
(ea->eae_bei != PCIM_EA_BEI_ROM))
continue;
}
rid = pci_ea_bei_to_rid(dev, ea->eae_bei);
if (rid < 0)
continue;
/* Skip resources already allocated by EA */
if ((resource_list_find(rl, SYS_RES_MEMORY, rid) != NULL) ||
(resource_list_find(rl, SYS_RES_IOPORT, rid) != NULL))
continue;
start = ea->eae_base;
count = ea->eae_max_offset + 1;
#ifdef PCI_IOV
if (iov != NULL)
count = count * iov->iov_num_vfs;
#endif
end = start + count - 1;
if (count == 0)
continue;
resource_list_add(rl, type, rid, start, end, count);
res = resource_list_reserve(rl, bus, dev, type, &rid, start, end, count,
flags);
if (res == NULL) {
resource_list_delete(rl, type, rid);
/*
* Failed to allocate using EA, so disable the entry.
* Another allocation attempt will be made later,
* but this time using the legacy BAR registers.
*/
tmp = pci_read_config(dev, ea->eae_cfg_offset, 4);
tmp &= ~PCIM_EA_ENABLE;
pci_write_config(dev, ea->eae_cfg_offset, tmp, 4);
/*
* Disabling the entry might fail if it is hardwired.
* Re-read the flags to reflect the current status.
*/
ea->eae_flags = pci_read_config(dev, ea->eae_cfg_offset, 4);
continue;
}
/* As per specification, fill BAR with zeros */
pci_write_config(dev, rid, 0, 4);
}
}
void
pci_add_resources(device_t bus, device_t dev, int force, uint32_t prefetchmask)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
struct resource_list *rl;
const struct pci_quirk *q;
uint32_t devid;
int i;
dinfo = device_get_ivars(dev);
cfg = &dinfo->cfg;
rl = &dinfo->resources;
devid = (cfg->device << 16) | cfg->vendor;
/* Allocate resources using Enhanced Allocation */
pci_add_resources_ea(bus, dev, 0);
/* ATA devices need special map treatment */
if ((pci_get_class(dev) == PCIC_STORAGE) &&
(pci_get_subclass(dev) == PCIS_STORAGE_IDE) &&
((pci_get_progif(dev) & PCIP_STORAGE_IDE_MASTERDEV) ||
(!pci_read_config(dev, PCIR_BAR(0), 4) &&
!pci_read_config(dev, PCIR_BAR(2), 4))) )
pci_ata_maps(bus, dev, rl, force, prefetchmask);
else
for (i = 0; i < cfg->nummaps;) {
/* Skip resources already managed by EA */
if ((resource_list_find(rl, SYS_RES_MEMORY, PCIR_BAR(i)) != NULL) ||
(resource_list_find(rl, SYS_RES_IOPORT, PCIR_BAR(i)) != NULL) ||
pci_ea_is_enabled(dev, PCIR_BAR(i))) {
i++;
continue;
}
/*
* Skip quirked resources.
*/
for (q = &pci_quirks[0]; q->devid != 0; q++)
if (q->devid == devid &&
q->type == PCI_QUIRK_UNMAP_REG &&
q->arg1 == PCIR_BAR(i))
break;
if (q->devid != 0) {
i++;
continue;
}
i += pci_add_map(bus, dev, PCIR_BAR(i), rl, force,
prefetchmask & (1 << i));
}
/*
* Add additional, quirked resources.
*/
for (q = &pci_quirks[0]; q->devid != 0; q++)
if (q->devid == devid && q->type == PCI_QUIRK_MAP_REG)
pci_add_map(bus, dev, q->arg1, rl, force, 0);
if (cfg->intpin > 0 && PCI_INTERRUPT_VALID(cfg->intline)) {
#ifdef __PCI_REROUTE_INTERRUPT
/*
* Try to re-route interrupts. Sometimes the BIOS or
* firmware may leave bogus values in these registers.
* If the re-route fails, then just stick with what we
* have.
*/
pci_assign_interrupt(bus, dev, 1);
#else
pci_assign_interrupt(bus, dev, 0);
#endif
}
if (pci_usb_takeover && pci_get_class(dev) == PCIC_SERIALBUS &&
pci_get_subclass(dev) == PCIS_SERIALBUS_USB) {
if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_XHCI)
xhci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_EHCI)
ehci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_OHCI)
ohci_early_takeover(dev);
else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_UHCI)
uhci_early_takeover(dev);
}
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
/*
* Reserve resources for secondary bus ranges behind bridge
* devices.
*/
pci_reserve_secbus(bus, dev, cfg, rl);
#endif
}
static struct pci_devinfo *
pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
int slot, int func)
{
struct pci_devinfo *dinfo;
dinfo = pci_read_device(pcib, dev, domain, busno, slot, func);
if (dinfo != NULL)
pci_add_child(dev, dinfo);
return (dinfo);
}
void
pci_add_children(device_t dev, int domain, int busno)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
device_t pcib = device_get_parent(dev);
struct pci_devinfo *dinfo;
int maxslots;
int s, f, pcifunchigh;
uint8_t hdrtype;
int first_func;
/*
* Try to detect a device at slot 0, function 0. If it exists, try to
* enable ARI. We must enable ARI before detecting the rest of the
* functions on this bus as ARI changes the set of slots and functions
* that are legal on this bus.
*/
dinfo = pci_identify_function(pcib, dev, domain, busno, 0, 0);
if (dinfo != NULL && pci_enable_ari)
PCIB_TRY_ENABLE_ARI(pcib, dinfo->cfg.dev);
/*
* Start looking for new devices on slot 0 at function 1 because we
* just identified the device at slot 0, function 0.
*/
first_func = 1;
maxslots = PCIB_MAXSLOTS(pcib);
for (s = 0; s <= maxslots; s++, first_func = 0) {
pcifunchigh = 0;
f = 0;
DELAY(1);
hdrtype = REG(PCIR_HDRTYPE, 1);
if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
continue;
if (hdrtype & PCIM_MFDEV)
pcifunchigh = PCIB_MAXFUNCS(pcib);
for (f = first_func; f <= pcifunchigh; f++)
pci_identify_function(pcib, dev, domain, busno, s, f);
}
#undef REG
}
int
pci_rescan_method(device_t dev)
{
#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
device_t pcib = device_get_parent(dev);
- struct pci_softc *sc;
device_t child, *devlist, *unchanged;
int devcount, error, i, j, maxslots, oldcount;
int busno, domain, s, f, pcifunchigh;
uint8_t hdrtype;
/* No need to check for ARI on a rescan. */
error = device_get_children(dev, &devlist, &devcount);
if (error)
return (error);
if (devcount != 0) {
unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
M_NOWAIT | M_ZERO);
if (unchanged == NULL) {
free(devlist, M_TEMP);
return (ENOMEM);
}
} else
unchanged = NULL;
- sc = device_get_softc(dev);
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
maxslots = PCIB_MAXSLOTS(pcib);
for (s = 0; s <= maxslots; s++) {
/* If function 0 is not present, skip to the next slot. */
f = 0;
if (REG(PCIR_VENDOR, 2) == 0xffff)
continue;
pcifunchigh = 0;
hdrtype = REG(PCIR_HDRTYPE, 1);
if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
continue;
if (hdrtype & PCIM_MFDEV)
pcifunchigh = PCIB_MAXFUNCS(pcib);
for (f = 0; f <= pcifunchigh; f++) {
if (REG(PCIR_VENDOR, 2) == 0xffff)
continue;
/*
* Found a valid function. Check if a
* device_t for this device already exists.
*/
for (i = 0; i < devcount; i++) {
child = devlist[i];
if (child == NULL)
continue;
if (pci_get_slot(child) == s &&
pci_get_function(child) == f) {
unchanged[i] = child;
goto next_func;
}
}
pci_identify_function(pcib, dev, domain, busno, s, f);
next_func:;
}
}
/* Remove devices that are no longer present. */
for (i = 0; i < devcount; i++) {
if (unchanged[i] != NULL)
continue;
device_delete_child(dev, devlist[i]);
}
free(devlist, M_TEMP);
oldcount = devcount;
/* Try to attach the devices just added. */
error = device_get_children(dev, &devlist, &devcount);
if (error) {
free(unchanged, M_TEMP);
return (error);
}
for (i = 0; i < devcount; i++) {
for (j = 0; j < oldcount; j++) {
if (devlist[i] == unchanged[j])
goto next_device;
}
device_probe_and_attach(devlist[i]);
next_device:;
}
free(unchanged, M_TEMP);
free(devlist, M_TEMP);
return (0);
#undef REG
}
#ifdef PCI_IOV
device_t
pci_add_iov_child(device_t bus, device_t pf, uint16_t rid, uint16_t vid,
uint16_t did)
{
- struct pci_devinfo *pf_dinfo, *vf_dinfo;
+ struct pci_devinfo *vf_dinfo;
device_t pcib;
int busno, slot, func;
-
- pf_dinfo = device_get_ivars(pf);
pcib = device_get_parent(bus);
PCIB_DECODE_RID(pcib, rid, &busno, &slot, &func);
vf_dinfo = pci_fill_devinfo(pcib, bus, pci_get_domain(pcib), busno,
slot, func, vid, did);
vf_dinfo->cfg.flags |= PCICFG_VF;
pci_add_child(bus, vf_dinfo);
return (vf_dinfo->cfg.dev);
}
device_t
pci_create_iov_child_method(device_t bus, device_t pf, uint16_t rid,
uint16_t vid, uint16_t did)
{
return (pci_add_iov_child(bus, pf, rid, vid, did));
}
#endif
void
pci_add_child(device_t bus, struct pci_devinfo *dinfo)
{
dinfo->cfg.dev = device_add_child(bus, NULL, -1);
device_set_ivars(dinfo->cfg.dev, dinfo);
resource_list_init(&dinfo->resources);
pci_cfg_save(dinfo->cfg.dev, dinfo, 0);
pci_cfg_restore(dinfo->cfg.dev, dinfo);
pci_print_verbose(dinfo);
pci_add_resources(bus, dinfo->cfg.dev, 0, 0);
pci_child_added(dinfo->cfg.dev);
EVENTHANDLER_INVOKE(pci_add_device, dinfo->cfg.dev);
}
void
pci_child_added_method(device_t dev, device_t child)
{
}
static int
pci_probe(device_t dev)
{
device_set_desc(dev, "PCI bus");
/* Allow other subclasses to override this driver. */
return (BUS_PROBE_GENERIC);
}
int
pci_attach_common(device_t dev)
{
struct pci_softc *sc;
int busno, domain;
#ifdef PCI_DMA_BOUNDARY
int error, tag_valid;
#endif
#ifdef PCI_RES_BUS
int rid;
#endif
sc = device_get_softc(dev);
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
#ifdef PCI_RES_BUS
rid = 0;
sc->sc_bus = bus_alloc_resource(dev, PCI_RES_BUS, &rid, busno, busno,
1, 0);
if (sc->sc_bus == NULL) {
device_printf(dev, "failed to allocate bus number\n");
return (ENXIO);
}
#endif
if (bootverbose)
device_printf(dev, "domain=%d, physical bus=%d\n",
domain, busno);
#ifdef PCI_DMA_BOUNDARY
tag_valid = 0;
if (device_get_devclass(device_get_parent(device_get_parent(dev))) !=
devclass_find("pci")) {
error = bus_dma_tag_create(bus_get_dma_tag(dev), 1,
PCI_DMA_BOUNDARY, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED,
BUS_SPACE_MAXSIZE, 0, NULL, NULL, &sc->sc_dma_tag);
if (error)
device_printf(dev, "Failed to create DMA tag: %d\n",
error);
else
tag_valid = 1;
}
if (!tag_valid)
#endif
sc->sc_dma_tag = bus_get_dma_tag(dev);
return (0);
}
static int
pci_attach(device_t dev)
{
int busno, domain, error;
error = pci_attach_common(dev);
if (error)
return (error);
/*
* Since there can be multiple independently numbered PCI
* buses on systems with multiple PCI domains, we can't use
* the unit number to decide which bus we are probing. We ask
* the parent pcib what our domain and bus numbers are.
*/
domain = pcib_get_domain(dev);
busno = pcib_get_bus(dev);
pci_add_children(dev, domain, busno);
return (bus_generic_attach(dev));
}
static int
pci_detach(device_t dev)
{
#ifdef PCI_RES_BUS
struct pci_softc *sc;
#endif
int error;
error = bus_generic_detach(dev);
if (error)
return (error);
#ifdef PCI_RES_BUS
sc = device_get_softc(dev);
error = bus_release_resource(dev, PCI_RES_BUS, 0, sc->sc_bus);
if (error)
return (error);
#endif
return (device_delete_children(dev));
}
static void
pci_hint_device_unit(device_t dev, device_t child, const char *name, int *unitp)
{
int line, unit;
const char *at;
char me1[24], me2[32];
uint8_t b, s, f;
uint32_t d;
d = pci_get_domain(child);
b = pci_get_bus(child);
s = pci_get_slot(child);
f = pci_get_function(child);
snprintf(me1, sizeof(me1), "pci%u:%u:%u", b, s, f);
snprintf(me2, sizeof(me2), "pci%u:%u:%u:%u", d, b, s, f);
line = 0;
while (resource_find_dev(&line, name, &unit, "at", NULL) == 0) {
resource_string_value(name, unit, "at", &at);
if (strcmp(at, me1) != 0 && strcmp(at, me2) != 0)
continue; /* No match, try next candidate */
*unitp = unit;
return;
}
}
static void
pci_set_power_child(device_t dev, device_t child, int state)
{
device_t pcib;
int dstate;
/*
* Set the device to the given state. If the firmware suggests
* a different power state, use it instead. If power management
* is not present, the firmware is responsible for managing
* device power. Skip children who aren't attached since they
* are handled separately.
*/
pcib = device_get_parent(dev);
dstate = state;
if (device_is_attached(child) &&
PCIB_POWER_FOR_SLEEP(pcib, child, &dstate) == 0)
pci_set_powerstate(child, dstate);
}
int
pci_suspend_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
int error;
dinfo = device_get_ivars(child);
/*
* Save the PCI configuration space for the child and set the
* device in the appropriate power state for this sleep state.
*/
pci_cfg_save(child, dinfo, 0);
/* Suspend devices before potentially powering them down. */
error = bus_generic_suspend_child(dev, child);
if (error)
return (error);
if (pci_do_power_suspend)
pci_set_power_child(dev, child, PCI_POWERSTATE_D3);
return (0);
}
int
pci_resume_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
if (pci_do_power_resume)
pci_set_power_child(dev, child, PCI_POWERSTATE_D0);
dinfo = device_get_ivars(child);
pci_cfg_restore(child, dinfo);
if (!device_is_attached(child))
pci_cfg_save(child, dinfo, 1);
bus_generic_resume_child(dev, child);
return (0);
}
int
pci_resume(device_t dev)
{
device_t child, *devlist;
int error, i, numdevs;
if ((error = device_get_children(dev, &devlist, &numdevs)) != 0)
return (error);
/*
* Resume critical devices first, then everything else later.
*/
for (i = 0; i < numdevs; i++) {
child = devlist[i];
switch (pci_get_class(child)) {
case PCIC_DISPLAY:
case PCIC_MEMORY:
case PCIC_BRIDGE:
case PCIC_BASEPERIPH:
BUS_RESUME_CHILD(dev, child);
break;
}
}
for (i = 0; i < numdevs; i++) {
child = devlist[i];
switch (pci_get_class(child)) {
case PCIC_DISPLAY:
case PCIC_MEMORY:
case PCIC_BRIDGE:
case PCIC_BASEPERIPH:
break;
default:
BUS_RESUME_CHILD(dev, child);
}
}
free(devlist, M_TEMP);
return (0);
}
static void
pci_load_vendor_data(void)
{
caddr_t data;
void *ptr;
size_t sz;
data = preload_search_by_type("pci_vendor_data");
if (data != NULL) {
ptr = preload_fetch_addr(data);
sz = preload_fetch_size(data);
if (ptr != NULL && sz != 0) {
pci_vendordata = ptr;
pci_vendordata_size = sz;
/* terminate the database */
pci_vendordata[pci_vendordata_size] = '\n';
}
}
}
void
pci_driver_added(device_t dev, driver_t *driver)
{
int numdevs;
device_t *devlist;
device_t child;
struct pci_devinfo *dinfo;
int i;
if (bootverbose)
device_printf(dev, "driver added\n");
DEVICE_IDENTIFY(driver, dev);
if (device_get_children(dev, &devlist, &numdevs) != 0)
return;
for (i = 0; i < numdevs; i++) {
child = devlist[i];
if (device_get_state(child) != DS_NOTPRESENT)
continue;
dinfo = device_get_ivars(child);
pci_print_verbose(dinfo);
if (bootverbose)
pci_printf(&dinfo->cfg, "reprobing on driver added\n");
pci_cfg_restore(child, dinfo);
if (device_probe_and_attach(child) != 0)
pci_child_detached(dev, child);
}
free(devlist, M_TEMP);
}
int
pci_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
{
struct pci_devinfo *dinfo;
struct msix_table_entry *mte;
struct msix_vector *mv;
uint64_t addr;
uint32_t data;
void *cookie;
int error, rid;
error = bus_generic_setup_intr(dev, child, irq, flags, filter, intr,
arg, &cookie);
if (error)
return (error);
/* If this is not a direct child, just bail out. */
if (device_get_parent(child) != dev) {
*cookiep = cookie;
return(0);
}
rid = rman_get_rid(irq);
if (rid == 0) {
/* Make sure that INTx is enabled */
pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS);
} else {
/*
* Check to see if the interrupt is MSI or MSI-X.
* Ask our parent to map the MSI and give
* us the address and data register values.
* If we fail for some reason, tear down the
* interrupt handler.
*/
dinfo = device_get_ivars(child);
if (dinfo->cfg.msi.msi_alloc > 0) {
if (dinfo->cfg.msi.msi_addr == 0) {
KASSERT(dinfo->cfg.msi.msi_handlers == 0,
("MSI has handlers, but vectors not mapped"));
error = PCIB_MAP_MSI(device_get_parent(dev),
child, rman_get_start(irq), &addr, &data);
if (error)
goto bad;
dinfo->cfg.msi.msi_addr = addr;
dinfo->cfg.msi.msi_data = data;
}
if (dinfo->cfg.msi.msi_handlers == 0)
pci_enable_msi(child, dinfo->cfg.msi.msi_addr,
dinfo->cfg.msi.msi_data);
dinfo->cfg.msi.msi_handlers++;
} else {
KASSERT(dinfo->cfg.msix.msix_alloc > 0,
("No MSI or MSI-X interrupts allocated"));
KASSERT(rid <= dinfo->cfg.msix.msix_table_len,
("MSI-X index too high"));
mte = &dinfo->cfg.msix.msix_table[rid - 1];
KASSERT(mte->mte_vector != 0, ("no message vector"));
mv = &dinfo->cfg.msix.msix_vectors[mte->mte_vector - 1];
KASSERT(mv->mv_irq == rman_get_start(irq),
("IRQ mismatch"));
if (mv->mv_address == 0) {
KASSERT(mte->mte_handlers == 0,
("MSI-X table entry has handlers, but vector not mapped"));
error = PCIB_MAP_MSI(device_get_parent(dev),
child, rman_get_start(irq), &addr, &data);
if (error)
goto bad;
mv->mv_address = addr;
mv->mv_data = data;
}
/*
* The MSI-X table entry must be made valid by
* incrementing the mte_handlers before calling
* pci_enable_msix() and pci_resume_msix();
* otherwise the MSI-X rewrite table quirk will
* not work as expected.
*/
mte->mte_handlers++;
if (mte->mte_handlers == 1) {
pci_enable_msix(child, rid - 1, mv->mv_address,
mv->mv_data);
pci_unmask_msix(child, rid - 1);
}
}
/*
* Make sure that INTx is disabled if we are using MSI/MSI-X,
* unless the device is affected by PCI_QUIRK_MSI_INTX_BUG,
* in which case we "enable" INTx so MSI/MSI-X actually works.
*/
if (!pci_has_quirk(pci_get_devid(child),
PCI_QUIRK_MSI_INTX_BUG))
pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS);
else
pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS);
bad:
if (error) {
(void)bus_generic_teardown_intr(dev, child, irq,
cookie);
return (error);
}
}
*cookiep = cookie;
return (0);
}
int
pci_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
struct msix_table_entry *mte;
struct resource_list_entry *rle;
struct pci_devinfo *dinfo;
int error, rid;
if (irq == NULL || !(rman_get_flags(irq) & RF_ACTIVE))
return (EINVAL);
/* If this isn't a direct child, just bail out */
if (device_get_parent(child) != dev)
return(bus_generic_teardown_intr(dev, child, irq, cookie));
rid = rman_get_rid(irq);
if (rid == 0) {
/* Mask INTx */
pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS);
} else {
/*
* Check to see if the interrupt is MSI or MSI-X. If so,
* decrement the appropriate handlers count and mask the
* MSI-X message, or disable MSI messages if the count
* drops to 0.
*/
dinfo = device_get_ivars(child);
rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, rid);
if (rle->res != irq)
return (EINVAL);
if (dinfo->cfg.msi.msi_alloc > 0) {
KASSERT(rid <= dinfo->cfg.msi.msi_alloc,
("MSI-X index too high"));
if (dinfo->cfg.msi.msi_handlers == 0)
return (EINVAL);
dinfo->cfg.msi.msi_handlers--;
if (dinfo->cfg.msi.msi_handlers == 0)
pci_disable_msi(child);
} else {
KASSERT(dinfo->cfg.msix.msix_alloc > 0,
("No MSI or MSI-X interrupts allocated"));
KASSERT(rid <= dinfo->cfg.msix.msix_table_len,
("MSI-X index too high"));
mte = &dinfo->cfg.msix.msix_table[rid - 1];
if (mte->mte_handlers == 0)
return (EINVAL);
mte->mte_handlers--;
if (mte->mte_handlers == 0)
pci_mask_msix(child, rid - 1);
}
}
error = bus_generic_teardown_intr(dev, child, irq, cookie);
if (rid > 0)
KASSERT(error == 0,
("%s: generic teardown failed for MSI/MSI-X", __func__));
return (error);
}
int
pci_print_child(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
int retval = 0;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
retval += bus_print_child_header(dev, child);
retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#jx");
retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
if (device_get_flags(dev))
retval += printf(" flags %#x", device_get_flags(dev));
retval += printf(" at device %d.%d", pci_get_slot(child),
pci_get_function(child));
retval += bus_print_child_domain(dev, child);
retval += bus_print_child_footer(dev, child);
return (retval);
}
static const struct
{
int class;
int subclass;
int report; /* 0 = bootverbose, 1 = always */
const char *desc;
} pci_nomatch_tab[] = {
{PCIC_OLD, -1, 1, "old"},
{PCIC_OLD, PCIS_OLD_NONVGA, 1, "non-VGA display device"},
{PCIC_OLD, PCIS_OLD_VGA, 1, "VGA-compatible display device"},
{PCIC_STORAGE, -1, 1, "mass storage"},
{PCIC_STORAGE, PCIS_STORAGE_SCSI, 1, "SCSI"},
{PCIC_STORAGE, PCIS_STORAGE_IDE, 1, "ATA"},
{PCIC_STORAGE, PCIS_STORAGE_FLOPPY, 1, "floppy disk"},
{PCIC_STORAGE, PCIS_STORAGE_IPI, 1, "IPI"},
{PCIC_STORAGE, PCIS_STORAGE_RAID, 1, "RAID"},
{PCIC_STORAGE, PCIS_STORAGE_ATA_ADMA, 1, "ATA (ADMA)"},
{PCIC_STORAGE, PCIS_STORAGE_SATA, 1, "SATA"},
{PCIC_STORAGE, PCIS_STORAGE_SAS, 1, "SAS"},
{PCIC_STORAGE, PCIS_STORAGE_NVM, 1, "NVM"},
{PCIC_NETWORK, -1, 1, "network"},
{PCIC_NETWORK, PCIS_NETWORK_ETHERNET, 1, "ethernet"},
{PCIC_NETWORK, PCIS_NETWORK_TOKENRING, 1, "token ring"},
{PCIC_NETWORK, PCIS_NETWORK_FDDI, 1, "fddi"},
{PCIC_NETWORK, PCIS_NETWORK_ATM, 1, "ATM"},
{PCIC_NETWORK, PCIS_NETWORK_ISDN, 1, "ISDN"},
{PCIC_DISPLAY, -1, 1, "display"},
{PCIC_DISPLAY, PCIS_DISPLAY_VGA, 1, "VGA"},
{PCIC_DISPLAY, PCIS_DISPLAY_XGA, 1, "XGA"},
{PCIC_DISPLAY, PCIS_DISPLAY_3D, 1, "3D"},
{PCIC_MULTIMEDIA, -1, 1, "multimedia"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_VIDEO, 1, "video"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_AUDIO, 1, "audio"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_TELE, 1, "telephony"},
{PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_HDA, 1, "HDA"},
{PCIC_MEMORY, -1, 1, "memory"},
{PCIC_MEMORY, PCIS_MEMORY_RAM, 1, "RAM"},
{PCIC_MEMORY, PCIS_MEMORY_FLASH, 1, "flash"},
{PCIC_BRIDGE, -1, 1, "bridge"},
{PCIC_BRIDGE, PCIS_BRIDGE_HOST, 1, "HOST-PCI"},
{PCIC_BRIDGE, PCIS_BRIDGE_ISA, 1, "PCI-ISA"},
{PCIC_BRIDGE, PCIS_BRIDGE_EISA, 1, "PCI-EISA"},
{PCIC_BRIDGE, PCIS_BRIDGE_MCA, 1, "PCI-MCA"},
{PCIC_BRIDGE, PCIS_BRIDGE_PCI, 1, "PCI-PCI"},
{PCIC_BRIDGE, PCIS_BRIDGE_PCMCIA, 1, "PCI-PCMCIA"},
{PCIC_BRIDGE, PCIS_BRIDGE_NUBUS, 1, "PCI-NuBus"},
{PCIC_BRIDGE, PCIS_BRIDGE_CARDBUS, 1, "PCI-CardBus"},
{PCIC_BRIDGE, PCIS_BRIDGE_RACEWAY, 1, "PCI-RACEway"},
{PCIC_SIMPLECOMM, -1, 1, "simple comms"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_UART, 1, "UART"}, /* could detect 16550 */
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_PAR, 1, "parallel port"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MULSER, 1, "multiport serial"},
{PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MODEM, 1, "generic modem"},
{PCIC_BASEPERIPH, -1, 0, "base peripheral"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_PIC, 1, "interrupt controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_DMA, 1, "DMA controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_TIMER, 1, "timer"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_RTC, 1, "realtime clock"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_PCIHOT, 1, "PCI hot-plug controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_SDHC, 1, "SD host controller"},
{PCIC_BASEPERIPH, PCIS_BASEPERIPH_IOMMU, 1, "IOMMU"},
{PCIC_INPUTDEV, -1, 1, "input device"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_KEYBOARD, 1, "keyboard"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_DIGITIZER,1, "digitizer"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_MOUSE, 1, "mouse"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_SCANNER, 1, "scanner"},
{PCIC_INPUTDEV, PCIS_INPUTDEV_GAMEPORT, 1, "gameport"},
{PCIC_DOCKING, -1, 1, "docking station"},
{PCIC_PROCESSOR, -1, 1, "processor"},
{PCIC_SERIALBUS, -1, 1, "serial bus"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_FW, 1, "FireWire"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_ACCESS, 1, "AccessBus"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_SSA, 1, "SSA"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_USB, 1, "USB"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_FC, 1, "Fibre Channel"},
{PCIC_SERIALBUS, PCIS_SERIALBUS_SMBUS, 0, "SMBus"},
{PCIC_WIRELESS, -1, 1, "wireless controller"},
{PCIC_WIRELESS, PCIS_WIRELESS_IRDA, 1, "iRDA"},
{PCIC_WIRELESS, PCIS_WIRELESS_IR, 1, "IR"},
{PCIC_WIRELESS, PCIS_WIRELESS_RF, 1, "RF"},
{PCIC_INTELLIIO, -1, 1, "intelligent I/O controller"},
{PCIC_INTELLIIO, PCIS_INTELLIIO_I2O, 1, "I2O"},
{PCIC_SATCOM, -1, 1, "satellite communication"},
{PCIC_SATCOM, PCIS_SATCOM_TV, 1, "sat TV"},
{PCIC_SATCOM, PCIS_SATCOM_AUDIO, 1, "sat audio"},
{PCIC_SATCOM, PCIS_SATCOM_VOICE, 1, "sat voice"},
{PCIC_SATCOM, PCIS_SATCOM_DATA, 1, "sat data"},
{PCIC_CRYPTO, -1, 1, "encrypt/decrypt"},
{PCIC_CRYPTO, PCIS_CRYPTO_NETCOMP, 1, "network/computer crypto"},
{PCIC_CRYPTO, PCIS_CRYPTO_ENTERTAIN, 1, "entertainment crypto"},
{PCIC_DASP, -1, 0, "dasp"},
{PCIC_DASP, PCIS_DASP_DPIO, 1, "DPIO module"},
{PCIC_DASP, PCIS_DASP_PERFCNTRS, 1, "performance counters"},
{PCIC_DASP, PCIS_DASP_COMM_SYNC, 1, "communication synchronizer"},
{PCIC_DASP, PCIS_DASP_MGMT_CARD, 1, "signal processing management"},
{0, 0, 0, NULL}
};
void
pci_probe_nomatch(device_t dev, device_t child)
{
int i, report;
const char *cp, *scp;
char *device;
/*
* Look for a listing for this device in a loaded device database.
*/
report = 1;
if ((device = pci_describe_device(child)) != NULL) {
device_printf(dev, "<%s>", device);
free(device, M_DEVBUF);
} else {
/*
* Scan the class/subclass descriptions for a general
* description.
*/
cp = "unknown";
scp = NULL;
for (i = 0; pci_nomatch_tab[i].desc != NULL; i++) {
if (pci_nomatch_tab[i].class == pci_get_class(child)) {
if (pci_nomatch_tab[i].subclass == -1) {
cp = pci_nomatch_tab[i].desc;
report = pci_nomatch_tab[i].report;
} else if (pci_nomatch_tab[i].subclass ==
pci_get_subclass(child)) {
scp = pci_nomatch_tab[i].desc;
report = pci_nomatch_tab[i].report;
}
}
}
if (report || bootverbose) {
device_printf(dev, "<%s%s%s>",
cp ? cp : "",
((cp != NULL) && (scp != NULL)) ? ", " : "",
scp ? scp : "");
}
}
if (report || bootverbose) {
printf(" at device %d.%d (no driver attached)\n",
pci_get_slot(child), pci_get_function(child));
}
pci_cfg_save(child, device_get_ivars(child), 1);
}
void
pci_child_detached(device_t dev, device_t child)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
/*
* Have to deallocate IRQs before releasing any MSI messages and
* have to release MSI messages before deallocating any memory
* BARs.
*/
if (resource_list_release_active(rl, dev, child, SYS_RES_IRQ) != 0)
pci_printf(&dinfo->cfg, "Device leaked IRQ resources\n");
if (dinfo->cfg.msi.msi_alloc != 0 || dinfo->cfg.msix.msix_alloc != 0) {
pci_printf(&dinfo->cfg, "Device leaked MSI vectors\n");
(void)pci_release_msi(child);
}
if (resource_list_release_active(rl, dev, child, SYS_RES_MEMORY) != 0)
pci_printf(&dinfo->cfg, "Device leaked memory resources\n");
if (resource_list_release_active(rl, dev, child, SYS_RES_IOPORT) != 0)
pci_printf(&dinfo->cfg, "Device leaked I/O resources\n");
#ifdef PCI_RES_BUS
if (resource_list_release_active(rl, dev, child, PCI_RES_BUS) != 0)
pci_printf(&dinfo->cfg, "Device leaked PCI bus numbers\n");
#endif
pci_cfg_save(child, dinfo, 1);
}
/*
* Parse the PCI device database, if loaded, and return a pointer to a
* description of the device.
*
* The database is flat text formatted as follows:
*
* Any line not in a valid format is ignored.
* Lines are terminated with newline '\n' characters.
*
* A VENDOR line consists of the 4 digit (hex) vendor code, a TAB, then
* the vendor name.
*
* A DEVICE line is entered immediately below the corresponding VENDOR ID.
* - devices cannot be listed without a corresponding VENDOR line.
* A DEVICE line consists of a TAB, the 4 digit (hex) device code,
* another TAB, then the device name.
*/
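/*
 * Illustrative example (not taken from the shipped database): a vendor
 * entry followed by one of its device entries would look like
 *
 * 1234<TAB>Example Vendor, Inc.
 * <TAB>abcd<TAB>Example Ethernet Controller
 *
 * where 1234 and abcd are hypothetical hex vendor and device codes.
 */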
/*
* Assuming (ptr) points to the beginning of a line in the database,
* return the vendor or device and description of the next entry.
* The value of (vendor) or (device) inappropriate for the entry type
* is set to -1. Returns nonzero at the end of the database.
*
* Note that this is not very robust in the face of corrupt data;
* we attempt to safeguard against this by spamming the end of the
* database with a newline when we initialise.
*/
static int
pci_describe_parse_line(char **ptr, int *vendor, int *device, char **desc)
{
char *cp = *ptr;
int left;
*device = -1;
*vendor = -1;
**desc = '\0';
for (;;) {
left = pci_vendordata_size - (cp - pci_vendordata);
if (left <= 0) {
*ptr = cp;
return(1);
}
/* vendor entry? */
if (*cp != '\t' &&
sscanf(cp, "%x\t%80[^\n]", vendor, *desc) == 2)
break;
/* device entry? */
if (*cp == '\t' &&
sscanf(cp, "%x\t%80[^\n]", device, *desc) == 2)
break;
/* skip to next line */
while (*cp != '\n' && left > 0) {
cp++;
left--;
}
if (*cp == '\n') {
cp++;
left--;
}
}
/* skip to next line */
while (*cp != '\n' && left > 0) {
cp++;
left--;
}
if (*cp == '\n' && left > 0)
cp++;
*ptr = cp;
return(0);
}
static char *
pci_describe_device(device_t dev)
{
int vendor, device;
char *desc, *vp, *dp, *line;
desc = vp = dp = NULL;
/*
* If we have no vendor data, we can't do anything.
*/
if (pci_vendordata == NULL)
goto out;
/*
* Scan the vendor data looking for this device
*/
line = pci_vendordata;
if ((vp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL)
goto out;
for (;;) {
if (pci_describe_parse_line(&line, &vendor, &device, &vp))
goto out;
if (vendor == pci_get_vendor(dev))
break;
}
if ((dp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL)
goto out;
for (;;) {
if (pci_describe_parse_line(&line, &vendor, &device, &dp)) {
*dp = 0;
break;
}
if (vendor != -1) {
*dp = 0;
break;
}
if (device == pci_get_device(dev))
break;
}
if (dp[0] == '\0')
snprintf(dp, 80, "0x%x", pci_get_device(dev));
if ((desc = malloc(strlen(vp) + strlen(dp) + 3, M_DEVBUF, M_NOWAIT)) !=
NULL)
sprintf(desc, "%s, %s", vp, dp);
out:
if (vp != NULL)
free(vp, M_DEVBUF);
if (dp != NULL)
free(dp, M_DEVBUF);
return(desc);
}
int
pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
switch (which) {
case PCI_IVAR_ETHADDR:
/*
* The generic accessor doesn't deal with failure, so
* we set the return value, then return an error.
*/
*((uint8_t **) result) = NULL;
return (EINVAL);
case PCI_IVAR_SUBVENDOR:
*result = cfg->subvendor;
break;
case PCI_IVAR_SUBDEVICE:
*result = cfg->subdevice;
break;
case PCI_IVAR_VENDOR:
*result = cfg->vendor;
break;
case PCI_IVAR_DEVICE:
*result = cfg->device;
break;
case PCI_IVAR_DEVID:
*result = (cfg->device << 16) | cfg->vendor;
break;
case PCI_IVAR_CLASS:
*result = cfg->baseclass;
break;
case PCI_IVAR_SUBCLASS:
*result = cfg->subclass;
break;
case PCI_IVAR_PROGIF:
*result = cfg->progif;
break;
case PCI_IVAR_REVID:
*result = cfg->revid;
break;
case PCI_IVAR_INTPIN:
*result = cfg->intpin;
break;
case PCI_IVAR_IRQ:
*result = cfg->intline;
break;
case PCI_IVAR_DOMAIN:
*result = cfg->domain;
break;
case PCI_IVAR_BUS:
*result = cfg->bus;
break;
case PCI_IVAR_SLOT:
*result = cfg->slot;
break;
case PCI_IVAR_FUNCTION:
*result = cfg->func;
break;
case PCI_IVAR_CMDREG:
*result = cfg->cmdreg;
break;
case PCI_IVAR_CACHELNSZ:
*result = cfg->cachelnsz;
break;
case PCI_IVAR_MINGNT:
if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) {
*result = -1;
return (EINVAL);
}
*result = cfg->mingnt;
break;
case PCI_IVAR_MAXLAT:
if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) {
*result = -1;
return (EINVAL);
}
*result = cfg->maxlat;
break;
case PCI_IVAR_LATTIMER:
*result = cfg->lattimer;
break;
default:
return (ENOENT);
}
return (0);
}
int
pci_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(child);
switch (which) {
case PCI_IVAR_INTPIN:
dinfo->cfg.intpin = value;
return (0);
case PCI_IVAR_ETHADDR:
case PCI_IVAR_SUBVENDOR:
case PCI_IVAR_SUBDEVICE:
case PCI_IVAR_VENDOR:
case PCI_IVAR_DEVICE:
case PCI_IVAR_DEVID:
case PCI_IVAR_CLASS:
case PCI_IVAR_SUBCLASS:
case PCI_IVAR_PROGIF:
case PCI_IVAR_REVID:
case PCI_IVAR_IRQ:
case PCI_IVAR_DOMAIN:
case PCI_IVAR_BUS:
case PCI_IVAR_SLOT:
case PCI_IVAR_FUNCTION:
return (EINVAL); /* disallow for now */
default:
return (ENOENT);
}
}
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#include <sys/cons.h>
/*
* List resources based on PCI map registers, for use from within ddb
*/
DB_SHOW_COMMAND(pciregs, db_pci_dump)
{
struct pci_devinfo *dinfo;
struct devlist *devlist_head;
struct pci_conf *p;
const char *name;
int i, error, none_count;
none_count = 0;
/* get the head of the device queue */
devlist_head = &pci_devq;
/*
* Go through the list of devices and print them out
*/
for (error = 0, i = 0,
dinfo = STAILQ_FIRST(devlist_head);
(dinfo != NULL) && (error == 0) && (i < pci_numdevs) && !db_pager_quit;
dinfo = STAILQ_NEXT(dinfo, pci_links), i++) {
/* Populate pd_name and pd_unit */
name = NULL;
if (dinfo->cfg.dev)
name = device_get_name(dinfo->cfg.dev);
p = &dinfo->conf;
db_printf("%s%d@pci%d:%d:%d:%d:\tclass=0x%06x card=0x%08x "
"chip=0x%08x rev=0x%02x hdr=0x%02x\n",
(name && *name) ? name : "none",
(name && *name) ? (int)device_get_unit(dinfo->cfg.dev) :
none_count++,
p->pc_sel.pc_domain, p->pc_sel.pc_bus, p->pc_sel.pc_dev,
p->pc_sel.pc_func, (p->pc_class << 16) |
(p->pc_subclass << 8) | p->pc_progif,
(p->pc_subdevice << 16) | p->pc_subvendor,
(p->pc_device << 16) | p->pc_vendor,
p->pc_revid, p->pc_hdr);
}
}
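/*
 * Example usage (values illustrative): at the ddb(4) prompt,
 * "show pciregs" walks the device queue and prints one line per
 * function, e.g.
 *
 * em0@pci0:0:25:0:	class=0x020000 card=0x12345678 chip=0x10d38086 rev=0x00 hdr=0x00
 */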
#endif /* DDB */
static struct resource *
pci_reserve_map(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int num,
u_int flags)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
struct resource_list *rl = &dinfo->resources;
struct resource *res;
struct pci_map *pm;
uint16_t cmd;
pci_addr_t map, testval;
int mapsize;
res = NULL;
/* If rid is managed by EA, ignore it */
if (pci_ea_is_enabled(child, *rid))
goto out;
pm = pci_find_bar(child, *rid);
if (pm != NULL) {
/* This is a BAR that we failed to allocate earlier. */
mapsize = pm->pm_size;
map = pm->pm_value;
} else {
/*
* Weed out the bogons, and figure out how large the
* BAR/map is. BARs that read back 0 here are bogus
* and unimplemented. Note: atapci devices in legacy
* mode are special and handled elsewhere in the code.
* If you have an atapci device in legacy mode and it
* fails here, that other code is broken.
*/
pci_read_bar(child, *rid, &map, &testval, NULL);
/*
* Determine the size of the BAR and ignore BARs with a size
* of 0. Device ROM BARs use a different mask value.
*/
if (PCIR_IS_BIOS(&dinfo->cfg, *rid))
mapsize = pci_romsize(testval);
else
mapsize = pci_mapsize(testval);
if (mapsize == 0)
goto out;
pm = pci_add_bar(child, *rid, map, mapsize);
}
if (PCI_BAR_MEM(map) || PCIR_IS_BIOS(&dinfo->cfg, *rid)) {
if (type != SYS_RES_MEMORY) {
if (bootverbose)
device_printf(dev,
"child %s requested type %d for rid %#x,"
" but the BAR says it is a memio\n",
device_get_nameunit(child), type, *rid);
goto out;
}
} else {
if (type != SYS_RES_IOPORT) {
if (bootverbose)
device_printf(dev,
"child %s requested type %d for rid %#x,"
" but the BAR says it is an ioport\n",
device_get_nameunit(child), type, *rid);
goto out;
}
}
/*
* For real BARs, we need to override the size that
* the driver requests, because that's what the BAR
* actually uses and we would otherwise have a
* situation where we might allocate the excess to
* another driver, which won't work.
*/
count = ((pci_addr_t)1 << mapsize) * num;
if (RF_ALIGNMENT(flags) < mapsize)
flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize);
if (PCI_BAR_MEM(map) && (map & PCIM_BAR_MEM_PREFETCH))
flags |= RF_PREFETCHABLE;
/*
* Allocate enough resource, and then write back the
* appropriate BAR for that resource.
*/
resource_list_add(rl, type, *rid, start, end, count);
res = resource_list_reserve(rl, dev, child, type, rid, start, end,
count, flags & ~RF_ACTIVE);
if (res == NULL) {
resource_list_delete(rl, type, *rid);
device_printf(child,
"%#jx bytes of rid %#x res %d failed (%#jx, %#jx).\n",
count, *rid, type, start, end);
goto out;
}
if (bootverbose)
device_printf(child,
"Lazy allocation of %#jx bytes rid %#x type %d at %#jx\n",
count, *rid, type, rman_get_start(res));
/* Disable decoding via the CMD register before updating the BAR */
cmd = pci_read_config(child, PCIR_COMMAND, 2);
pci_write_config(child, PCIR_COMMAND,
cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2);
map = rman_get_start(res);
pci_write_bar(child, pm, map);
/* Restore the original value of the CMD register */
pci_write_config(child, PCIR_COMMAND, cmd, 2);
out:
return (res);
}
struct resource *
pci_alloc_multi_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_long num,
u_int flags)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
struct resource_list_entry *rle;
struct resource *res;
pcicfgregs *cfg;
/*
* Perform lazy resource allocation
*/
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
cfg = &dinfo->cfg;
switch (type) {
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
case PCI_RES_BUS:
return (pci_alloc_secbus(dev, child, rid, start, end, count,
flags));
#endif
case SYS_RES_IRQ:
/*
* Can't alloc legacy interrupt once MSI messages have
* been allocated.
*/
if (*rid == 0 && (cfg->msi.msi_alloc > 0 ||
cfg->msix.msix_alloc > 0))
return (NULL);
/*
* If the child device doesn't have an interrupt
* routed and is deserving of an interrupt, try to
* assign it one.
*/
if (*rid == 0 && !PCI_INTERRUPT_VALID(cfg->intline) &&
(cfg->intpin != 0))
pci_assign_interrupt(dev, child, 0);
break;
case SYS_RES_IOPORT:
case SYS_RES_MEMORY:
#ifdef NEW_PCIB
/*
* PCI-PCI bridge I/O window resources are not BARs.
* For those allocations just pass the request up the
* tree.
*/
if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE) {
switch (*rid) {
case PCIR_IOBASEL_1:
case PCIR_MEMBASE_1:
case PCIR_PMBASEL_1:
/*
* XXX: Should we bother creating a resource
* list entry?
*/
return (bus_generic_alloc_resource(dev, child,
type, rid, start, end, count, flags));
}
}
#endif
/* Reserve resources for this BAR if needed. */
rle = resource_list_find(rl, type, *rid);
if (rle == NULL) {
res = pci_reserve_map(dev, child, type, rid, start, end,
count, num, flags);
if (res == NULL)
return (NULL);
}
}
return (resource_list_alloc(rl, dev, child, type, rid,
start, end, count, flags));
}
struct resource *
pci_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
#ifdef PCI_IOV
struct pci_devinfo *dinfo;
#endif
if (device_get_parent(child) != dev)
return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
type, rid, start, end, count, flags));
#ifdef PCI_IOV
dinfo = device_get_ivars(child);
if (dinfo->cfg.flags & PCICFG_VF) {
switch (type) {
/* VFs can't have I/O BARs. */
case SYS_RES_IOPORT:
return (NULL);
case SYS_RES_MEMORY:
return (pci_vf_alloc_mem_resource(dev, child, rid,
start, end, count, flags));
}
/* Fall through for other types of resource allocations. */
}
#endif
return (pci_alloc_multi_resource(dev, child, type, rid, start, end,
count, 1, flags));
}
int
pci_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
pcicfgregs *cfg;
if (device_get_parent(child) != dev)
return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
type, rid, r));
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
#ifdef PCI_IOV
if (dinfo->cfg.flags & PCICFG_VF) {
switch (type) {
/* VFs can't have I/O BARs. */
case SYS_RES_IOPORT:
return (EDOOFUS);
case SYS_RES_MEMORY:
return (pci_vf_release_mem_resource(dev, child, rid,
r));
}
/* Fall through for other types of resource allocations. */
}
#endif
#ifdef NEW_PCIB
/*
* PCI-PCI bridge I/O window resources are not BARs. For
* those allocations just pass the request up the tree.
*/
if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE &&
(type == SYS_RES_IOPORT || type == SYS_RES_MEMORY)) {
switch (rid) {
case PCIR_IOBASEL_1:
case PCIR_MEMBASE_1:
case PCIR_PMBASEL_1:
return (bus_generic_release_resource(dev, child, type,
rid, r));
}
}
#endif
rl = &dinfo->resources;
return (resource_list_release(rl, dev, child, type, rid, r));
}
int
pci_activate_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pci_devinfo *dinfo;
int error;
error = bus_generic_activate_resource(dev, child, type, rid, r);
if (error)
return (error);
/* Enable decoding in the command register when activating BARs. */
if (device_get_parent(child) == dev) {
/* Device ROMs need their decoding explicitly enabled. */
dinfo = device_get_ivars(child);
if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid))
pci_write_bar(child, pci_find_bar(child, rid),
rman_get_start(r) | PCIM_BIOS_ENABLE);
switch (type) {
case SYS_RES_IOPORT:
case SYS_RES_MEMORY:
error = PCI_ENABLE_IO(dev, child, type);
break;
}
}
return (error);
}
int
pci_deactivate_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
struct pci_devinfo *dinfo;
int error;
error = bus_generic_deactivate_resource(dev, child, type, rid, r);
if (error)
return (error);
/* Disable decoding for device ROMs. */
if (device_get_parent(child) == dev) {
dinfo = device_get_ivars(child);
if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid))
pci_write_bar(child, pci_find_bar(child, rid),
rman_get_start(r));
}
return (0);
}
void
pci_child_deleted(device_t dev, device_t child)
{
struct resource_list_entry *rle;
struct resource_list *rl;
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
EVENTHANDLER_INVOKE(pci_delete_device, child);
/* Turn off access to resources we're about to free */
if (bus_child_present(child) != 0) {
pci_write_config(child, PCIR_COMMAND, pci_read_config(child,
PCIR_COMMAND, 2) & ~(PCIM_CMD_MEMEN | PCIM_CMD_PORTEN), 2);
pci_disable_busmaster(child);
}
/* Free all allocated resources */
STAILQ_FOREACH(rle, rl, link) {
if (rle->res) {
if (rman_get_flags(rle->res) & RF_ACTIVE ||
resource_list_busy(rl, rle->type, rle->rid)) {
pci_printf(&dinfo->cfg,
"Resource still owned, oops. "
"(type=%d, rid=%d, addr=%lx)\n",
rle->type, rle->rid,
rman_get_start(rle->res));
bus_release_resource(child, rle->type, rle->rid,
rle->res);
}
resource_list_unreserve(rl, dev, child, rle->type,
rle->rid);
}
}
resource_list_free(rl);
pci_freecfg(dinfo);
}
void
pci_delete_resource(device_t dev, device_t child, int type, int rid)
{
struct pci_devinfo *dinfo;
struct resource_list *rl;
struct resource_list_entry *rle;
if (device_get_parent(child) != dev)
return;
dinfo = device_get_ivars(child);
rl = &dinfo->resources;
rle = resource_list_find(rl, type, rid);
if (rle == NULL)
return;
if (rle->res) {
if (rman_get_flags(rle->res) & RF_ACTIVE ||
resource_list_busy(rl, type, rid)) {
device_printf(dev, "delete_resource: "
"Resource still owned by child, oops. "
"(type=%d, rid=%d, addr=%jx)\n",
type, rid, rman_get_start(rle->res));
return;
}
resource_list_unreserve(rl, dev, child, type, rid);
}
resource_list_delete(rl, type, rid);
}
struct resource_list *
pci_get_resource_list(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
return (&dinfo->resources);
}
bus_dma_tag_t
pci_get_dma_tag(device_t bus, device_t dev)
{
struct pci_softc *sc = device_get_softc(bus);
return (sc->sc_dma_tag);
}
uint32_t
pci_read_config_method(device_t dev, device_t child, int reg, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
#ifdef PCI_IOV
/*
* SR-IOV VFs don't implement the VID or DID registers, so we have to
* emulate them here.
*/
if (cfg->flags & PCICFG_VF) {
if (reg == PCIR_VENDOR) {
switch (width) {
case 4:
return (cfg->device << 16 | cfg->vendor);
case 2:
return (cfg->vendor);
case 1:
return (cfg->vendor & 0xff);
default:
return (0xffffffff);
}
} else if (reg == PCIR_DEVICE) {
switch (width) {
/* Note that an unaligned 4-byte read is an error. */
case 2:
return (cfg->device);
case 1:
return (cfg->device & 0xff);
default:
return (0xffffffff);
}
}
}
#endif
return (PCIB_READ_CONFIG(device_get_parent(dev),
cfg->bus, cfg->slot, cfg->func, reg, width));
}
void
pci_write_config_method(device_t dev, device_t child, int reg,
uint32_t val, int width)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
PCIB_WRITE_CONFIG(device_get_parent(dev),
cfg->bus, cfg->slot, cfg->func, reg, val, width);
}
int
pci_child_location_str_method(device_t dev, device_t child, char *buf,
size_t buflen)
{
snprintf(buf, buflen, "slot=%d function=%d dbsf=pci%d:%d:%d:%d",
pci_get_slot(child), pci_get_function(child), pci_get_domain(child),
pci_get_bus(child), pci_get_slot(child), pci_get_function(child));
return (0);
}
int
pci_child_pnpinfo_str_method(device_t dev, device_t child, char *buf,
size_t buflen)
{
struct pci_devinfo *dinfo;
pcicfgregs *cfg;
dinfo = device_get_ivars(child);
cfg = &dinfo->cfg;
snprintf(buf, buflen, "vendor=0x%04x device=0x%04x subvendor=0x%04x "
"subdevice=0x%04x class=0x%02x%02x%02x", cfg->vendor, cfg->device,
cfg->subvendor, cfg->subdevice, cfg->baseclass, cfg->subclass,
cfg->progif);
return (0);
}
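/*
 * Example output (hypothetical device): the two methods above produce
 * strings such as
 *
 * "slot=0 function=1 dbsf=pci0:2:0:1"
 * "vendor=0x1234 device=0xabcd subvendor=0x1234 subdevice=0x0001 class=0x020000"
 *
 * for domain 0, bus 2, slot 0, function 1 of a class 0x02 (network)
 * device.
 */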
int
pci_assign_interrupt_method(device_t dev, device_t child)
{
struct pci_devinfo *dinfo = device_get_ivars(child);
pcicfgregs *cfg = &dinfo->cfg;
return (PCIB_ROUTE_INTERRUPT(device_get_parent(dev), child,
cfg->intpin));
}
static void
pci_lookup(void *arg, const char *name, device_t *dev)
{
long val;
char *end;
int domain, bus, slot, func;
if (*dev != NULL)
return;
/*
* Accept pciconf-style selectors of either pciD:B:S:F or
* pciB:S:F. In the latter case, the domain is assumed to
* be zero.
*/
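/*
 * For example, "pci0:2:0:1" selects domain 0, bus 2, slot 0,
 * function 1; the short form "pci2:0:1" selects the same device
 * with the domain defaulting to 0.
 */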
if (strncmp(name, "pci", 3) != 0)
return;
val = strtol(name + 3, &end, 10);
if (val < 0 || val > INT_MAX || *end != ':')
return;
domain = val;
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX || *end != ':')
return;
bus = val;
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX)
return;
slot = val;
if (*end == ':') {
val = strtol(end + 1, &end, 10);
if (val < 0 || val > INT_MAX || *end != '\0')
return;
func = val;
} else if (*end == '\0') {
func = slot;
slot = bus;
bus = domain;
domain = 0;
} else
return;
if (domain > PCI_DOMAINMAX || bus > PCI_BUSMAX || slot > PCI_SLOTMAX ||
func > PCIE_ARI_FUNCMAX || (slot != 0 && func > PCI_FUNCMAX))
return;
*dev = pci_find_dbsf(domain, bus, slot, func);
}
static int
pci_modevent(module_t mod, int what, void *arg)
{
static struct cdev *pci_cdev;
static eventhandler_tag tag;
switch (what) {
case MOD_LOAD:
STAILQ_INIT(&pci_devq);
pci_generation = 0;
pci_cdev = make_dev(&pcicdev, 0, UID_ROOT, GID_WHEEL, 0644,
"pci");
pci_load_vendor_data();
tag = EVENTHANDLER_REGISTER(dev_lookup, pci_lookup, NULL,
1000);
break;
case MOD_UNLOAD:
if (tag != NULL)
EVENTHANDLER_DEREGISTER(dev_lookup, tag);
destroy_dev(pci_cdev);
break;
}
return (0);
}
static void
pci_cfg_restore_pcie(device_t dev, struct pci_devinfo *dinfo)
{
#define WREG(n, v) pci_write_config(dev, pos + (n), (v), 2)
struct pcicfg_pcie *cfg;
int version, pos;
cfg = &dinfo->cfg.pcie;
pos = cfg->pcie_location;
version = cfg->pcie_flags & PCIEM_FLAGS_VERSION;
WREG(PCIER_DEVICE_CTL, cfg->pcie_device_ctl);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ENDPOINT ||
cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT)
WREG(PCIER_LINK_CTL, cfg->pcie_link_ctl);
if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
(cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT &&
(cfg->pcie_flags & PCIEM_FLAGS_SLOT))))
WREG(PCIER_SLOT_CTL, cfg->pcie_slot_ctl);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ROOT_EC)
WREG(PCIER_ROOT_CTL, cfg->pcie_root_ctl);
if (version > 1) {
WREG(PCIER_DEVICE_CTL2, cfg->pcie_device_ctl2);
WREG(PCIER_LINK_CTL2, cfg->pcie_link_ctl2);
WREG(PCIER_SLOT_CTL2, cfg->pcie_slot_ctl2);
}
#undef WREG
}
static void
pci_cfg_restore_pcix(device_t dev, struct pci_devinfo *dinfo)
{
pci_write_config(dev, dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND,
dinfo->cfg.pcix.pcix_command, 2);
}
void
pci_cfg_restore(device_t dev, struct pci_devinfo *dinfo)
{
/*
* Restore the device to full power mode. We must do this
* before we restore the registers because moving from D3 to
* D0 will cause the chip's BARs and some other registers to
* be reset to some unknown power on reset values. Cut down
* the noise on boot by doing nothing if we are already in
* state D0.
*/
if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0)
pci_set_powerstate(dev, PCI_POWERSTATE_D0);
pci_write_config(dev, PCIR_COMMAND, dinfo->cfg.cmdreg, 2);
pci_write_config(dev, PCIR_INTLINE, dinfo->cfg.intline, 1);
pci_write_config(dev, PCIR_INTPIN, dinfo->cfg.intpin, 1);
pci_write_config(dev, PCIR_CACHELNSZ, dinfo->cfg.cachelnsz, 1);
pci_write_config(dev, PCIR_LATTIMER, dinfo->cfg.lattimer, 1);
pci_write_config(dev, PCIR_PROGIF, dinfo->cfg.progif, 1);
pci_write_config(dev, PCIR_REVID, dinfo->cfg.revid, 1);
switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
pci_write_config(dev, PCIR_MINGNT, dinfo->cfg.mingnt, 1);
pci_write_config(dev, PCIR_MAXLAT, dinfo->cfg.maxlat, 1);
break;
case PCIM_HDRTYPE_BRIDGE:
pci_write_config(dev, PCIR_SECLAT_1,
dinfo->cfg.bridge.br_seclat, 1);
pci_write_config(dev, PCIR_SUBBUS_1,
dinfo->cfg.bridge.br_subbus, 1);
pci_write_config(dev, PCIR_SECBUS_1,
dinfo->cfg.bridge.br_secbus, 1);
pci_write_config(dev, PCIR_PRIBUS_1,
dinfo->cfg.bridge.br_pribus, 1);
pci_write_config(dev, PCIR_BRIDGECTL_1,
dinfo->cfg.bridge.br_control, 2);
break;
case PCIM_HDRTYPE_CARDBUS:
pci_write_config(dev, PCIR_SECLAT_2,
dinfo->cfg.bridge.br_seclat, 1);
pci_write_config(dev, PCIR_SUBBUS_2,
dinfo->cfg.bridge.br_subbus, 1);
pci_write_config(dev, PCIR_SECBUS_2,
dinfo->cfg.bridge.br_secbus, 1);
pci_write_config(dev, PCIR_PRIBUS_2,
dinfo->cfg.bridge.br_pribus, 1);
pci_write_config(dev, PCIR_BRIDGECTL_2,
dinfo->cfg.bridge.br_control, 2);
break;
}
pci_restore_bars(dev);
/*
* Restore extended capabilities for PCI-Express and PCI-X
*/
if (dinfo->cfg.pcie.pcie_location != 0)
pci_cfg_restore_pcie(dev, dinfo);
if (dinfo->cfg.pcix.pcix_location != 0)
pci_cfg_restore_pcix(dev, dinfo);
/* Restore MSI and MSI-X configurations if they are present. */
if (dinfo->cfg.msi.msi_location != 0)
pci_resume_msi(dev);
if (dinfo->cfg.msix.msix_location != 0)
pci_resume_msix(dev);
#ifdef PCI_IOV
if (dinfo->cfg.iov != NULL)
pci_iov_cfg_restore(dev, dinfo);
#endif
}
static void
pci_cfg_save_pcie(device_t dev, struct pci_devinfo *dinfo)
{
#define RREG(n) pci_read_config(dev, pos + (n), 2)
struct pcicfg_pcie *cfg;
int version, pos;
cfg = &dinfo->cfg.pcie;
pos = cfg->pcie_location;
cfg->pcie_flags = RREG(PCIER_FLAGS);
version = cfg->pcie_flags & PCIEM_FLAGS_VERSION;
cfg->pcie_device_ctl = RREG(PCIER_DEVICE_CTL);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ENDPOINT ||
cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT)
cfg->pcie_link_ctl = RREG(PCIER_LINK_CTL);
if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
(cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT &&
(cfg->pcie_flags & PCIEM_FLAGS_SLOT))))
cfg->pcie_slot_ctl = RREG(PCIER_SLOT_CTL);
if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT ||
cfg->pcie_type == PCIEM_TYPE_ROOT_EC)
cfg->pcie_root_ctl = RREG(PCIER_ROOT_CTL);
if (version > 1) {
cfg->pcie_device_ctl2 = RREG(PCIER_DEVICE_CTL2);
cfg->pcie_link_ctl2 = RREG(PCIER_LINK_CTL2);
cfg->pcie_slot_ctl2 = RREG(PCIER_SLOT_CTL2);
}
#undef RREG
}
static void
pci_cfg_save_pcix(device_t dev, struct pci_devinfo *dinfo)
{
dinfo->cfg.pcix.pcix_command = pci_read_config(dev,
dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND, 2);
}
void
pci_cfg_save(device_t dev, struct pci_devinfo *dinfo, int setstate)
{
uint32_t cls;
int ps;
/*
* Some drivers apparently write to these registers w/o updating our
* cached copy. No harm happens if we update the copy, so do so here
* so we can restore them. The COMMAND register is modified by the
* bus w/o updating the cache. This should represent the normally
* writable portion of the 'defined' part of type 0/1/2 headers.
*/
dinfo->cfg.vendor = pci_read_config(dev, PCIR_VENDOR, 2);
dinfo->cfg.device = pci_read_config(dev, PCIR_DEVICE, 2);
dinfo->cfg.cmdreg = pci_read_config(dev, PCIR_COMMAND, 2);
dinfo->cfg.intline = pci_read_config(dev, PCIR_INTLINE, 1);
dinfo->cfg.intpin = pci_read_config(dev, PCIR_INTPIN, 1);
dinfo->cfg.cachelnsz = pci_read_config(dev, PCIR_CACHELNSZ, 1);
dinfo->cfg.lattimer = pci_read_config(dev, PCIR_LATTIMER, 1);
dinfo->cfg.baseclass = pci_read_config(dev, PCIR_CLASS, 1);
dinfo->cfg.subclass = pci_read_config(dev, PCIR_SUBCLASS, 1);
dinfo->cfg.progif = pci_read_config(dev, PCIR_PROGIF, 1);
dinfo->cfg.revid = pci_read_config(dev, PCIR_REVID, 1);
switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_NORMAL:
dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_0, 2);
dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_0, 2);
dinfo->cfg.mingnt = pci_read_config(dev, PCIR_MINGNT, 1);
dinfo->cfg.maxlat = pci_read_config(dev, PCIR_MAXLAT, 1);
break;
case PCIM_HDRTYPE_BRIDGE:
dinfo->cfg.bridge.br_seclat = pci_read_config(dev,
PCIR_SECLAT_1, 1);
dinfo->cfg.bridge.br_subbus = pci_read_config(dev,
PCIR_SUBBUS_1, 1);
dinfo->cfg.bridge.br_secbus = pci_read_config(dev,
PCIR_SECBUS_1, 1);
dinfo->cfg.bridge.br_pribus = pci_read_config(dev,
PCIR_PRIBUS_1, 1);
dinfo->cfg.bridge.br_control = pci_read_config(dev,
PCIR_BRIDGECTL_1, 2);
break;
case PCIM_HDRTYPE_CARDBUS:
dinfo->cfg.bridge.br_seclat = pci_read_config(dev,
PCIR_SECLAT_2, 1);
dinfo->cfg.bridge.br_subbus = pci_read_config(dev,
PCIR_SUBBUS_2, 1);
dinfo->cfg.bridge.br_secbus = pci_read_config(dev,
PCIR_SECBUS_2, 1);
dinfo->cfg.bridge.br_pribus = pci_read_config(dev,
PCIR_PRIBUS_2, 1);
dinfo->cfg.bridge.br_control = pci_read_config(dev,
PCIR_BRIDGECTL_2, 2);
dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_2, 2);
dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_2, 2);
break;
}
if (dinfo->cfg.pcie.pcie_location != 0)
pci_cfg_save_pcie(dev, dinfo);
if (dinfo->cfg.pcix.pcix_location != 0)
pci_cfg_save_pcix(dev, dinfo);
#ifdef PCI_IOV
if (dinfo->cfg.iov != NULL)
pci_iov_cfg_save(dev, dinfo);
#endif
/*
* Don't set the power state for display devices, base peripherals and
* memory devices, since bad things happen when they are powered down.
* We should (a) have drivers that can easily detach and (b) use
* generic drivers for these devices so that some device actually
* attaches. We need to make sure that when we implement (a) we don't
* power the device down on a reattach.
*/
cls = pci_get_class(dev);
if (!setstate)
return;
switch (pci_do_power_nodriver)
{
case 0: /* NO powerdown at all */
return;
case 1: /* Conservative about what to power down */
if (cls == PCIC_STORAGE)
return;
/*FALLTHROUGH*/
case 2: /* Aggressive about what to power down */
if (cls == PCIC_DISPLAY || cls == PCIC_MEMORY ||
cls == PCIC_BASEPERIPH)
return;
/*FALLTHROUGH*/
case 3: /* Power down everything */
break;
}
/*
* PCI spec says we can only go into D3 state from D0 state.
* Transition from D[12] into D0 before going to D3 state.
*/
ps = pci_get_powerstate(dev);
if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3)
pci_set_powerstate(dev, PCI_POWERSTATE_D0);
if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3)
pci_set_powerstate(dev, PCI_POWERSTATE_D3);
}
/* Wrapper APIs suitable for device driver use. */
void
pci_save_state(device_t dev)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
pci_cfg_save(dev, dinfo, 0);
}
void
pci_restore_state(device_t dev)
{
struct pci_devinfo *dinfo;
dinfo = device_get_ivars(dev);
pci_cfg_restore(dev, dinfo);
}
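/*
 * Editorial note: the two wrappers above are the interface leaf
 * drivers are expected to use.  A minimal sketch of that usage
 * follows; the "foo" driver and its methods are hypothetical, only
 * pci_save_state(), pci_restore_state(), bus_generic_suspend() and
 * bus_generic_resume() are real APIs.
 */
static int
foo_suspend(device_t dev)
{

	/* Snapshot the config header, MSI/MSI-X and PCI-e/PCI-X state. */
	pci_save_state(dev);
	return (bus_generic_suspend(dev));
}

static int
foo_resume(device_t dev)
{

	/* Reprogram BARs, command register and saved capabilities. */
	pci_restore_state(dev);
	return (bus_generic_resume(dev));
}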
static int
pci_get_id_method(device_t dev, device_t child, enum pci_id_type type,
uintptr_t *id)
{
return (PCIB_GET_ID(device_get_parent(dev), child, type, id));
}
/* Find the upstream port of a given PCI device in a root complex. */
device_t
pci_find_pcie_root_port(device_t dev)
{
struct pci_devinfo *dinfo;
devclass_t pci_class;
device_t pcib, bus;
pci_class = devclass_find("pci");
KASSERT(device_get_devclass(device_get_parent(dev)) == pci_class,
("%s: non-pci device %s", __func__, device_get_nameunit(dev)));
/*
* Walk the bridge hierarchy until we find a PCI-e root
* port or a non-PCI device.
*/
for (;;) {
bus = device_get_parent(dev);
KASSERT(bus != NULL, ("%s: null parent of %s", __func__,
device_get_nameunit(dev)));
pcib = device_get_parent(bus);
KASSERT(pcib != NULL, ("%s: null bridge of %s", __func__,
device_get_nameunit(bus)));
/*
* pcib's parent must be a PCI bus for this to be a
* PCI-PCI bridge.
*/
if (device_get_devclass(device_get_parent(pcib)) != pci_class)
return (NULL);
dinfo = device_get_ivars(pcib);
if (dinfo->cfg.pcie.pcie_location != 0 &&
dinfo->cfg.pcie.pcie_type == PCIEM_TYPE_ROOT_PORT)
return (pcib);
dev = pcib;
}
}
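/*
 * Editorial note: an illustrative (hypothetical) caller of
 * pci_find_pcie_root_port(), e.g. to inspect the upstream link of the
 * root port a device sits under.  pcie_read_config() and
 * PCIER_LINK_STA are real; "foo_check_root_port" is made up.
 */
static void
foo_check_root_port(device_t dev)
{
	device_t rp;
	uint16_t lnksta;

	rp = pci_find_pcie_root_port(dev);
	if (rp == NULL) {
		device_printf(dev, "no PCI-e root port above this device\n");
		return;
	}
	lnksta = pcie_read_config(rp, PCIER_LINK_STA, 2);
	device_printf(dev, "root port %s link status 0x%04x\n",
	    device_get_nameunit(rp), lnksta);
}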
/*
* Wait for pending transactions to complete on a PCI-express function.
*
* The maximum delay is specified in milliseconds in max_delay. Note
* that this function may sleep.
*
* Returns true if the function is idle and false if the timeout is
* exceeded. If dev is not a PCI-express function, this returns true.
*/
bool
pcie_wait_for_pending_transactions(device_t dev, u_int max_delay)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint16_t sta;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (true);
sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
while (sta & PCIEM_STA_TRANSACTION_PND) {
if (max_delay == 0)
return (false);
/* Poll once every 100 milliseconds up to the timeout. */
if (max_delay > 100) {
pause_sbt("pcietp", 100 * SBT_1MS, 0, C_HARDCLOCK);
max_delay -= 100;
} else {
pause_sbt("pcietp", max_delay * SBT_1MS, 0,
C_HARDCLOCK);
max_delay = 0;
}
sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
}
return (true);
}
/*
* Determine the maximum Completion Timeout in microseconds.
*
* For non-PCI-express functions this returns 0.
*/
int
pcie_get_max_completion_timeout(device_t dev)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (0);
/*
* Functions using the 1.x spec use the default timeout range of
* 50 microseconds to 50 milliseconds. Functions that do not
* support programmable timeouts also use this range.
*/
if ((dinfo->cfg.pcie.pcie_flags & PCIEM_FLAGS_VERSION) < 2 ||
(pci_read_config(dev, cap + PCIER_DEVICE_CAP2, 4) &
PCIEM_CAP2_COMP_TIMO_RANGES) == 0)
return (50 * 1000);
switch (pci_read_config(dev, cap + PCIER_DEVICE_CTL2, 2) &
PCIEM_CTL2_COMP_TIMO_VAL) {
case PCIEM_CTL2_COMP_TIMO_100US:
return (100);
case PCIEM_CTL2_COMP_TIMO_10MS:
return (10 * 1000);
case PCIEM_CTL2_COMP_TIMO_55MS:
return (55 * 1000);
case PCIEM_CTL2_COMP_TIMO_210MS:
return (210 * 1000);
case PCIEM_CTL2_COMP_TIMO_900MS:
return (900 * 1000);
case PCIEM_CTL2_COMP_TIMO_3500MS:
return (3500 * 1000);
case PCIEM_CTL2_COMP_TIMO_13S:
return (13 * 1000 * 1000);
case PCIEM_CTL2_COMP_TIMO_64S:
return (64 * 1000 * 1000);
default:
return (50 * 1000);
}
}
/*
* Perform a Function Level Reset (FLR) on a device.
*
* This function first waits for any pending transactions to complete
* within the timeout specified by max_delay. If transactions are
* still pending, the function will return false without attempting a
* reset.
*
* If dev is not a PCI-express function or does not support FLR, this
* function returns false.
*
* Note that no registers are saved or restored. The caller is
* responsible for saving and restoring any registers including
* PCI-standard registers via pci_save_state() and
* pci_restore_state().
*/
bool
pcie_flr(device_t dev, u_int max_delay, bool force)
{
struct pci_devinfo *dinfo = device_get_ivars(dev);
uint16_t cmd, ctl;
int compl_delay;
int cap;
cap = dinfo->cfg.pcie.pcie_location;
if (cap == 0)
return (false);
if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR))
return (false);
/*
* Disable busmastering to prevent generation of new
* transactions while waiting for the device to go idle. If
* the idle timeout fails, the command register is restored
* which will re-enable busmastering.
*/
cmd = pci_read_config(dev, PCIR_COMMAND, 2);
pci_write_config(dev, PCIR_COMMAND, cmd & ~(PCIM_CMD_BUSMASTEREN), 2);
if (!pcie_wait_for_pending_transactions(dev, max_delay)) {
if (!force) {
pci_write_config(dev, PCIR_COMMAND, cmd, 2);
return (false);
}
pci_printf(&dinfo->cfg,
"Resetting with transactions pending after %d ms\n",
max_delay);
/*
* Extend the post-FLR delay to cover the maximum
* Completion Timeout delay of anything in flight
* during the FLR delay. Enforce a minimum delay of
* at least 10ms.
*/
compl_delay = pcie_get_max_completion_timeout(dev) / 1000;
if (compl_delay < 10)
compl_delay = 10;
} else
compl_delay = 0;
/* Initiate the reset. */
ctl = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
pci_write_config(dev, cap + PCIER_DEVICE_CTL, ctl |
PCIEM_CTL_INITIATE_FLR, 2);
/* Wait for 100ms. */
pause_sbt("pcieflr", (100 + compl_delay) * SBT_1MS, 0, C_HARDCLOCK);
if (pci_read_config(dev, cap + PCIER_DEVICE_STA, 2) &
PCIEM_STA_TRANSACTION_PND)
pci_printf(&dinfo->cfg, "Transactions pending after FLR!\n");
return (true);
}
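/*
 * Editorial note: a sketch of a driver-initiated FLR.  As the comment
 * above pcie_flr() states, no registers are preserved, so the caller
 * brackets the reset with pci_save_state()/pci_restore_state().  The
 * 100 ms drain budget and "foo_reset" are arbitrary example choices.
 */
static int
foo_reset(device_t dev)
{

	pci_save_state(dev);
	if (!pcie_flr(dev, 100, false)) {
		/* No FLR capability, or transactions never drained. */
		pci_restore_state(dev);
		return (ENXIO);
	}
	pci_restore_state(dev);
	return (0);
}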
Index: head/sys/dev/pci/pci_pci.c
===================================================================
--- head/sys/dev/pci/pci_pci.c (revision 327172)
+++ head/sys/dev/pci/pci_pci.c (revision 327173)
@@ -1,2898 +1,2894 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1994,1995 Stefan Esser, Wolfgang StanglMeier
* Copyright (c) 2000 Michael Smith <msmith@freebsd.org>
* Copyright (c) 2000 BSDi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* PCI:PCI bridge support.
*/
#include "opt_pci.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pci_private.h>
#include <dev/pci/pcib_private.h>
#include "pcib_if.h"
static int pcib_probe(device_t dev);
static int pcib_suspend(device_t dev);
static int pcib_resume(device_t dev);
static int pcib_power_for_sleep(device_t pcib, device_t dev,
int *pstate);
static int pcib_ari_get_id(device_t pcib, device_t dev,
enum pci_id_type type, uintptr_t *id);
static uint32_t pcib_read_config(device_t dev, u_int b, u_int s,
u_int f, u_int reg, int width);
static void pcib_write_config(device_t dev, u_int b, u_int s,
u_int f, u_int reg, uint32_t val, int width);
static int pcib_ari_maxslots(device_t dev);
static int pcib_ari_maxfuncs(device_t dev);
static int pcib_try_enable_ari(device_t pcib, device_t dev);
static int pcib_ari_enabled(device_t pcib);
static void pcib_ari_decode_rid(device_t pcib, uint16_t rid,
int *bus, int *slot, int *func);
#ifdef PCI_HP
static void pcib_pcie_ab_timeout(void *arg);
static void pcib_pcie_cc_timeout(void *arg);
static void pcib_pcie_dll_timeout(void *arg);
#endif
static int pcib_request_feature_default(device_t pcib, device_t dev,
enum pci_feature feature);
static device_method_t pcib_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, pcib_probe),
DEVMETHOD(device_attach, pcib_attach),
DEVMETHOD(device_detach, pcib_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, pcib_suspend),
DEVMETHOD(device_resume, pcib_resume),
/* Bus interface */
DEVMETHOD(bus_child_present, pcib_child_present),
DEVMETHOD(bus_read_ivar, pcib_read_ivar),
DEVMETHOD(bus_write_ivar, pcib_write_ivar),
DEVMETHOD(bus_alloc_resource, pcib_alloc_resource),
#ifdef NEW_PCIB
DEVMETHOD(bus_adjust_resource, pcib_adjust_resource),
DEVMETHOD(bus_release_resource, pcib_release_resource),
#else
DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource),
DEVMETHOD(bus_release_resource, bus_generic_release_resource),
#endif
DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
/* pcib interface */
DEVMETHOD(pcib_maxslots, pcib_ari_maxslots),
DEVMETHOD(pcib_maxfuncs, pcib_ari_maxfuncs),
DEVMETHOD(pcib_read_config, pcib_read_config),
DEVMETHOD(pcib_write_config, pcib_write_config),
DEVMETHOD(pcib_route_interrupt, pcib_route_interrupt),
DEVMETHOD(pcib_alloc_msi, pcib_alloc_msi),
DEVMETHOD(pcib_release_msi, pcib_release_msi),
DEVMETHOD(pcib_alloc_msix, pcib_alloc_msix),
DEVMETHOD(pcib_release_msix, pcib_release_msix),
DEVMETHOD(pcib_map_msi, pcib_map_msi),
DEVMETHOD(pcib_power_for_sleep, pcib_power_for_sleep),
DEVMETHOD(pcib_get_id, pcib_ari_get_id),
DEVMETHOD(pcib_try_enable_ari, pcib_try_enable_ari),
DEVMETHOD(pcib_ari_enabled, pcib_ari_enabled),
DEVMETHOD(pcib_decode_rid, pcib_ari_decode_rid),
DEVMETHOD(pcib_request_feature, pcib_request_feature_default),
DEVMETHOD_END
};
static devclass_t pcib_devclass;
DEFINE_CLASS_0(pcib, pcib_driver, pcib_methods, sizeof(struct pcib_softc));
DRIVER_MODULE(pcib, pci, pcib_driver, pcib_devclass, NULL, NULL);
#if defined(NEW_PCIB) || defined(PCI_HP)
SYSCTL_DECL(_hw_pci);
#endif
#ifdef NEW_PCIB
static int pci_clear_pcib;
SYSCTL_INT(_hw_pci, OID_AUTO, clear_pcib, CTLFLAG_RDTUN, &pci_clear_pcib, 0,
"Clear firmware-assigned resources for PCI-PCI bridge I/O windows.");
/*
* Is a resource from a child device sub-allocated from one of our
* resource managers?
*/
static int
pcib_is_resource_managed(struct pcib_softc *sc, int type, struct resource *r)
{
switch (type) {
#ifdef PCI_RES_BUS
case PCI_RES_BUS:
return (rman_is_region_manager(r, &sc->bus.rman));
#endif
case SYS_RES_IOPORT:
return (rman_is_region_manager(r, &sc->io.rman));
case SYS_RES_MEMORY:
/* Prefetchable resources may live in either memory rman. */
if (rman_get_flags(r) & RF_PREFETCHABLE &&
rman_is_region_manager(r, &sc->pmem.rman))
return (1);
return (rman_is_region_manager(r, &sc->mem.rman));
}
return (0);
}
static int
pcib_is_window_open(struct pcib_window *pw)
{
return (pw->valid && pw->base < pw->limit);
}
/*
* XXX: If RF_ACTIVE did not also imply allocating a bus space tag and
* handle for the resource, we could pass RF_ACTIVE up to the PCI bus
* when allocating the resource windows and rely on the PCI bus driver
* to do this for us.
*/
static void
pcib_activate_window(struct pcib_softc *sc, int type)
{
PCI_ENABLE_IO(device_get_parent(sc->dev), sc->dev, type);
}
static void
pcib_write_windows(struct pcib_softc *sc, int mask)
{
device_t dev;
uint32_t val;
dev = sc->dev;
if (sc->io.valid && mask & WIN_IO) {
val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
pci_write_config(dev, PCIR_IOBASEH_1,
sc->io.base >> 16, 2);
pci_write_config(dev, PCIR_IOLIMITH_1,
sc->io.limit >> 16, 2);
}
pci_write_config(dev, PCIR_IOBASEL_1, sc->io.base >> 8, 1);
pci_write_config(dev, PCIR_IOLIMITL_1, sc->io.limit >> 8, 1);
}
if (mask & WIN_MEM) {
pci_write_config(dev, PCIR_MEMBASE_1, sc->mem.base >> 16, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, sc->mem.limit >> 16, 2);
}
if (sc->pmem.valid && mask & WIN_PMEM) {
val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
pci_write_config(dev, PCIR_PMBASEH_1,
sc->pmem.base >> 32, 4);
pci_write_config(dev, PCIR_PMLIMITH_1,
sc->pmem.limit >> 32, 4);
}
pci_write_config(dev, PCIR_PMBASEL_1, sc->pmem.base >> 16, 2);
pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmem.limit >> 16, 2);
}
}
/*
* This is used to reject I/O port allocations that conflict with an
* ISA alias range.
*/
static int
pcib_is_isa_range(struct pcib_softc *sc, rman_res_t start, rman_res_t end,
rman_res_t count)
{
rman_res_t next_alias;
if (!(sc->bridgectl & PCIB_BCR_ISA_ENABLE))
return (0);
/* Only check fixed ranges for overlap. */
if (start + count - 1 != end)
return (0);
/* ISA aliases are only in the lower 64KB of I/O space. */
if (start >= 65536)
return (0);
/* Check for overlap with 0x000 - 0x0ff as a special case. */
if (start < 0x100)
goto alias;
/*
* If the start address is an alias, the range is an alias.
* Otherwise, compute the start of the next alias range and
* check if it is before the end of the candidate range.
*/
if ((start & 0x300) != 0)
goto alias;
next_alias = (start & ~0x3fful) | 0x100;
if (next_alias <= end)
goto alias;
return (0);
alias:
if (bootverbose)
device_printf(sc->dev,
"I/O range %#jx-%#jx overlaps with an ISA alias\n", start,
end);
return (1);
}
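/*
 * Worked example (editorial note): with ISA decoding enabled only the
 * first 0x100 bytes of each 0x400-byte block below 64K are usable;
 * offsets 0x100-0x3ff of every block alias legacy ISA ports.  A fixed
 * request for 0x3e8-0x3ef is rejected since (0x3e8 & 0x300) != 0,
 * while 0x4000-0x40ff is accepted because the next alias range starts
 * at 0x4100, past the end of the request.
 */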
static void
pcib_add_window_resources(struct pcib_window *w, struct resource **res,
int count)
{
struct resource **newarray;
int error, i;
newarray = malloc(sizeof(struct resource *) * (w->count + count),
M_DEVBUF, M_WAITOK);
if (w->res != NULL)
bcopy(w->res, newarray, sizeof(struct resource *) * w->count);
bcopy(res, newarray + w->count, sizeof(struct resource *) * count);
free(w->res, M_DEVBUF);
w->res = newarray;
w->count += count;
for (i = 0; i < count; i++) {
error = rman_manage_region(&w->rman, rman_get_start(res[i]),
rman_get_end(res[i]));
if (error)
panic("Failed to add resource to rman");
}
}
typedef void (nonisa_callback)(rman_res_t start, rman_res_t end, void *arg);
static void
pcib_walk_nonisa_ranges(rman_res_t start, rman_res_t end, nonisa_callback *cb,
void *arg)
{
rman_res_t next_end;
/*
* If start is within an ISA alias range, move up to the start
* of the next non-alias range. As a special case, addresses
* in the range 0x000 - 0x0ff should also be skipped since
* those are used for various system I/O devices in ISA
* systems.
*/
if (start <= 65535) {
if (start < 0x100 || (start & 0x300) != 0) {
start &= ~0x3ff;
start += 0x400;
}
}
/* ISA aliases are only in the lower 64KB of I/O space. */
while (start <= MIN(end, 65535)) {
next_end = MIN(start | 0xff, end);
cb(start, next_end, arg);
start += 0x400;
}
if (start <= end)
cb(start, end, arg);
}
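/*
 * Worked example (editorial note): walking 0xf000-0x1ffff invokes the
 * callback on 0xf000-0xf0ff, 0xf400-0xf4ff, ..., 0xfc00-0xfcff (the
 * non-aliased slice of each 0x400-byte block below 64K) and then once
 * on 0x10000-0x1ffff, where ISA aliasing no longer applies.
 */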
static void
count_ranges(rman_res_t start, rman_res_t end, void *arg)
{
int *countp;
countp = arg;
(*countp)++;
}
struct alloc_state {
struct resource **res;
struct pcib_softc *sc;
int count, error;
};
static void
alloc_ranges(rman_res_t start, rman_res_t end, void *arg)
{
struct alloc_state *as;
struct pcib_window *w;
int rid;
as = arg;
if (as->error != 0)
return;
w = &as->sc->io;
rid = w->reg;
if (bootverbose)
device_printf(as->sc->dev,
"allocating non-ISA range %#jx-%#jx\n", start, end);
as->res[as->count] = bus_alloc_resource(as->sc->dev, SYS_RES_IOPORT,
&rid, start, end, end - start + 1, 0);
if (as->res[as->count] == NULL)
as->error = ENXIO;
else
as->count++;
}
static int
pcib_alloc_nonisa_ranges(struct pcib_softc *sc, rman_res_t start, rman_res_t end)
{
struct alloc_state as;
int i, new_count;
/* First, see how many ranges we need. */
new_count = 0;
pcib_walk_nonisa_ranges(start, end, count_ranges, &new_count);
/* Second, allocate the ranges. */
as.res = malloc(sizeof(struct resource *) * new_count, M_DEVBUF,
M_WAITOK);
as.sc = sc;
as.count = 0;
as.error = 0;
pcib_walk_nonisa_ranges(start, end, alloc_ranges, &as);
if (as.error != 0) {
for (i = 0; i < as.count; i++)
bus_release_resource(sc->dev, SYS_RES_IOPORT,
sc->io.reg, as.res[i]);
free(as.res, M_DEVBUF);
return (as.error);
}
KASSERT(as.count == new_count, ("%s: count mismatch", __func__));
/* Third, add the ranges to the window. */
pcib_add_window_resources(&sc->io, as.res, as.count);
free(as.res, M_DEVBUF);
return (0);
}
static void
pcib_alloc_window(struct pcib_softc *sc, struct pcib_window *w, int type,
int flags, pci_addr_t max_address)
{
struct resource *res;
char buf[64];
int error, rid;
if (max_address != (rman_res_t)max_address)
max_address = ~0;
w->rman.rm_start = 0;
w->rman.rm_end = max_address;
w->rman.rm_type = RMAN_ARRAY;
snprintf(buf, sizeof(buf), "%s %s window",
device_get_nameunit(sc->dev), w->name);
w->rman.rm_descr = strdup(buf, M_DEVBUF);
error = rman_init(&w->rman);
if (error)
panic("Failed to initialize %s %s rman",
device_get_nameunit(sc->dev), w->name);
if (!pcib_is_window_open(w))
return;
if (w->base > max_address || w->limit > max_address) {
device_printf(sc->dev,
"initial %s window has too many bits, ignoring\n", w->name);
return;
}
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE)
(void)pcib_alloc_nonisa_ranges(sc, w->base, w->limit);
else {
rid = w->reg;
res = bus_alloc_resource(sc->dev, type, &rid, w->base, w->limit,
w->limit - w->base + 1, flags);
if (res != NULL)
pcib_add_window_resources(w, &res, 1);
}
if (w->res == NULL) {
device_printf(sc->dev,
"failed to allocate initial %s window: %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
w->base = max_address;
w->limit = 0;
pcib_write_windows(sc, w->mask);
return;
}
pcib_activate_window(sc, type);
}
/*
* Initialize I/O windows.
*/
static void
pcib_probe_windows(struct pcib_softc *sc)
{
pci_addr_t max;
device_t dev;
uint32_t val;
dev = sc->dev;
if (pci_clear_pcib) {
pcib_bridge_init(dev);
}
/* Determine if the I/O port window is implemented. */
val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if (val == 0) {
/*
* If 'val' is zero, then only 16-bits of I/O space
* are supported.
*/
pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
if (pci_read_config(dev, PCIR_IOBASEL_1, 1) != 0) {
sc->io.valid = 1;
pci_write_config(dev, PCIR_IOBASEL_1, 0, 1);
}
} else
sc->io.valid = 1;
/* Read the existing I/O port window. */
if (sc->io.valid) {
sc->io.reg = PCIR_IOBASEL_1;
sc->io.step = 12;
sc->io.mask = WIN_IO;
sc->io.name = "I/O port";
if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
sc->io.base = PCI_PPBIOBASE(
pci_read_config(dev, PCIR_IOBASEH_1, 2), val);
sc->io.limit = PCI_PPBIOLIMIT(
pci_read_config(dev, PCIR_IOLIMITH_1, 2),
pci_read_config(dev, PCIR_IOLIMITL_1, 1));
max = 0xffffffff;
} else {
sc->io.base = PCI_PPBIOBASE(0, val);
sc->io.limit = PCI_PPBIOLIMIT(0,
pci_read_config(dev, PCIR_IOLIMITL_1, 1));
max = 0xffff;
}
pcib_alloc_window(sc, &sc->io, SYS_RES_IOPORT, 0, max);
}
/* Read the existing memory window. */
sc->mem.valid = 1;
sc->mem.reg = PCIR_MEMBASE_1;
sc->mem.step = 20;
sc->mem.mask = WIN_MEM;
sc->mem.name = "memory";
sc->mem.base = PCI_PPBMEMBASE(0,
pci_read_config(dev, PCIR_MEMBASE_1, 2));
sc->mem.limit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
pcib_alloc_window(sc, &sc->mem, SYS_RES_MEMORY, 0, 0xffffffff);
/* Determine if the prefetchable memory window is implemented. */
val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if (val == 0) {
/*
* If 'val' is zero, then only 32-bits of memory space
* are supported.
*/
pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
if (pci_read_config(dev, PCIR_PMBASEL_1, 2) != 0) {
sc->pmem.valid = 1;
pci_write_config(dev, PCIR_PMBASEL_1, 0, 2);
}
} else
sc->pmem.valid = 1;
/* Read the existing prefetchable memory window. */
if (sc->pmem.valid) {
sc->pmem.reg = PCIR_PMBASEL_1;
sc->pmem.step = 20;
sc->pmem.mask = WIN_PMEM;
sc->pmem.name = "prefetch";
if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
sc->pmem.base = PCI_PPBMEMBASE(
pci_read_config(dev, PCIR_PMBASEH_1, 4), val);
sc->pmem.limit = PCI_PPBMEMLIMIT(
pci_read_config(dev, PCIR_PMLIMITH_1, 4),
pci_read_config(dev, PCIR_PMLIMITL_1, 2));
max = 0xffffffffffffffff;
} else {
sc->pmem.base = PCI_PPBMEMBASE(0, val);
sc->pmem.limit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_PMLIMITL_1, 2));
max = 0xffffffff;
}
pcib_alloc_window(sc, &sc->pmem, SYS_RES_MEMORY,
RF_PREFETCHABLE, max);
}
}
static void
pcib_release_window(struct pcib_softc *sc, struct pcib_window *w, int type)
{
device_t dev;
int error, i;
if (!w->valid)
return;
dev = sc->dev;
error = rman_fini(&w->rman);
if (error) {
device_printf(dev, "failed to release %s rman\n", w->name);
return;
}
free(__DECONST(char *, w->rman.rm_descr), M_DEVBUF);
for (i = 0; i < w->count; i++) {
error = bus_free_resource(dev, type, w->res[i]);
if (error)
device_printf(dev,
"failed to release %s resource: %d\n", w->name,
error);
}
free(w->res, M_DEVBUF);
}
static void
pcib_free_windows(struct pcib_softc *sc)
{
pcib_release_window(sc, &sc->pmem, SYS_RES_MEMORY);
pcib_release_window(sc, &sc->mem, SYS_RES_MEMORY);
pcib_release_window(sc, &sc->io, SYS_RES_IOPORT);
}
#ifdef PCI_RES_BUS
/*
* Allocate a suitable secondary bus for this bridge if needed and
* initialize the resource manager for the secondary bus range. Note
* that the minimum count is a desired value and this may allocate a
* smaller range.
*/
void
pcib_setup_secbus(device_t dev, struct pcib_secbus *bus, int min_count)
{
char buf[64];
int error, rid, sec_reg;
switch (pci_read_config(dev, PCIR_HDRTYPE, 1) & PCIM_HDRTYPE) {
case PCIM_HDRTYPE_BRIDGE:
sec_reg = PCIR_SECBUS_1;
bus->sub_reg = PCIR_SUBBUS_1;
break;
case PCIM_HDRTYPE_CARDBUS:
sec_reg = PCIR_SECBUS_2;
bus->sub_reg = PCIR_SUBBUS_2;
break;
default:
panic("not a PCI bridge");
}
bus->sec = pci_read_config(dev, sec_reg, 1);
bus->sub = pci_read_config(dev, bus->sub_reg, 1);
bus->dev = dev;
bus->rman.rm_start = 0;
bus->rman.rm_end = PCI_BUSMAX;
bus->rman.rm_type = RMAN_ARRAY;
snprintf(buf, sizeof(buf), "%s bus numbers", device_get_nameunit(dev));
bus->rman.rm_descr = strdup(buf, M_DEVBUF);
error = rman_init(&bus->rman);
if (error)
panic("Failed to initialize %s bus number rman",
device_get_nameunit(dev));
/*
* Allocate a bus range. This will return an existing bus range
* if one exists, or a new bus range if one does not.
*/
rid = 0;
bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
min_count, 0);
if (bus->res == NULL) {
/*
* Fall back to just allocating a range of a single bus
* number.
*/
bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
1, 0);
} else if (rman_get_size(bus->res) < min_count)
/*
* Attempt to grow the existing range to satisfy the
* minimum desired count.
*/
(void)bus_adjust_resource(dev, PCI_RES_BUS, bus->res,
rman_get_start(bus->res), rman_get_start(bus->res) +
min_count - 1);
/*
* Add the initial resource to the rman.
*/
if (bus->res != NULL) {
error = rman_manage_region(&bus->rman, rman_get_start(bus->res),
rman_get_end(bus->res));
if (error)
panic("Failed to add resource to rman");
bus->sec = rman_get_start(bus->res);
bus->sub = rman_get_end(bus->res);
}
}
void
pcib_free_secbus(device_t dev, struct pcib_secbus *bus)
{
int error;
error = rman_fini(&bus->rman);
if (error) {
device_printf(dev, "failed to release bus number rman\n");
return;
}
free(__DECONST(char *, bus->rman.rm_descr), M_DEVBUF);
error = bus_free_resource(dev, PCI_RES_BUS, bus->res);
if (error)
device_printf(dev,
"failed to release bus numbers resource: %d\n", error);
}
static struct resource *
pcib_suballoc_bus(struct pcib_secbus *bus, device_t child, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
res = rman_reserve_resource(&bus->rman, start, end, count, flags,
child);
if (res == NULL)
return (NULL);
if (bootverbose)
device_printf(bus->dev,
"allocated bus range (%ju-%ju) for rid %d of %s\n",
rman_get_start(res), rman_get_end(res), *rid,
pcib_child_name(child));
rman_set_rid(res, *rid);
return (res);
}
/*
* Attempt to grow the secondary bus range. This is much simpler than
* for I/O windows as the range can only be grown by increasing
* subbus.
*/
static int
pcib_grow_subbus(struct pcib_secbus *bus, rman_res_t new_end)
{
rman_res_t old_end;
int error;
old_end = rman_get_end(bus->res);
KASSERT(new_end > old_end, ("attempt to shrink subbus"));
error = bus_adjust_resource(bus->dev, PCI_RES_BUS, bus->res,
rman_get_start(bus->res), new_end);
if (error)
return (error);
if (bootverbose)
device_printf(bus->dev, "grew bus range to %ju-%ju\n",
rman_get_start(bus->res), rman_get_end(bus->res));
error = rman_manage_region(&bus->rman, old_end + 1,
rman_get_end(bus->res));
if (error)
panic("Failed to add resource to rman");
bus->sub = rman_get_end(bus->res);
pci_write_config(bus->dev, bus->sub_reg, bus->sub, 1);
return (0);
}
struct resource *
pcib_alloc_subbus(struct pcib_secbus *bus, device_t child, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
rman_res_t start_free, end_free, new_end;
/*
* First, see if the request can be satisfied by the existing
* bus range.
*/
res = pcib_suballoc_bus(bus, child, rid, start, end, count, flags);
if (res != NULL)
return (res);
/*
* Figure out a range to grow the bus range. First, find the
* first bus number after the last allocated bus in the rman and
* enforce that as a minimum starting point for the range.
*/
if (rman_last_free_region(&bus->rman, &start_free, &end_free) != 0 ||
end_free != bus->sub)
start_free = bus->sub + 1;
if (start_free < start)
start_free = start;
new_end = start_free + count - 1;
/*
* See if this new range would satisfy the request if it
* succeeds.
*/
if (new_end > end)
return (NULL);
/* Finally, attempt to grow the existing resource. */
if (bootverbose) {
device_printf(bus->dev,
"attempting to grow bus range for %ju buses\n", count);
printf("\tback candidate range: %ju-%ju\n", start_free,
new_end);
}
if (pcib_grow_subbus(bus, new_end) == 0)
return (pcib_suballoc_bus(bus, child, rid, start, end, count,
flags));
return (NULL);
}
#endif
#else
/*
* Is the prefetch window open (e.g., can we allocate memory in it?)
*/
static int
pcib_is_prefetch_open(struct pcib_softc *sc)
{
return (sc->pmembase > 0 && sc->pmembase < sc->pmemlimit);
}
/*
* Is the nonprefetch window open (e.g., can we allocate memory in it?)
*/
static int
pcib_is_nonprefetch_open(struct pcib_softc *sc)
{
return (sc->membase > 0 && sc->membase < sc->memlimit);
}
/*
* Is the I/O window open (e.g., can we allocate ports in it?)
*/
static int
pcib_is_io_open(struct pcib_softc *sc)
{
return (sc->iobase > 0 && sc->iobase < sc->iolimit);
}
/*
* Get current I/O decode.
*/
static void
pcib_get_io_decode(struct pcib_softc *sc)
{
device_t dev;
uint32_t iolow;
dev = sc->dev;
iolow = pci_read_config(dev, PCIR_IOBASEL_1, 1);
if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
sc->iobase = PCI_PPBIOBASE(
pci_read_config(dev, PCIR_IOBASEH_1, 2), iolow);
else
sc->iobase = PCI_PPBIOBASE(0, iolow);
iolow = pci_read_config(dev, PCIR_IOLIMITL_1, 1);
if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
sc->iolimit = PCI_PPBIOLIMIT(
pci_read_config(dev, PCIR_IOLIMITH_1, 2), iolow);
else
sc->iolimit = PCI_PPBIOLIMIT(0, iolow);
}
/*
* Get current memory decode.
*/
static void
pcib_get_mem_decode(struct pcib_softc *sc)
{
device_t dev;
pci_addr_t pmemlow;
dev = sc->dev;
sc->membase = PCI_PPBMEMBASE(0,
pci_read_config(dev, PCIR_MEMBASE_1, 2));
sc->memlimit = PCI_PPBMEMLIMIT(0,
pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
pmemlow = pci_read_config(dev, PCIR_PMBASEL_1, 2);
if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
sc->pmembase = PCI_PPBMEMBASE(
pci_read_config(dev, PCIR_PMBASEH_1, 4), pmemlow);
else
sc->pmembase = PCI_PPBMEMBASE(0, pmemlow);
pmemlow = pci_read_config(dev, PCIR_PMLIMITL_1, 2);
if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
sc->pmemlimit = PCI_PPBMEMLIMIT(
pci_read_config(dev, PCIR_PMLIMITH_1, 4), pmemlow);
else
sc->pmemlimit = PCI_PPBMEMLIMIT(0, pmemlow);
}
/*
* Restore previous I/O decode.
*/
static void
pcib_set_io_decode(struct pcib_softc *sc)
{
device_t dev;
uint32_t iohi;
dev = sc->dev;
iohi = sc->iobase >> 16;
if (iohi > 0)
pci_write_config(dev, PCIR_IOBASEH_1, iohi, 2);
pci_write_config(dev, PCIR_IOBASEL_1, sc->iobase >> 8, 1);
iohi = sc->iolimit >> 16;
if (iohi > 0)
pci_write_config(dev, PCIR_IOLIMITH_1, iohi, 2);
pci_write_config(dev, PCIR_IOLIMITL_1, sc->iolimit >> 8, 1);
}
/*
* Restore previous memory decode.
*/
static void
pcib_set_mem_decode(struct pcib_softc *sc)
{
device_t dev;
pci_addr_t pmemhi;
dev = sc->dev;
pci_write_config(dev, PCIR_MEMBASE_1, sc->membase >> 16, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, sc->memlimit >> 16, 2);
pmemhi = sc->pmembase >> 32;
if (pmemhi > 0)
pci_write_config(dev, PCIR_PMBASEH_1, pmemhi, 4);
pci_write_config(dev, PCIR_PMBASEL_1, sc->pmembase >> 16, 2);
pmemhi = sc->pmemlimit >> 32;
if (pmemhi > 0)
pci_write_config(dev, PCIR_PMLIMITH_1, pmemhi, 4);
pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmemlimit >> 16, 2);
}
#endif
#ifdef PCI_HP
/*
* PCI-express HotPlug support.
*/
static int pci_enable_pcie_hp = 1;
SYSCTL_INT(_hw_pci, OID_AUTO, enable_pcie_hp, CTLFLAG_RDTUN,
&pci_enable_pcie_hp, 0,
"Enable support for native PCI-express HotPlug.");
static void
pcib_probe_hotplug(struct pcib_softc *sc)
{
device_t dev;
uint32_t link_cap;
uint16_t link_sta, slot_sta;
if (!pci_enable_pcie_hp)
return;
dev = sc->dev;
if (pci_find_cap(dev, PCIY_EXPRESS, NULL) != 0)
return;
if (!(pcie_read_config(dev, PCIER_FLAGS, 2) & PCIEM_FLAGS_SLOT))
return;
sc->pcie_slot_cap = pcie_read_config(dev, PCIER_SLOT_CAP, 4);
if ((sc->pcie_slot_cap & PCIEM_SLOT_CAP_HPC) == 0)
return;
link_cap = pcie_read_config(dev, PCIER_LINK_CAP, 4);
if ((link_cap & PCIEM_LINK_CAP_DL_ACTIVE) == 0)
return;
/*
* Some devices report that they have an MRL when they actually
* do not. Since they always report that the MRL is open, child
* devices would be ignored. Try to detect these devices and
* ignore their claim of HotPlug support.
*
* If there is an open MRL but the Data Link Layer is active,
* the MRL is not real.
*/
if ((sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP) != 0) {
link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
if ((slot_sta & PCIEM_SLOT_STA_MRLSS) != 0 &&
(link_sta & PCIEM_LINK_STA_DL_ACTIVE) != 0) {
return;
}
}
/*
* Now that we're sure we want to do hot plug, ask the
* firmware, if any, if that's OK.
*/
if (pcib_request_feature(dev, PCI_FEATURE_HP) != 0) {
if (bootverbose)
device_printf(dev, "Unable to activate hot plug feature.\n");
return;
}
sc->flags |= PCIB_HOTPLUG;
}
/*
* Send a HotPlug command to the slot control register. If this slot
* uses command completion interrupts and a previous command is still
* in progress, then the command is dropped. Once the previous
* command completes or times out, pcib_pcie_hotplug_update() will be
* invoked to post a new command based on the slot's state at that
* time.
*/
static void
pcib_pcie_hotplug_command(struct pcib_softc *sc, uint16_t val, uint16_t mask)
{
device_t dev;
uint16_t ctl, new;
dev = sc->dev;
if (sc->flags & PCIB_HOTPLUG_CMD_PENDING)
return;
ctl = pcie_read_config(dev, PCIER_SLOT_CTL, 2);
new = (ctl & ~mask) | val;
if (new == ctl)
return;
if (bootverbose)
device_printf(dev, "HotPlug command: %04x -> %04x\n", ctl, new);
pcie_write_config(dev, PCIER_SLOT_CTL, new, 2);
if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS) &&
(ctl & new) & PCIEM_SLOT_CTL_CCIE) {
sc->flags |= PCIB_HOTPLUG_CMD_PENDING;
if (!cold)
callout_reset(&sc->pcie_cc_timer, hz,
pcib_pcie_cc_timeout, sc);
}
}
static void
pcib_pcie_hotplug_command_completed(struct pcib_softc *sc)
{
device_t dev;
dev = sc->dev;
if (bootverbose)
device_printf(dev, "Command Completed\n");
if (!(sc->flags & PCIB_HOTPLUG_CMD_PENDING))
return;
callout_stop(&sc->pcie_cc_timer);
sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
wakeup(sc);
}
/*
* Returns true if a card is fully inserted from the user's
* perspective. It may not yet be ready for access, but the driver
* can now start enabling access if necessary.
*/
static bool
pcib_hotplug_inserted(struct pcib_softc *sc)
{
/* Pretend the card isn't present if a detach is forced. */
if (sc->flags & PCIB_DETACHING)
return (false);
/* Card must be present in the slot. */
if ((sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS) == 0)
return (false);
/* A power fault implicitly turns off power to the slot. */
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD)
return (false);
/* If the MRL is disengaged, the slot is powered off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP &&
(sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS) != 0)
return (false);
return (true);
}
/*
* Returns -1 if the card is fully inserted, powered, and ready for
* access. Otherwise, returns 0.
*/
static int
pcib_hotplug_present(struct pcib_softc *sc)
{
/* Card must be inserted. */
if (!pcib_hotplug_inserted(sc))
return (0);
/*
* Require the Electromechanical Interlock to be engaged if
* present.
*/
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP &&
(sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) == 0)
return (0);
/* Require the Data Link Layer to be active. */
if (!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE))
return (0);
return (-1);
}
static void
pcib_pcie_hotplug_update(struct pcib_softc *sc, uint16_t val, uint16_t mask,
bool schedule_task)
{
bool card_inserted, ei_engaged;
/* Clear DETACHING if Presence Detect has cleared. */
if ((sc->pcie_slot_sta & (PCIEM_SLOT_STA_PDC | PCIEM_SLOT_STA_PDS)) ==
PCIEM_SLOT_STA_PDC)
sc->flags &= ~PCIB_DETACHING;
card_inserted = pcib_hotplug_inserted(sc);
/* Turn the power indicator on if a card is inserted. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PIP) {
mask |= PCIEM_SLOT_CTL_PIC;
if (card_inserted)
val |= PCIEM_SLOT_CTL_PI_ON;
else if (sc->flags & PCIB_DETACH_PENDING)
val |= PCIEM_SLOT_CTL_PI_BLINK;
else
val |= PCIEM_SLOT_CTL_PI_OFF;
}
/* Turn the power on via the Power Controller if a card is inserted. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) {
mask |= PCIEM_SLOT_CTL_PCC;
if (card_inserted)
val |= PCIEM_SLOT_CTL_PC_ON;
else
val |= PCIEM_SLOT_CTL_PC_OFF;
}
/*
* If a card is inserted, enable the Electromechanical
* Interlock. If a card is not inserted (or we are in the
* process of detaching), disable the Electromechanical
* Interlock.
*/
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP) {
mask |= PCIEM_SLOT_CTL_EIC;
ei_engaged = (sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) != 0;
if (card_inserted != ei_engaged)
val |= PCIEM_SLOT_CTL_EIC;
}
/*
* Start a timer to see if the Data Link Layer times out.
* Note that we only start the timer if Presence Detect or MRL Sensor
* changed on this interrupt. Stop any scheduled timer if
* the Data Link Layer is active.
*/
if (card_inserted &&
!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE) &&
sc->pcie_slot_sta &
(PCIEM_SLOT_STA_MRLSC | PCIEM_SLOT_STA_PDC)) {
if (cold)
device_printf(sc->dev,
"Data Link Layer inactive\n");
else
callout_reset(&sc->pcie_dll_timer, hz,
pcib_pcie_dll_timeout, sc);
} else if (sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE)
callout_stop(&sc->pcie_dll_timer);
pcib_pcie_hotplug_command(sc, val, mask);
/*
* During attach the child "pci" device is added synchronously;
* otherwise, the task is scheduled to manage the child
* device.
*/
if (schedule_task &&
(pcib_hotplug_present(sc) != 0) != (sc->child != NULL))
taskqueue_enqueue(taskqueue_thread, &sc->pcie_hp_task);
}
static void
pcib_pcie_intr_hotplug(void *arg)
{
struct pcib_softc *sc;
device_t dev;
sc = arg;
dev = sc->dev;
sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
/* Clear the events just reported. */
pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
if (bootverbose)
device_printf(dev, "HotPlug interrupt: %#x\n",
sc->pcie_slot_sta);
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_ABP) {
if (sc->flags & PCIB_DETACH_PENDING) {
device_printf(dev,
"Attention Button Pressed: Detach Cancelled\n");
sc->flags &= ~PCIB_DETACH_PENDING;
callout_stop(&sc->pcie_ab_timer);
} else {
device_printf(dev,
"Attention Button Pressed: Detaching in 5 seconds\n");
sc->flags |= PCIB_DETACH_PENDING;
callout_reset(&sc->pcie_ab_timer, 5 * hz,
pcib_pcie_ab_timeout, sc);
}
}
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD)
device_printf(dev, "Power Fault Detected\n");
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSC)
device_printf(dev, "MRL Sensor Changed to %s\n",
sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS ? "open" :
"closed");
if (bootverbose && sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC)
device_printf(dev, "Presence Detect Changed to %s\n",
sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS ? "card present" :
"empty");
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_CC)
pcib_pcie_hotplug_command_completed(sc);
if (sc->pcie_slot_sta & PCIEM_SLOT_STA_DLLSC) {
sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
if (bootverbose)
device_printf(dev,
"Data Link Layer State Changed to %s\n",
sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE ?
"active" : "inactive");
}
pcib_pcie_hotplug_update(sc, 0, 0, true);
}
static void
pcib_pcie_hotplug_task(void *context, int pending)
{
struct pcib_softc *sc;
device_t dev;
sc = context;
mtx_lock(&Giant);
dev = sc->dev;
if (pcib_hotplug_present(sc) != 0) {
if (sc->child == NULL) {
sc->child = device_add_child(dev, "pci", -1);
bus_generic_attach(dev);
}
} else {
if (sc->child != NULL) {
if (device_delete_child(dev, sc->child) == 0)
sc->child = NULL;
}
}
mtx_unlock(&Giant);
}
static void
pcib_pcie_ab_timeout(void *arg)
{
struct pcib_softc *sc;
- device_t dev;
sc = arg;
- dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
if (sc->flags & PCIB_DETACH_PENDING) {
sc->flags |= PCIB_DETACHING;
sc->flags &= ~PCIB_DETACH_PENDING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
}
}
static void
pcib_pcie_cc_timeout(void *arg)
{
struct pcib_softc *sc;
device_t dev;
uint16_t sta;
sc = arg;
dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
if (!(sta & PCIEM_SLOT_STA_CC)) {
device_printf(dev,
"HotPlug Command Timed Out - forcing detach\n");
sc->flags &= ~(PCIB_HOTPLUG_CMD_PENDING | PCIB_DETACH_PENDING);
sc->flags |= PCIB_DETACHING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
} else {
device_printf(dev,
"Missed HotPlug interrupt waiting for Command Completion\n");
pcib_pcie_intr_hotplug(sc);
}
}
static void
pcib_pcie_dll_timeout(void *arg)
{
struct pcib_softc *sc;
device_t dev;
uint16_t sta;
sc = arg;
dev = sc->dev;
mtx_assert(&Giant, MA_OWNED);
sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
if (!(sta & PCIEM_LINK_STA_DL_ACTIVE)) {
device_printf(dev,
"Timed out waiting for Data Link Layer Active\n");
sc->flags |= PCIB_DETACHING;
pcib_pcie_hotplug_update(sc, 0, 0, true);
} else if (sta != sc->pcie_link_sta) {
device_printf(dev,
"Missed HotPlug interrupt waiting for DLL Active\n");
pcib_pcie_intr_hotplug(sc);
}
}
static int
pcib_alloc_pcie_irq(struct pcib_softc *sc)
{
device_t dev;
int count, error, rid;
rid = -1;
dev = sc->dev;
/*
* For simplicity, only use MSI-X if there is a single message.
* To support a device with multiple messages we would have to
* use remap intr if the MSI number is not 0.
*/
count = pci_msix_count(dev);
if (count == 1) {
error = pci_alloc_msix(dev, &count);
if (error == 0)
rid = 1;
}
if (rid < 0 && pci_msi_count(dev) > 0) {
count = 1;
error = pci_alloc_msi(dev, &count);
if (error == 0)
rid = 1;
}
if (rid < 0)
rid = 0;
sc->pcie_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
RF_ACTIVE);
if (sc->pcie_irq == NULL) {
device_printf(dev,
"Failed to allocate interrupt for PCI-e events\n");
if (rid > 0)
pci_release_msi(dev);
return (ENXIO);
}
error = bus_setup_intr(dev, sc->pcie_irq, INTR_TYPE_MISC,
NULL, pcib_pcie_intr_hotplug, sc, &sc->pcie_ihand);
if (error) {
device_printf(dev, "Failed to setup PCI-e interrupt handler\n");
bus_release_resource(dev, SYS_RES_IRQ, rid, sc->pcie_irq);
if (rid > 0)
pci_release_msi(dev);
return (error);
}
return (0);
}
static int
pcib_release_pcie_irq(struct pcib_softc *sc)
{
device_t dev;
int error;
dev = sc->dev;
error = bus_teardown_intr(dev, sc->pcie_irq, sc->pcie_ihand);
if (error)
return (error);
error = bus_free_resource(dev, SYS_RES_IRQ, sc->pcie_irq);
if (error)
return (error);
return (pci_release_msi(dev));
}
static void
pcib_setup_hotplug(struct pcib_softc *sc)
{
device_t dev;
uint16_t mask, val;
dev = sc->dev;
callout_init(&sc->pcie_ab_timer, 0);
callout_init(&sc->pcie_cc_timer, 0);
callout_init(&sc->pcie_dll_timer, 0);
TASK_INIT(&sc->pcie_hp_task, 0, pcib_pcie_hotplug_task, sc);
/* Allocate IRQ. */
if (pcib_alloc_pcie_irq(sc) != 0)
return;
sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
/* Clear any events previously pending. */
pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
/* Enable HotPlug events. */
mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
val = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE | PCIEM_SLOT_CTL_PDCE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_APB)
val |= PCIEM_SLOT_CTL_ABPE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP)
val |= PCIEM_SLOT_CTL_PFDE;
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP)
val |= PCIEM_SLOT_CTL_MRLSCE;
if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS))
val |= PCIEM_SLOT_CTL_CCIE;
/* Turn the attention indicator off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
mask |= PCIEM_SLOT_CTL_AIC;
val |= PCIEM_SLOT_CTL_AI_OFF;
}
pcib_pcie_hotplug_update(sc, val, mask, false);
}
static int
pcib_detach_hotplug(struct pcib_softc *sc)
{
uint16_t mask, val;
int error;
/* Disable the card in the slot and force it to detach. */
if (sc->flags & PCIB_DETACH_PENDING) {
sc->flags &= ~PCIB_DETACH_PENDING;
callout_stop(&sc->pcie_ab_timer);
}
sc->flags |= PCIB_DETACHING;
if (sc->flags & PCIB_HOTPLUG_CMD_PENDING) {
callout_stop(&sc->pcie_cc_timer);
tsleep(sc, 0, "hpcmd", hz);
sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
}
/* Disable HotPlug events. */
mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
val = 0;
/* Turn the attention indicator off. */
if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
mask |= PCIEM_SLOT_CTL_AIC;
val |= PCIEM_SLOT_CTL_AI_OFF;
}
pcib_pcie_hotplug_update(sc, val, mask, false);
error = pcib_release_pcie_irq(sc);
if (error)
return (error);
taskqueue_drain(taskqueue_thread, &sc->pcie_hp_task);
callout_drain(&sc->pcie_ab_timer);
callout_drain(&sc->pcie_cc_timer);
callout_drain(&sc->pcie_dll_timer);
return (0);
}
#endif
/*
* Get current bridge configuration.
*/
static void
pcib_cfg_save(struct pcib_softc *sc)
{
#ifndef NEW_PCIB
device_t dev;
uint16_t command;
dev = sc->dev;
command = pci_read_config(dev, PCIR_COMMAND, 2);
if (command & PCIM_CMD_PORTEN)
pcib_get_io_decode(sc);
if (command & PCIM_CMD_MEMEN)
pcib_get_mem_decode(sc);
#endif
}
/*
* Restore previous bridge configuration.
*/
static void
pcib_cfg_restore(struct pcib_softc *sc)
{
- device_t dev;
#ifndef NEW_PCIB
uint16_t command;
#endif
- dev = sc->dev;
#ifdef NEW_PCIB
pcib_write_windows(sc, WIN_IO | WIN_MEM | WIN_PMEM);
#else
- command = pci_read_config(dev, PCIR_COMMAND, 2);
+ command = pci_read_config(sc->dev, PCIR_COMMAND, 2);
if (command & PCIM_CMD_PORTEN)
pcib_set_io_decode(sc);
if (command & PCIM_CMD_MEMEN)
pcib_set_mem_decode(sc);
#endif
}
/*
* Generic device interface
*/
static int
pcib_probe(device_t dev)
{
if ((pci_get_class(dev) == PCIC_BRIDGE) &&
(pci_get_subclass(dev) == PCIS_BRIDGE_PCI)) {
device_set_desc(dev, "PCI-PCI bridge");
return(-10000);
}
return(ENXIO);
}
void
pcib_attach_common(device_t dev)
{
struct pcib_softc *sc;
struct sysctl_ctx_list *sctx;
struct sysctl_oid *soid;
int comma;
sc = device_get_softc(dev);
sc->dev = dev;
/*
* Get current bridge configuration.
*/
sc->domain = pci_get_domain(dev);
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1);
sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
#endif
sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2);
pcib_cfg_save(sc);
/*
* The primary bus register should always be the bus of the
* parent.
*/
sc->pribus = pci_get_bus(dev);
pci_write_config(dev, PCIR_PRIBUS_1, sc->pribus, 1);
/*
* Setup sysctl reporting nodes
*/
sctx = device_get_sysctl_ctx(dev);
soid = device_get_sysctl_tree(dev);
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "domain",
CTLFLAG_RD, &sc->domain, 0, "Domain number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "pribus",
CTLFLAG_RD, &sc->pribus, 0, "Primary bus number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "secbus",
CTLFLAG_RD, &sc->bus.sec, 0, "Secondary bus number");
SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "subbus",
CTLFLAG_RD, &sc->bus.sub, 0, "Subordinate bus number");
/*
* Quirk handling.
*/
switch (pci_get_devid(dev)) {
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
case 0x12258086: /* Intel 82454KX/GX (Orion) */
{
uint8_t supbus;
supbus = pci_read_config(dev, 0x41, 1);
if (supbus != 0xff) {
sc->bus.sec = supbus + 1;
sc->bus.sub = supbus + 1;
}
break;
}
#endif
/*
* The i82380FB mobile docking controller is a PCI-PCI bridge,
* and it is a subtractive bridge. However, the ProgIf is wrong
* so the normal setting of PCIB_SUBTRACTIVE bit doesn't
* happen. There are also Toshiba and Cavium ThunderX bridges
* that behave this way.
*/
case 0xa002177d: /* Cavium ThunderX */
case 0x124b8086: /* Intel 82380FB Mobile */
case 0x060513d7: /* Toshiba ???? */
sc->flags |= PCIB_SUBTRACTIVE;
break;
#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
/* Compaq R3000 BIOS sets wrong subordinate bus number. */
case 0x00dd10de:
{
char *cp;
if ((cp = kern_getenv("smbios.planar.maker")) == NULL)
break;
if (strncmp(cp, "Compal", 6) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if ((cp = kern_getenv("smbios.planar.product")) == NULL)
break;
if (strncmp(cp, "08A0", 4) != 0) {
freeenv(cp);
break;
}
freeenv(cp);
if (sc->bus.sub < 0xa) {
pci_write_config(dev, PCIR_SUBBUS_1, 0xa, 1);
sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
}
break;
}
#endif
}
if (pci_msi_device_blacklisted(dev))
sc->flags |= PCIB_DISABLE_MSI;
if (pci_msix_device_blacklisted(dev))
sc->flags |= PCIB_DISABLE_MSIX;
/*
* Intel 815, 845 and other chipsets say they are PCI-PCI bridges,
* but have a ProgIF of 0x80. The 82801 family (AA, AB, BAM/CAM,
* BA/CA/DB and E) PCI bridges are HUB-PCI bridges, in Intelese.
* This means they act as if they were subtractively decoding
* bridges and pass all transactions. Mark them and real ProgIf 1
* parts as subtractive.
*/
if ((pci_get_devid(dev) & 0xff00ffff) == 0x24008086 ||
pci_read_config(dev, PCIR_PROGIF, 1) == PCIP_BRIDGE_PCI_SUBTRACTIVE)
sc->flags |= PCIB_SUBTRACTIVE;
#ifdef PCI_HP
pcib_probe_hotplug(sc);
#endif
#ifdef NEW_PCIB
#ifdef PCI_RES_BUS
pcib_setup_secbus(dev, &sc->bus, 1);
#endif
pcib_probe_windows(sc);
#endif
#ifdef PCI_HP
if (sc->flags & PCIB_HOTPLUG)
pcib_setup_hotplug(sc);
#endif
if (bootverbose) {
device_printf(dev, " domain %d\n", sc->domain);
device_printf(dev, " secondary bus %d\n", sc->bus.sec);
device_printf(dev, " subordinate bus %d\n", sc->bus.sub);
#ifdef NEW_PCIB
if (pcib_is_window_open(&sc->io))
device_printf(dev, " I/O decode 0x%jx-0x%jx\n",
(uintmax_t)sc->io.base, (uintmax_t)sc->io.limit);
if (pcib_is_window_open(&sc->mem))
device_printf(dev, " memory decode 0x%jx-0x%jx\n",
(uintmax_t)sc->mem.base, (uintmax_t)sc->mem.limit);
if (pcib_is_window_open(&sc->pmem))
device_printf(dev, " prefetched decode 0x%jx-0x%jx\n",
(uintmax_t)sc->pmem.base, (uintmax_t)sc->pmem.limit);
#else
if (pcib_is_io_open(sc))
device_printf(dev, " I/O decode 0x%x-0x%x\n",
sc->iobase, sc->iolimit);
if (pcib_is_nonprefetch_open(sc))
device_printf(dev, " memory decode 0x%jx-0x%jx\n",
(uintmax_t)sc->membase, (uintmax_t)sc->memlimit);
if (pcib_is_prefetch_open(sc))
device_printf(dev, " prefetched decode 0x%jx-0x%jx\n",
(uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
#endif
if (sc->bridgectl & (PCIB_BCR_ISA_ENABLE | PCIB_BCR_VGA_ENABLE) ||
sc->flags & PCIB_SUBTRACTIVE) {
device_printf(dev, " special decode ");
comma = 0;
if (sc->bridgectl & PCIB_BCR_ISA_ENABLE) {
printf("ISA");
comma = 1;
}
if (sc->bridgectl & PCIB_BCR_VGA_ENABLE) {
printf("%sVGA", comma ? ", " : "");
comma = 1;
}
if (sc->flags & PCIB_SUBTRACTIVE)
printf("%ssubtractive", comma ? ", " : "");
printf("\n");
}
}
/*
* Always enable busmastering on bridges so that transactions
* initiated on the secondary bus are passed through to the
* primary bus.
*/
pci_enable_busmaster(dev);
}
#ifdef PCI_HP
static int
pcib_present(struct pcib_softc *sc)
{
if (sc->flags & PCIB_HOTPLUG)
return (pcib_hotplug_present(sc) != 0);
return (1);
}
#endif
int
pcib_attach_child(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->bus.sec == 0) {
/* no secondary bus; we should have fixed this */
return(0);
}
#ifdef PCI_HP
if (!pcib_present(sc)) {
/* An empty HotPlug slot, so don't add a PCI bus yet. */
return (0);
}
#endif
sc->child = device_add_child(dev, "pci", -1);
return (bus_generic_attach(dev));
}
int
pcib_attach(device_t dev)
{
pcib_attach_common(dev);
return (pcib_attach_child(dev));
}
int
pcib_detach(device_t dev)
{
#if defined(PCI_HP) || defined(NEW_PCIB)
struct pcib_softc *sc;
#endif
int error;
#if defined(PCI_HP) || defined(NEW_PCIB)
sc = device_get_softc(dev);
#endif
error = bus_generic_detach(dev);
if (error)
return (error);
#ifdef PCI_HP
if (sc->flags & PCIB_HOTPLUG) {
error = pcib_detach_hotplug(sc);
if (error)
return (error);
}
#endif
error = device_delete_children(dev);
if (error)
return (error);
#ifdef NEW_PCIB
pcib_free_windows(sc);
#ifdef PCI_RES_BUS
pcib_free_secbus(dev, &sc->bus);
#endif
#endif
return (0);
}
int
pcib_suspend(device_t dev)
{
pcib_cfg_save(device_get_softc(dev));
return (bus_generic_suspend(dev));
}
int
pcib_resume(device_t dev)
{
pcib_cfg_restore(device_get_softc(dev));
return (bus_generic_resume(dev));
}
void
pcib_bridge_init(device_t dev)
{
pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
pci_write_config(dev, PCIR_IOBASEH_1, 0xffff, 2);
pci_write_config(dev, PCIR_IOLIMITL_1, 0, 1);
pci_write_config(dev, PCIR_IOLIMITH_1, 0, 2);
pci_write_config(dev, PCIR_MEMBASE_1, 0xffff, 2);
pci_write_config(dev, PCIR_MEMLIMIT_1, 0, 2);
pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
pci_write_config(dev, PCIR_PMBASEH_1, 0xffffffff, 4);
pci_write_config(dev, PCIR_PMLIMITL_1, 0, 2);
pci_write_config(dev, PCIR_PMLIMITH_1, 0, 4);
}
int
pcib_child_present(device_t dev, device_t child)
{
#ifdef PCI_HP
struct pcib_softc *sc = device_get_softc(dev);
int retval;
retval = bus_child_present(dev);
if (retval != 0 && sc->flags & PCIB_HOTPLUG)
retval = pcib_hotplug_present(sc);
return (retval);
#else
return (bus_child_present(dev));
#endif
}
int
pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
{
struct pcib_softc *sc = device_get_softc(dev);
switch (which) {
case PCIB_IVAR_DOMAIN:
*result = sc->domain;
return(0);
case PCIB_IVAR_BUS:
*result = sc->bus.sec;
return(0);
}
return(ENOENT);
}
int
pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
{
switch (which) {
case PCIB_IVAR_DOMAIN:
return(EINVAL);
case PCIB_IVAR_BUS:
return(EINVAL);
}
return(ENOENT);
}
#ifdef NEW_PCIB
/*
* Attempt to allocate a resource from the existing resources assigned
* to a window.
*/
static struct resource *
pcib_suballoc_resource(struct pcib_softc *sc, struct pcib_window *w,
device_t child, int type, int *rid, rman_res_t start, rman_res_t end,
rman_res_t count, u_int flags)
{
struct resource *res;
if (!pcib_is_window_open(w))
return (NULL);
res = rman_reserve_resource(&w->rman, start, end, count,
flags & ~RF_ACTIVE, child);
if (res == NULL)
return (NULL);
if (bootverbose)
device_printf(sc->dev,
"allocated %s range (%#jx-%#jx) for rid %x of %s\n",
w->name, rman_get_start(res), rman_get_end(res), *rid,
pcib_child_name(child));
rman_set_rid(res, *rid);
/*
* If the resource should be active, pass that request up the
* tree. This assumes the parent drivers can handle
* activating sub-allocated resources.
*/
if (flags & RF_ACTIVE) {
if (bus_activate_resource(child, type, *rid, res) != 0) {
rman_release_resource(res);
return (NULL);
}
}
return (res);
}
/* Allocate a fresh resource range for an unconfigured window. */
static int
pcib_alloc_new_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct resource *res;
rman_res_t base, limit, wmask;
int rid;
/*
* If this is an I/O window on a bridge with ISA enable set
* and the start address is below 64k, then try to allocate an
* initial window of 0x1000 bytes long starting at address
* 0xf000 and walking down. Note that if the original request
* was larger than the non-aliased range size of 0x100 our
* caller would have raised the start address up to 64k
* already.
*/
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
start < 65536) {
for (base = 0xf000; (long)base >= 0; base -= 0x1000) {
limit = base + 0xfff;
/*
* Skip ranges that wouldn't work for the
* original request. Note that the actual
* window that overlaps are the non-alias
* ranges within [base, limit], so this isn't
* quite a simple comparison.
*/
if (start + count > limit - 0x400)
continue;
if (base == 0) {
/*
* The first open region for the window at
* 0 is 0x400-0x4ff.
*/
if (end - count + 1 < 0x400)
continue;
} else {
if (end - count + 1 < base)
continue;
}
if (pcib_alloc_nonisa_ranges(sc, base, limit) == 0) {
w->base = base;
w->limit = limit;
return (0);
}
}
return (ENOSPC);
}
wmask = ((rman_res_t)1 << w->step) - 1;
if (RF_ALIGNMENT(flags) < w->step) {
flags &= ~RF_ALIGNMENT_MASK;
flags |= RF_ALIGNMENT_LOG2(w->step);
}
start &= ~wmask;
end |= wmask;
count = roundup2(count, (rman_res_t)1 << w->step);
rid = w->reg;
res = bus_alloc_resource(sc->dev, type, &rid, start, end, count,
flags & ~RF_ACTIVE);
if (res == NULL)
return (ENOSPC);
pcib_add_window_resources(w, &res, 1);
pcib_activate_window(sc, type);
w->base = rman_get_start(res);
w->limit = rman_get_end(res);
return (0);
}
/* Try to expand an existing window to the requested base and limit. */
static int
pcib_expand_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t base, rman_res_t limit)
{
struct resource *res;
int error, i, force_64k_base;
KASSERT(base <= w->base && limit >= w->limit,
("attempting to shrink window"));
/*
* XXX: pcib_grow_window() doesn't try to do this anyway and
* the error handling for all the edge cases would be tedious.
*/
KASSERT(limit == w->limit || base == w->base,
("attempting to grow both ends of a window"));
/*
* Yet more special handling for requests to expand an I/O
* window behind an ISA-enabled bridge. Since I/O windows
* have to grow in 0x1000 increments and the end of the 0xffff
* range is an alias, growing a window below 64k will always
* result in allocating new resources and never adjusting an
* existing resource.
*/
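/*
* For instance (hypothetical values), growing an I/O window that
* currently spans 0xe000-0xefff down to a new base of 0xd000 does not
* adjust the existing resource; pcib_alloc_nonisa_ranges() is asked
* for the non-alias sub-ranges within 0xd000-0xdfff instead, and only
* on success is w->base moved down.
*/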
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
(limit <= 65535 || (base <= 65535 && base != w->base))) {
KASSERT(limit == w->limit || limit <= 65535,
("attempting to grow both ends across 64k ISA alias"));
if (base != w->base)
error = pcib_alloc_nonisa_ranges(sc, base, w->base - 1);
else
error = pcib_alloc_nonisa_ranges(sc, w->limit + 1,
limit);
if (error == 0) {
w->base = base;
w->limit = limit;
}
return (error);
}
/*
* Find the existing resource to adjust. Usually there is only one,
* but for an ISA-enabled bridge we might be growing the I/O window
* above 64k and need to find the existing resource that maps all
* of the area above 64k.
*/
for (i = 0; i < w->count; i++) {
if (rman_get_end(w->res[i]) == w->limit)
break;
}
KASSERT(i != w->count, ("did not find existing resource"));
res = w->res[i];
/*
* Usually the resource we found should match the window's
* existing range. The one exception is the ISA-enabled case
* mentioned above in which case the resource should start at
* 64k.
*/
if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
w->base <= 65535) {
KASSERT(rman_get_start(res) == 65536,
("existing resource mismatch"));
force_64k_base = 1;
} else {
KASSERT(w->base == rman_get_start(res),
("existing resource mismatch"));
force_64k_base = 0;
}
error = bus_adjust_resource(sc->dev, type, res, force_64k_base ?
rman_get_start(res) : base, limit);
if (error)
return (error);
/* Add the newly allocated region to the resource manager. */
if (w->base != base) {
error = rman_manage_region(&w->rman, base, w->base - 1);
w->base = base;
} else {
error = rman_manage_region(&w->rman, w->limit + 1, limit);
w->limit = limit;
}
if (error) {
if (bootverbose)
device_printf(sc->dev,
"failed to expand %s resource manager\n", w->name);
(void)bus_adjust_resource(sc->dev, type, res, force_64k_base ?
rman_get_start(res) : w->base, w->limit);
}
return (error);
}
/*
* Attempt to grow a window to make room for a given resource request.
*/
static int
pcib_grow_window(struct pcib_softc *sc, struct pcib_window *w, int type,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
rman_res_t align, start_free, end_free, front, back, wmask;
int error;
/*
* Clamp the desired resource range to the maximum address
* this window supports. Reject impossible requests.
*
* For I/O port requests behind a bridge with the ISA enable
* bit set, force large allocations to start above 64k.
*/
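/*
* Example with made-up numbers: with ISA enable set, an I/O request
* for 0x200 bytes starting at 0x1000 is bumped to start at 0x10000,
* since no 0x200-byte contiguous span below 64k is free of ISA
* aliases.
*/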
if (!w->valid)
return (EINVAL);
if (sc->bridgectl & PCIB_BCR_ISA_ENABLE && count > 0x100 &&
start < 65536)
start = 65536;
if (end > w->rman.rm_end)
end = w->rman.rm_end;
if (start + count - 1 > end || start + count < start)
return (EINVAL);
wmask = ((rman_res_t)1 << w->step) - 1;
/*
* If there is no resource at all, just try to allocate enough
* aligned space for this resource.
*/
if (w->res == NULL) {
error = pcib_alloc_new_window(sc, w, type, start, end, count,
flags);
if (error) {
if (bootverbose)
device_printf(sc->dev,
"failed to allocate initial %s window (%#jx-%#jx,%#jx)\n",
w->name, start, end, count);
return (error);
}
if (bootverbose)
device_printf(sc->dev,
"allocated initial %s window of %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
goto updatewin;
}
/*
* See if growing the window would help. Compute the minimum
* amount of address space needed on both the front and back
* ends of the existing window to satisfy the allocation.
*
* For each end, build a candidate region adjusting for the
* required alignment, etc. If there is a free region at the
* edge of the window, grow from the inner edge of the free
* region. Otherwise grow from the window boundary.
*
* Growing an I/O window below 64k for a bridge with the ISA
* enable bit doesn't require any special magic as the step
* size of an I/O window (1k) always includes multiple
* non-alias ranges when it is grown in either direction.
*
* XXX: Special case: if w->res is completely empty and the
* request size is larger than w->res, we should find the
* optimal aligned buffer containing w->res and allocate that.
*/
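/*
* Worked example with made-up numbers: assume a memory window of
* 0xc0100000-0xc01fffff (w->step = 20, so wmask = 0xfffff) with no
* free space left inside it, and a 1MB request for anywhere in
* 0xc0000000-0xdfffffff with 1MB alignment.  On the front side,
* end_free ends up at w->base - 1 = 0xc00fffff, so front becomes
* 0xc0000000, which is in bounds and already wmask-aligned, and is
* then converted into the grow amount w->base - front = 0x100000.
* The back side works out to 0x100000 as well, and the loop below
* tries the smaller (here equal) candidate first.
*/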
if (bootverbose)
device_printf(sc->dev,
"attempting to grow %s window for (%#jx-%#jx,%#jx)\n",
w->name, start, end, count);
align = (rman_res_t)1 << RF_ALIGNMENT(flags);
if (start < w->base) {
if (rman_first_free_region(&w->rman, &start_free, &end_free) !=
0 || start_free != w->base)
end_free = w->base;
if (end_free > end)
end_free = end + 1;
/* Move end_free down until it is properly aligned. */
end_free &= ~(align - 1);
end_free--;
front = end_free - (count - 1);
/*
* The resource would now be allocated at (front,
* end_free). Ensure that fits in the (start, end)
* bounds. end_free is checked above. If 'front' is
* ok, ensure it is properly aligned for this window.
* Also check for underflow.
*/
if (front >= start && front <= end_free) {
if (bootverbose)
printf("\tfront candidate range: %#jx-%#jx\n",
front, end_free);
front &= ~wmask;
front = w->base - front;
} else
front = 0;
} else
front = 0;
if (end > w->limit) {
if (rman_last_free_region(&w->rman, &start_free, &end_free) !=
0 || end_free != w->limit)
start_free = w->limit + 1;
if (start_free < start)
start_free = start;
/* Move start_free up until it is properly aligned. */
start_free = roundup2(start_free, align);
back = start_free + count - 1;
/*
* The resource would now be allocated at (start_free,
* back). Ensure that fits in the (start, end)
* bounds. start_free is checked above. If 'back' is
* ok, ensure it is properly aligned for this window.
* Also check for overflow.
*/
if (back <= end && start_free <= back) {
if (bootverbose)
printf("\tback candidate range: %#jx-%#jx\n",
start_free, back);
back |= wmask;
back -= w->limit;
} else
back = 0;
} else
back = 0;
/*
* Try to allocate the smallest needed region first.
* If that fails, fall back to the other region.
*/
error = ENOSPC;
while (front != 0 || back != 0) {
if (front != 0 && (front <= back || back == 0)) {
error = pcib_expand_window(sc, w, type, w->base - front,
w->limit);
if (error == 0)
break;
front = 0;
} else {
error = pcib_expand_window(sc, w, type, w->base,
w->limit + back);
if (error == 0)
break;
back = 0;
}
}
if (error)
return (error);
if (bootverbose)
device_printf(sc->dev, "grew %s window to %#jx-%#jx\n",
w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
updatewin:
/* Write the new window. */
KASSERT((w->base & wmask) == 0, ("start address is not aligned"));
KASSERT((w->limit & wmask) == wmask, ("end address is not aligned"));
pcib_write_windows(sc, w->mask);
return (0);
}
/*
* We have to trap resource allocation requests and ensure that the bridge
* is set up to handle them, or is at least capable of doing so.
*/
struct resource *
pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct pcib_softc *sc;
struct resource *r;
sc = device_get_softc(dev);
/*
* VGA resources are decoded iff the VGA enable bit is set in
* the bridge control register. VGA resources do not fall into
* the resource windows and are passed up to the parent.
*/
if ((type == SYS_RES_IOPORT && pci_is_vga_ioport_range(start, end)) ||
(type == SYS_RES_MEMORY && pci_is_vga_memory_range(start, end))) {
if (sc->bridgectl & PCIB_BCR_VGA_ENABLE)
return (bus_generic_alloc_resource(dev, child, type,
rid, start, end, count, flags));
else
return (NULL);
}
switch (type) {
#ifdef PCI_RES_BUS
case PCI_RES_BUS:
return (pcib_alloc_subbus(&sc->bus, child, rid, start, end,
count, flags));
#endif
case SYS_RES_IOPORT:
if (pcib_is_isa_range(sc, start, end, count))
return (NULL);
r = pcib_suballoc_resource(sc, &sc->io, child, type, rid, start,
end, count, flags);
if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
break;
if (pcib_grow_window(sc, &sc->io, type, start, end, count,
flags) == 0)
r = pcib_suballoc_resource(sc, &sc->io, child, type,
rid, start, end, count, flags);
break;
case SYS_RES_MEMORY:
/*
* For prefetchable resources, prefer the prefetchable
* memory window, but fall back to the regular memory
* window if that fails. Try both windows before
* attempting to grow a window in case the firmware
* has used a range in the regular memory window to
* map a prefetchable BAR.
*/
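/*
* Hypothetical example: if the firmware placed a device's
* prefetchable BAR inside the regular memory window at, say,
* 0xc0200000, the fallback suballocation from sc->mem below succeeds
* and no prefetchable window has to be grown for it.
*/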
if (flags & RF_PREFETCHABLE) {
r = pcib_suballoc_resource(sc, &sc->pmem, child, type,
rid, start, end, count, flags);
if (r != NULL)
break;
}
r = pcib_suballoc_resource(sc, &sc->mem, child, type, rid,
start, end, count, flags);
if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
break;
if (flags & RF_PREFETCHABLE) {
if (pcib_grow_window(sc, &sc->pmem, type, start, end,
count, flags) == 0) {
r = pcib_suballoc_resource(sc, &sc->pmem, child,
type, rid, start, end, count, flags);
if (r != NULL)
break;
}
}
if (pcib_grow_window(sc, &sc->mem, type, start, end, count,
flags & ~RF_PREFETCHABLE) == 0)
r = pcib_suballoc_resource(sc, &sc->mem, child, type,
rid, start, end, count, flags);
break;
default:
return (bus_generic_alloc_resource(dev, child, type, rid,
start, end, count, flags));
}
/*
* If attempts to suballocate from the window fail but this is a
* subtractive bridge, pass the request up the tree.
*/
if (sc->flags & PCIB_SUBTRACTIVE && r == NULL)
return (bus_generic_alloc_resource(dev, child, type, rid,
start, end, count, flags));
return (r);
}
int
pcib_adjust_resource(device_t bus, device_t child, int type, struct resource *r,
rman_res_t start, rman_res_t end)
{
struct pcib_softc *sc;
sc = device_get_softc(bus);
if (pcib_is_resource_managed(sc, type, r))
return (rman_adjust_resource(r, start, end));
return (bus_generic_adjust_resource(bus, child, type, r, start, end));
}
int
pcib_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
struct pcib_softc *sc;
int error;
sc = device_get_softc(dev);
if (pcib_is_resource_managed(sc, type, r)) {
if (rman_get_flags(r) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type, rid, r);
if (error)
return (error);
}
return (rman_release_resource(r));
}
return (bus_generic_release_resource(dev, child, type, rid, r));
}
#else
/*
* We have to trap resource allocation requests and ensure that the bridge
* is set up to handle them, or is at least capable of doing so.
*/
struct resource *
pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct pcib_softc *sc = device_get_softc(dev);
const char *name, *suffix;
int ok;
/*
* Fail the allocation for this range if it's not supported.
*/
name = device_get_nameunit(child);
if (name == NULL) {
name = "";
suffix = "";
} else
suffix = " ";
switch (type) {
case SYS_RES_IOPORT:
ok = 0;
if (!pcib_is_io_open(sc))
break;
ok = (start >= sc->iobase && end <= sc->iolimit);
/*
* Make sure we allow access to VGA I/O addresses when the
* bridge has the "VGA Enable" bit set.
*/
if (!ok && pci_is_vga_ioport_range(start, end))
ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
if (!ok) {
if (start < sc->iobase)
start = sc->iobase;
if (end > sc->iolimit)
end = sc->iolimit;
if (start < end)
ok = 1;
}
} else {
ok = 1;
#if 0
/*
* If we overlap with the subtractive range, then
* pick the upper range to use.
*/
if (start < sc->iolimit && end > sc->iobase)
start = sc->iolimit + 1;
#endif
}
if (end < start) {
device_printf(dev, "ioport: end (%jx) < start (%jx)\n",
end, start);
start = 0;
end = 0;
ok = 0;
}
if (!ok) {
device_printf(dev, "%s%srequested unsupported I/O "
"range 0x%jx-0x%jx (decoding 0x%x-0x%x)\n",
name, suffix, start, end, sc->iobase, sc->iolimit);
return (NULL);
}
if (bootverbose)
device_printf(dev,
"%s%srequested I/O range 0x%jx-0x%jx: in range\n",
name, suffix, start, end);
break;
case SYS_RES_MEMORY:
ok = 0;
if (pcib_is_nonprefetch_open(sc))
ok = ok || (start >= sc->membase && end <= sc->memlimit);
if (pcib_is_prefetch_open(sc))
ok = ok || (start >= sc->pmembase && end <= sc->pmemlimit);
/*
* Make sure we allow access to VGA memory addresses when the
* bridge has the "VGA Enable" bit set.
*/
if (!ok && pci_is_vga_memory_range(start, end))
ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
if (!ok) {
ok = 1;
if (flags & RF_PREFETCHABLE) {
if (pcib_is_prefetch_open(sc)) {
if (start < sc->pmembase)
start = sc->pmembase;
if (end > sc->pmemlimit)
end = sc->pmemlimit;
} else {
ok = 0;
}
} else { /* non-prefetchable */
if (pcib_is_nonprefetch_open(sc)) {
if (start < sc->membase)
start = sc->membase;
if (end > sc->memlimit)
end = sc->memlimit;
} else {
ok = 0;
}
}
}
} else if (!ok) {
ok = 1; /* subtractive bridge: always ok */
#if 0
if (pcib_is_nonprefetch_open(sc)) {
if (start < sc->memlimit && end > sc->membase)
start = sc->memlimit + 1;
}
if (pcib_is_prefetch_open(sc)) {
if (start < sc->pmemlimit && end > sc->pmembase)
start = sc->pmemlimit + 1;
}
#endif
}
if (end < start) {
device_printf(dev, "memory: end (%jx) < start (%jx)\n",
end, start);
start = 0;
end = 0;
ok = 0;
}
if (!ok && bootverbose)
device_printf(dev,
"%s%srequested unsupported memory range %#jx-%#jx "
"(decoding %#jx-%#jx, %#jx-%#jx)\n",
name, suffix, start, end,
(uintmax_t)sc->membase, (uintmax_t)sc->memlimit,
(uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
if (!ok)
return (NULL);
if (bootverbose)
device_printf(dev,"%s%srequested memory range "
"0x%jx-0x%jx: good\n",
name, suffix, start, end);
break;
default:
break;
}
/*
* Bridge is OK decoding this resource, so pass it up.
*/
return (bus_generic_alloc_resource(dev, child, type, rid, start, end,
count, flags));
}
#endif
/*
* If ARI is enabled on this downstream port, translate the function number
* to the non-ARI slot/function. The downstream port will convert it back in
* hardware. If ARI is not enabled, slot and func are not modified.
*/
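/*
* For example (numbers made up): an ARI function number of 37 is
* passed down as slot 4, function 5 (37 == 4 * 8 + 5), which occupies
* the same RID bits a conventional device/function pair would.
*/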
static __inline void
pcib_xlate_ari(device_t pcib, int bus, int *slot, int *func)
{
struct pcib_softc *sc;
int ari_func;
sc = device_get_softc(pcib);
ari_func = *func;
if (sc->flags & PCIB_ENABLE_ARI) {
KASSERT(*slot == 0,
("Non-zero slot number with ARI enabled!"));
*slot = PCIE_ARI_SLOT(ari_func);
*func = PCIE_ARI_FUNC(ari_func);
}
}
static void
pcib_enable_ari(struct pcib_softc *sc, uint32_t pcie_pos)
{
uint32_t ctl2;
ctl2 = pci_read_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, 4);
ctl2 |= PCIEM_CTL2_ARI;
pci_write_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, ctl2, 4);
sc->flags |= PCIB_ENABLE_ARI;
}
/*
* PCIB interface.
*/
int
pcib_maxslots(device_t dev)
{
return (PCI_SLOTMAX);
}
static int
pcib_ari_maxslots(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->flags & PCIB_ENABLE_ARI)
return (PCIE_ARI_SLOTMAX);
else
return (PCI_SLOTMAX);
}
static int
pcib_ari_maxfuncs(device_t dev)
{
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (sc->flags & PCIB_ENABLE_ARI)
return (PCIE_ARI_FUNCMAX);
else
return (PCI_FUNCMAX);
}
static void
pcib_ari_decode_rid(device_t pcib, uint16_t rid, int *bus, int *slot,
int *func)
{
struct pcib_softc *sc;
sc = device_get_softc(pcib);
*bus = PCI_RID2BUS(rid);
if (sc->flags & PCIB_ENABLE_ARI) {
*slot = PCIE_ARI_RID2SLOT(rid);
*func = PCIE_ARI_RID2FUNC(rid);
} else {
*slot = PCI_RID2SLOT(rid);
*func = PCI_RID2FUNC(rid);
}
}
/*
* Our parent is a PCI bus; its parent must support the pcib interface.
*/
static uint32_t
pcib_read_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, int width)
{
#ifdef PCI_HP
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (!pcib_present(sc)) {
switch (width) {
case 2:
return (0xffff);
case 1:
return (0xff);
default:
return (0xffffffff);
}
}
#endif
pcib_xlate_ari(dev, b, &s, &f);
return(PCIB_READ_CONFIG(device_get_parent(device_get_parent(dev)), b, s,
f, reg, width));
}
static void
pcib_write_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, uint32_t val, int width)
{
#ifdef PCI_HP
struct pcib_softc *sc;
sc = device_get_softc(dev);
if (!pcib_present(sc))
return;
#endif
pcib_xlate_ari(dev, b, &s, &f);
PCIB_WRITE_CONFIG(device_get_parent(device_get_parent(dev)), b, s, f,
reg, val, width);
}
/*
* Route an interrupt across a PCI bridge.
*/
int
pcib_route_interrupt(device_t pcib, device_t dev, int pin)
{
device_t bus;
int parent_intpin;
int intnum;
/*
* The PCI standard defines a swizzle of the child-side device/intpin to
* the parent-side intpin as follows.
*
* device = device on child bus
* child_intpin = intpin on child bus slot (0-3)
* parent_intpin = intpin on parent bus slot (0-3)
*
* parent_intpin = (device + child_intpin) % 4
*/
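/*
* Worked example: a device in slot 3 asserting INTB (pin 2) yields
* parent_intpin = (3 + (2 - 1)) % 4 = 0, i.e. it is routed as INTA
* (pin 1) on the parent bus below.
*/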
parent_intpin = (pci_get_slot(dev) + (pin - 1)) % 4;
/*
* Our parent is a PCI bus. Its parent must export the pcib interface,
* which includes the ability to route interrupts.
*/
bus = device_get_parent(pcib);
intnum = PCIB_ROUTE_INTERRUPT(device_get_parent(bus), pcib, parent_intpin + 1);
if (PCI_INTERRUPT_VALID(intnum) && bootverbose) {
device_printf(pcib, "slot %d INT%c is routed to irq %d\n",
pci_get_slot(dev), 'A' + pin - 1, intnum);
}
return(intnum);
}
/* Pass request to alloc MSI/MSI-X messages up to the parent bridge. */
int
pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
{
struct pcib_softc *sc = device_get_softc(pcib);
device_t bus;
if (sc->flags & PCIB_DISABLE_MSI)
return (ENXIO);
bus = device_get_parent(pcib);
return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount,
irqs));
}
/* Pass request to release MSI/MSI-X messages up to the parent bridge. */
int
pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs));
}
/* Pass request to alloc an MSI-X message up to the parent bridge. */
int
pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
{
struct pcib_softc *sc = device_get_softc(pcib);
device_t bus;
if (sc->flags & PCIB_DISABLE_MSIX)
return (ENXIO);
bus = device_get_parent(pcib);
return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq));
}
/* Pass request to release an MSI-X message up to the parent bridge. */
int
pcib_release_msix(device_t pcib, device_t dev, int irq)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq));
}
/* Pass request to map MSI/MSI-X message up to parent bridge. */
int
pcib_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr,
uint32_t *data)
{
device_t bus;
int error;
bus = device_get_parent(pcib);
error = PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data);
if (error)
return (error);
pci_ht_map_msi(pcib, *addr);
return (0);
}
/* Pass request for device power state up to parent bridge. */
int
pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate)
{
device_t bus;
bus = device_get_parent(pcib);
return (PCIB_POWER_FOR_SLEEP(bus, dev, pstate));
}
static int
pcib_ari_enabled(device_t pcib)
{
struct pcib_softc *sc;
sc = device_get_softc(pcib);
return ((sc->flags & PCIB_ENABLE_ARI) != 0);
}
static int
pcib_ari_get_id(device_t pcib, device_t dev, enum pci_id_type type,
uintptr_t *id)
{
struct pcib_softc *sc;
device_t bus_dev;
uint8_t bus, slot, func;
if (type != PCI_ID_RID) {
bus_dev = device_get_parent(pcib);
return (PCIB_GET_ID(device_get_parent(bus_dev), dev, type, id));
}
sc = device_get_softc(pcib);
if (sc->flags & PCIB_ENABLE_ARI) {
bus = pci_get_bus(dev);
func = pci_get_function(dev);
*id = (PCI_ARI_RID(bus, func));
} else {
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
*id = (PCI_RID(bus, slot, func));
}
return (0);
}
/*
* Check that the downstream port (pcib) and the endpoint device (dev) both
* support ARI. If so, enable it and return 0; otherwise return an error.
*/
static int
pcib_try_enable_ari(device_t pcib, device_t dev)
{
struct pcib_softc *sc;
int error;
uint32_t cap2;
int ari_cap_off;
uint32_t ari_ver;
uint32_t pcie_pos;
sc = device_get_softc(pcib);
/*
* ARI is controlled in a register in the PCIe capability structure.
* If the downstream port does not have the PCIe capability structure
* then it does not support ARI.
*/
error = pci_find_cap(pcib, PCIY_EXPRESS, &pcie_pos);
if (error != 0)
return (ENODEV);
/* Check that the PCIe port advertises ARI support. */
cap2 = pci_read_config(pcib, pcie_pos + PCIER_DEVICE_CAP2, 4);
if (!(cap2 & PCIEM_CAP2_ARI))
return (ENODEV);
/*
* Check that the endpoint device advertises ARI support via the ARI
* extended capability structure.
*/
error = pci_find_extcap(dev, PCIZ_ARI, &ari_cap_off);
if (error != 0)
return (ENODEV);
/*
* Finally, check that the endpoint device supports the same version
* of ARI that we do.
*/
ari_ver = pci_read_config(dev, ari_cap_off, 4);
if (PCI_EXTCAP_VER(ari_ver) != PCIB_SUPPORTED_ARI_VER) {
if (bootverbose)
device_printf(pcib,
"Unsupported version of ARI (%d) detected\n",
PCI_EXTCAP_VER(ari_ver));
return (ENXIO);
}
pcib_enable_ari(sc, pcie_pos);
return (0);
}
int
pcib_request_feature_allow(device_t pcib, device_t dev,
enum pci_feature feature)
{
/*
* There is no host firmware to negotiate with, so we allow
* every valid feature requested.
*/
switch (feature) {
case PCI_FEATURE_AER:
case PCI_FEATURE_HP:
break;
default:
return (EINVAL);
}
return (0);
}
int
pcib_request_feature(device_t dev, enum pci_feature feature)
{
/*
* Invoke PCIB_REQUEST_FEATURE of this bridge first in case
* the firmware overrides the method of PCI-PCI bridges.
*/
return (PCIB_REQUEST_FEATURE(dev, dev, feature));
}
/*
* Pass the request to use this PCI feature up the tree. Either there is
* firmware, such as ACPI, that is using this feature and will approve (or deny)
* the request to take it over, or the platform has no such firmware, in which case
* the request will be approved. If the request is approved, the OS is expected
* to make use of the feature or render it harmless.
*/
static int
pcib_request_feature_default(device_t pcib, device_t dev,
enum pci_feature feature)
{
device_t bus;
/*
* Our parent is necessarily a pci bus. Its parent will either be
* another pci bridge (which passes it up) or a host bridge that can
* approve or reject the request.
*/
bus = device_get_parent(pcib);
return (PCIB_REQUEST_FEATURE(device_get_parent(bus), dev, feature));
}
Index: head/sys/dev/smc/if_smc_fdt.c
===================================================================
--- head/sys/dev/smc/if_smc_fdt.c (revision 327172)
+++ head/sys/dev/smc/if_smc_fdt.c (revision 327173)
@@ -1,135 +1,126 @@
/*-
* Copyright (c) 2008 Benno Rice
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <dev/smc/if_smcvar.h>
#include <dev/mii/mii.h>
#include <dev/mii/miivar.h>
#include <dev/fdt/fdt_common.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include "miibus_if.h"
static int smc_fdt_probe(device_t);
static int smc_fdt_attach(device_t);
static int smc_fdt_detach(device_t);
static int
smc_fdt_probe(device_t dev)
{
struct smc_softc *sc;
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (ofw_bus_is_compatible(dev, "smsc,lan91c111")) {
sc = device_get_softc(dev);
sc->smc_usemem = 1;
if (smc_probe(dev) != 0) {
return (ENXIO);
}
return (0);
}
return (ENXIO);
}
static int
smc_fdt_attach(device_t dev)
{
- int err;
- struct smc_softc *sc;
- sc = device_get_softc(dev);
-
- err = smc_attach(dev);
- if (err) {
- return (err);
- }
-
- return (0);
+ return smc_attach(dev);
}
static int
smc_fdt_detach(device_t dev)
{
smc_detach(dev);
return (0);
}
static device_method_t smc_fdt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, smc_fdt_probe),
DEVMETHOD(device_attach, smc_fdt_attach),
DEVMETHOD(device_detach, smc_fdt_detach),
/* MII interface */
DEVMETHOD(miibus_readreg, smc_miibus_readreg),
DEVMETHOD(miibus_writereg, smc_miibus_writereg),
DEVMETHOD(miibus_statchg, smc_miibus_statchg),
{ 0, 0 }
};
static driver_t smc_fdt_driver = {
"smc",
smc_fdt_methods,
sizeof(struct smc_softc),
};
extern devclass_t smc_devclass;
DRIVER_MODULE(smc, simplebus, smc_fdt_driver, smc_devclass, 0, 0);
DRIVER_MODULE(miibus, smc, miibus_driver, miibus_devclass, 0, 0);
MODULE_DEPEND(smc, fdt, 1, 1, 1);
MODULE_DEPEND(smc, ether, 1, 1, 1);
MODULE_DEPEND(smc, miibus, 1, 1, 1);
Index: head/sys/dev/uart/uart_bus_acpi.c
===================================================================
--- head/sys/dev/uart/uart_bus_acpi.c (revision 327172)
+++ head/sys/dev/uart/uart_bus_acpi.c (revision 327173)
@@ -1,127 +1,125 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2001 M. Warner Losh. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <isa/isavar.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_bus.h>
#include <dev/uart/uart_cpu_acpi.h>
#ifdef __aarch64__
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
#include <dev/acpica/acpivar.h>
#endif
static int uart_acpi_probe(device_t dev);
static device_method_t uart_acpi_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, uart_acpi_probe),
DEVMETHOD(device_attach, uart_bus_attach),
DEVMETHOD(device_detach, uart_bus_detach),
DEVMETHOD(device_resume, uart_bus_resume),
{ 0, 0 }
};
static driver_t uart_acpi_driver = {
uart_driver_name,
uart_acpi_methods,
sizeof(struct uart_softc),
};
#if defined(__i386__) || defined(__amd64__)
static struct isa_pnp_id acpi_ns8250_ids[] = {
{0x0005d041, "Standard PC COM port"}, /* PNP0500 */
{0x0105d041, "16550A-compatible COM port"}, /* PNP0501 */
{0x0205d041, "Multiport serial device (non-intelligent 16550)"}, /* PNP0502 */
{0x1005d041, "Generic IRDA-compatible device"}, /* PNP0510 */
{0x1105d041, "Generic IRDA-compatible device"}, /* PNP0511 */
{0x04f0235c, "Wacom Tablet PC Screen"}, /* WACF004 */
{0x0ef0235c, "Wacom Tablet PC Screen 00e"}, /* WACF00e */
{0xe502aa1a, "Wacom Tablet at FuS Lifebook T"}, /* FUJ02E5 */
{0}
};
#endif
#ifdef __aarch64__
static struct uart_class *
uart_acpi_find_device(device_t dev)
{
struct acpi_uart_compat_data **cd;
ACPI_HANDLE h;
if ((h = acpi_get_handle(dev)) == NULL)
return (NULL);
SET_FOREACH(cd, uart_acpi_class_and_device_set) {
if (acpi_MatchHid(h, (*cd)->hid)) {
return ((*cd)->clas);
}
}
return (NULL);
}
#endif
static int
uart_acpi_probe(device_t dev)
{
struct uart_softc *sc;
- device_t parent;
- parent = device_get_parent(dev);
sc = device_get_softc(dev);
#if defined(__i386__) || defined(__amd64__)
- if (!ISA_PNP_PROBE(parent, dev, acpi_ns8250_ids)) {
+ if (!ISA_PNP_PROBE(device_get_parent(dev), dev, acpi_ns8250_ids)) {
sc->sc_class = &uart_ns8250_class;
return (uart_bus_probe(dev, 0, 0, 0, 0, 0));
}
/* Add checks for non-ns8250 IDs here. */
#elif defined(__aarch64__)
if ((sc->sc_class = uart_acpi_find_device(dev)) != NULL)
return (uart_bus_probe(dev, 2, 0, 0, 0, 0));
#endif
return (ENXIO);
}
DRIVER_MODULE(uart, acpi, uart_acpi_driver, uart_devclass, 0, 0);
Index: head/sys/dev/uart/uart_dev_pl011.c
===================================================================
--- head/sys/dev/uart/uart_dev_pl011.c (revision 327172)
+++ head/sys/dev/uart/uart_dev_pl011.c (revision 327173)
@@ -1,592 +1,590 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Semihalf.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_acpi.h"
#include "opt_platform.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_cpu.h>
#ifdef FDT
#include <dev/uart/uart_cpu_fdt.h>
#include <dev/ofw/ofw_bus.h>
#endif
#include <dev/uart/uart_bus.h>
#include "uart_if.h"
#ifdef DEV_ACPI
#include <dev/uart/uart_cpu_acpi.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
#include <contrib/dev/acpica/include/actables.h>
#endif
#include <sys/kdb.h>
/* PL011 UART registers and masks */
#define UART_DR 0x00 /* Data register */
#define DR_FE (1 << 8) /* Framing error */
#define DR_PE (1 << 9) /* Parity error */
#define DR_BE (1 << 10) /* Break error */
#define DR_OE (1 << 11) /* Overrun error */
#define UART_FR 0x06 /* Flag register */
#define FR_RXFE (1 << 4) /* Receive FIFO/reg empty */
#define FR_TXFF (1 << 5) /* Transmit FIFO/reg full */
#define FR_RXFF (1 << 6) /* Receive FIFO/reg full */
#define FR_TXFE (1 << 7) /* Transmit FIFO/reg empty */
#define UART_IBRD 0x09 /* Integer baud rate register */
#define IBRD_BDIVINT 0xffff /* Significant part of int. divisor value */
#define UART_FBRD 0x0a /* Fractional baud rate register */
#define FBRD_BDIVFRAC 0x3f /* Significant part of frac. divisor value */
#define UART_LCR_H 0x0b /* Line control register */
#define LCR_H_WLEN8 (0x3 << 5)
#define LCR_H_WLEN7 (0x2 << 5)
#define LCR_H_WLEN6 (0x1 << 5)
#define LCR_H_FEN (1 << 4) /* FIFO mode enable */
#define LCR_H_STP2 (1 << 3) /* 2 stop frames at the end */
#define LCR_H_EPS (1 << 2) /* Even parity select */
#define LCR_H_PEN (1 << 1) /* Parity enable */
#define UART_CR 0x0c /* Control register */
#define CR_RXE (1 << 9) /* Receive enable */
#define CR_TXE (1 << 8) /* Transmit enable */
#define CR_UARTEN (1 << 0) /* UART enable */
#define UART_IFLS 0x0d /* FIFO level select register */
#define IFLS_RX_SHIFT 3 /* RX level in bits [5:3] */
#define IFLS_TX_SHIFT 0 /* TX level in bits [2:0] */
#define IFLS_MASK 0x07 /* RX/TX level is 3 bits */
#define IFLS_LVL_1_8th 0 /* Interrupt at 1/8 full */
#define IFLS_LVL_2_8th 1 /* Interrupt at 1/4 full */
#define IFLS_LVL_4_8th 2 /* Interrupt at 1/2 full */
#define IFLS_LVL_6_8th 3 /* Interrupt at 3/4 full */
#define IFLS_LVL_7_8th 4 /* Interrupt at 7/8 full */
#define UART_IMSC 0x0e /* Interrupt mask set/clear register */
#define IMSC_MASK_ALL 0x7ff /* Mask all interrupts */
#define UART_RIS 0x0f /* Raw interrupt status register */
#define UART_RXREADY (1 << 4) /* RX buffer full */
#define UART_TXEMPTY (1 << 5) /* TX buffer empty */
#define RIS_RTIM (1 << 6) /* Receive timeout */
#define RIS_FE (1 << 7) /* Framing error interrupt status */
#define RIS_PE (1 << 8) /* Parity error interrupt status */
#define RIS_BE (1 << 9) /* Break error interrupt status */
#define RIS_OE (1 << 10) /* Overrun interrupt status */
#define UART_MIS 0x10 /* Masked interrupt status register */
#define UART_ICR 0x11 /* Interrupt clear register */
#define UART_PIDREG_0 0x3f8 /* Peripheral ID register 0 */
#define UART_PIDREG_1 0x3f9 /* Peripheral ID register 1 */
#define UART_PIDREG_2 0x3fa /* Peripheral ID register 2 */
#define UART_PIDREG_3 0x3fb /* Peripheral ID register 3 */
/*
* The hardware FIFOs are 16 bytes each on rev 2 and earlier hardware, 32 bytes
* on rev 3 and later. We configure them to interrupt when 3/4 full/empty. For
* RX we set the size to the full hardware capacity so that the uart core
* allocates enough buffer space to hold a complete fifo full of incoming data.
* For TX, we need to limit the size to the capacity we know will be available
* when the interrupt occurs; uart_core will feed exactly that many bytes to
* uart_pl011_bus_transmit() which must consume them all.
*/
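/*
* For example, on rev 3 hardware with 32-byte FIFOs and the 1/4-full
* TX interrupt level selected below, 24 slots are guaranteed free
* when the TX interrupt fires, hence FIFO_TX_SIZE_R3 is 24; rev 2
* hardware with 16-byte FIFOs leaves 12 free, hence FIFO_TX_SIZE_R2
* is 12.
*/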
#define FIFO_RX_SIZE_R2 16
#define FIFO_TX_SIZE_R2 12
#define FIFO_RX_SIZE_R3 32
#define FIFO_TX_SIZE_R3 24
#define FIFO_IFLS_BITS ((IFLS_LVL_6_8th << IFLS_RX_SHIFT) | (IFLS_LVL_2_8th))
/*
* FIXME: the actual register size is SoC-dependent; we need to handle it
*/
#define __uart_getreg(bas, reg) \
bus_space_read_4((bas)->bst, (bas)->bsh, uart_regofs(bas, reg))
#define __uart_setreg(bas, reg, value) \
bus_space_write_4((bas)->bst, (bas)->bsh, uart_regofs(bas, reg), value)
/*
* Low-level UART interface.
*/
static int uart_pl011_probe(struct uart_bas *bas);
static void uart_pl011_init(struct uart_bas *bas, int, int, int, int);
static void uart_pl011_term(struct uart_bas *bas);
static void uart_pl011_putc(struct uart_bas *bas, int);
static int uart_pl011_rxready(struct uart_bas *bas);
static int uart_pl011_getc(struct uart_bas *bas, struct mtx *);
static struct uart_ops uart_pl011_ops = {
.probe = uart_pl011_probe,
.init = uart_pl011_init,
.term = uart_pl011_term,
.putc = uart_pl011_putc,
.rxready = uart_pl011_rxready,
.getc = uart_pl011_getc,
};
static int
uart_pl011_probe(struct uart_bas *bas)
{
return (0);
}
static void
uart_pl011_param(struct uart_bas *bas, int baudrate, int databits, int stopbits,
int parity)
{
uint32_t ctrl, line;
uint32_t baud;
/*
* Zero all settings to make sure the
* UART is disabled and not configured.
*/
ctrl = line = 0x0;
__uart_setreg(bas, UART_CR, ctrl);
/* Now that the UART is disabled we may set up the line */
switch (databits) {
case 7:
line |= LCR_H_WLEN7;
break;
case 6:
line |= LCR_H_WLEN6;
break;
case 8:
default:
line |= LCR_H_WLEN8;
break;
}
if (stopbits == 2)
line |= LCR_H_STP2;
else
line &= ~LCR_H_STP2;
if (parity)
line |= LCR_H_PEN;
else
line &= ~LCR_H_PEN;
line |= LCR_H_FEN;
/* Configure the rest */
ctrl |= (CR_RXE | CR_TXE | CR_UARTEN);
if (bas->rclk != 0 && baudrate != 0) {
baud = bas->rclk * 4 / baudrate;
__uart_setreg(bas, UART_IBRD, ((uint32_t)(baud >> 6)) & IBRD_BDIVINT);
__uart_setreg(bas, UART_FBRD, (uint32_t)(baud & 0x3F) & FBRD_BDIVFRAC);
}
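/*
* Worked example, assuming a hypothetical 24 MHz reference clock at
* 115200 baud: baud = 24000000 * 4 / 115200 = 833, so IBRD is written
* as 833 >> 6 = 13 and FBRD as 833 & 0x3f = 1, i.e. a divisor of
* roughly 13 + 1/64, matching 24e6 / (16 * 115200) = 13.02.
*/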
/* Add config. to line before reenabling UART */
__uart_setreg(bas, UART_LCR_H, (__uart_getreg(bas, UART_LCR_H) &
~0xff) | line);
/* Set rx and tx fifo levels. */
__uart_setreg(bas, UART_IFLS, FIFO_IFLS_BITS);
__uart_setreg(bas, UART_CR, ctrl);
}
static void
uart_pl011_init(struct uart_bas *bas, int baudrate, int databits, int stopbits,
int parity)
{
/* Mask all interrupts */
__uart_setreg(bas, UART_IMSC, __uart_getreg(bas, UART_IMSC) &
~IMSC_MASK_ALL);
uart_pl011_param(bas, baudrate, databits, stopbits, parity);
}
static void
uart_pl011_term(struct uart_bas *bas)
{
}
static void
uart_pl011_putc(struct uart_bas *bas, int c)
{
/* Wait while the TX FIFO is full, then push the character. */
while (__uart_getreg(bas, UART_FR) & FR_TXFF)
;
__uart_setreg(bas, UART_DR, c & 0xff);
}
static int
uart_pl011_rxready(struct uart_bas *bas)
{
return !(__uart_getreg(bas, UART_FR) & FR_RXFE);
}
static int
uart_pl011_getc(struct uart_bas *bas, struct mtx *hwmtx)
{
int c;
while (!uart_pl011_rxready(bas))
;
c = __uart_getreg(bas, UART_DR) & 0xff;
return (c);
}
/*
* High-level UART interface.
*/
struct uart_pl011_softc {
struct uart_softc base;
uint16_t imsc; /* Interrupt mask */
};
static int uart_pl011_bus_attach(struct uart_softc *);
static int uart_pl011_bus_detach(struct uart_softc *);
static int uart_pl011_bus_flush(struct uart_softc *, int);
static int uart_pl011_bus_getsig(struct uart_softc *);
static int uart_pl011_bus_ioctl(struct uart_softc *, int, intptr_t);
static int uart_pl011_bus_ipend(struct uart_softc *);
static int uart_pl011_bus_param(struct uart_softc *, int, int, int, int);
static int uart_pl011_bus_probe(struct uart_softc *);
static int uart_pl011_bus_receive(struct uart_softc *);
static int uart_pl011_bus_setsig(struct uart_softc *, int);
static int uart_pl011_bus_transmit(struct uart_softc *);
static void uart_pl011_bus_grab(struct uart_softc *);
static void uart_pl011_bus_ungrab(struct uart_softc *);
static kobj_method_t uart_pl011_methods[] = {
KOBJMETHOD(uart_attach, uart_pl011_bus_attach),
KOBJMETHOD(uart_detach, uart_pl011_bus_detach),
KOBJMETHOD(uart_flush, uart_pl011_bus_flush),
KOBJMETHOD(uart_getsig, uart_pl011_bus_getsig),
KOBJMETHOD(uart_ioctl, uart_pl011_bus_ioctl),
KOBJMETHOD(uart_ipend, uart_pl011_bus_ipend),
KOBJMETHOD(uart_param, uart_pl011_bus_param),
KOBJMETHOD(uart_probe, uart_pl011_bus_probe),
KOBJMETHOD(uart_receive, uart_pl011_bus_receive),
KOBJMETHOD(uart_setsig, uart_pl011_bus_setsig),
KOBJMETHOD(uart_transmit, uart_pl011_bus_transmit),
KOBJMETHOD(uart_grab, uart_pl011_bus_grab),
KOBJMETHOD(uart_ungrab, uart_pl011_bus_ungrab),
{ 0, 0 }
};
static struct uart_class uart_pl011_class = {
"uart_pl011",
uart_pl011_methods,
sizeof(struct uart_pl011_softc),
.uc_ops = &uart_pl011_ops,
.uc_range = 0x48,
.uc_rclk = 0,
.uc_rshift = 2
};
#ifdef FDT
static struct ofw_compat_data compat_data[] = {
{"arm,pl011", (uintptr_t)&uart_pl011_class},
{NULL, (uintptr_t)NULL},
};
UART_FDT_CLASS_AND_DEVICE(compat_data);
#endif
#ifdef DEV_ACPI
static struct acpi_uart_compat_data acpi_compat_data[] = {
{"ARMH0011", &uart_pl011_class, ACPI_DBG2_ARM_PL011},
{NULL, NULL, 0},
};
UART_ACPI_CLASS_AND_DEVICE(acpi_compat_data);
#endif
static int
uart_pl011_bus_attach(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Enable interrupts */
psc->imsc = (UART_RXREADY | RIS_RTIM | UART_TXEMPTY);
__uart_setreg(bas, UART_IMSC, psc->imsc);
/* Clear interrupts */
__uart_setreg(bas, UART_ICR, IMSC_MASK_ALL);
return (0);
}
static int
uart_pl011_bus_detach(struct uart_softc *sc)
{
return (0);
}
static int
uart_pl011_bus_flush(struct uart_softc *sc, int what)
{
return (0);
}
static int
uart_pl011_bus_getsig(struct uart_softc *sc)
{
return (0);
}
static int
uart_pl011_bus_ioctl(struct uart_softc *sc, int request, intptr_t data)
{
- struct uart_bas *bas;
int error;
- bas = &sc->sc_bas;
error = 0;
uart_lock(sc->sc_hwmtx);
switch (request) {
case UART_IOCTL_BREAK:
break;
case UART_IOCTL_BAUD:
*(int*)data = 115200;
break;
default:
error = EINVAL;
break;
}
uart_unlock(sc->sc_hwmtx);
return (error);
}
static int
uart_pl011_bus_ipend(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
uint32_t ints;
int ipend;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
ints = __uart_getreg(bas, UART_MIS);
ipend = 0;
if (ints & (UART_RXREADY | RIS_RTIM))
ipend |= SER_INT_RXREADY;
if (ints & RIS_BE)
ipend |= SER_INT_BREAK;
if (ints & RIS_OE)
ipend |= SER_INT_OVERRUN;
if (ints & UART_TXEMPTY) {
if (sc->sc_txbusy)
ipend |= SER_INT_TXIDLE;
/* Disable TX interrupt */
__uart_setreg(bas, UART_IMSC, psc->imsc & ~UART_TXEMPTY);
}
uart_unlock(sc->sc_hwmtx);
return (ipend);
}
static int
uart_pl011_bus_param(struct uart_softc *sc, int baudrate, int databits,
int stopbits, int parity)
{
uart_lock(sc->sc_hwmtx);
uart_pl011_param(&sc->sc_bas, baudrate, databits, stopbits, parity);
uart_unlock(sc->sc_hwmtx);
return (0);
}
static int
uart_pl011_bus_probe(struct uart_softc *sc)
{
uint8_t hwrev;
#ifdef FDT
pcell_t node;
uint32_t periphid;
/*
* The FIFO sizes vary depending on hardware; rev 2 and below have 16
* byte FIFOs, rev 3 and up are 32 byte. The hardware rev is in the
* primecell periphid register, but we get a bit of drama, as always,
* with the bcm2835 (rpi), which claims to be rev 3, but has 16 byte
* FIFOs. We check for both the old freebsd-historic and the proper
* bindings-defined compatible strings for bcm2835, and also check the
* workaround the linux drivers use for rpi3, which is to override the
* primecell periphid register value with a property.
*/
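/*
* For instance, with a made-up property value of 0x00241011 the
* revision field is (0x00241011 >> 20) & 0x0f = 2, which selects the
* 16-byte FIFO sizes below.
*/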
if (ofw_bus_is_compatible(sc->sc_dev, "brcm,bcm2835-pl011") ||
ofw_bus_is_compatible(sc->sc_dev, "broadcom,bcm2835-uart")) {
hwrev = 2;
} else {
node = ofw_bus_get_node(sc->sc_dev);
if (OF_getencprop(node, "arm,primecell-periphid", &periphid,
sizeof(periphid)) > 0) {
hwrev = (periphid >> 20) & 0x0f;
} else {
hwrev = __uart_getreg(&sc->sc_bas, UART_PIDREG_2) >> 4;
}
}
#else
hwrev = __uart_getreg(&sc->sc_bas, UART_PIDREG_2) >> 4;
#endif
if (hwrev <= 2) {
sc->sc_rxfifosz = FIFO_RX_SIZE_R2;
sc->sc_txfifosz = FIFO_TX_SIZE_R2;
} else {
sc->sc_rxfifosz = FIFO_RX_SIZE_R3;
sc->sc_txfifosz = FIFO_TX_SIZE_R3;
}
device_set_desc(sc->sc_dev, "PrimeCell UART (PL011)");
return (0);
}
static int
uart_pl011_bus_receive(struct uart_softc *sc)
{
struct uart_bas *bas;
uint32_t ints, xc;
int rx;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
for (;;) {
ints = __uart_getreg(bas, UART_FR);
if (ints & FR_RXFE)
break;
if (uart_rx_full(sc)) {
sc->sc_rxbuf[sc->sc_rxput] = UART_STAT_OVERRUN;
break;
}
xc = __uart_getreg(bas, UART_DR);
rx = xc & 0xff;
if (xc & DR_FE)
rx |= UART_STAT_FRAMERR;
if (xc & DR_PE)
rx |= UART_STAT_PARERR;
uart_rx_put(sc, rx);
}
uart_unlock(sc->sc_hwmtx);
return (0);
}
static int
uart_pl011_bus_setsig(struct uart_softc *sc, int sig)
{
return (0);
}
static int
uart_pl011_bus_transmit(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
int i;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
uart_lock(sc->sc_hwmtx);
for (i = 0; i < sc->sc_txdatasz; i++) {
__uart_setreg(bas, UART_DR, sc->sc_txbuf[i]);
uart_barrier(bas);
}
/* Mark busy and enable TX interrupt */
sc->sc_txbusy = 1;
__uart_setreg(bas, UART_IMSC, psc->imsc);
uart_unlock(sc->sc_hwmtx);
return (0);
}
static void
uart_pl011_bus_grab(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Disable interrupts on switch to polling */
uart_lock(sc->sc_hwmtx);
__uart_setreg(bas, UART_IMSC, psc->imsc & ~IMSC_MASK_ALL);
uart_unlock(sc->sc_hwmtx);
}
static void
uart_pl011_bus_ungrab(struct uart_softc *sc)
{
struct uart_pl011_softc *psc;
struct uart_bas *bas;
psc = (struct uart_pl011_softc *)sc;
bas = &sc->sc_bas;
/* Switch to using interrupts while not grabbed */
uart_lock(sc->sc_hwmtx);
__uart_setreg(bas, UART_IMSC, psc->imsc);
uart_unlock(sc->sc_hwmtx);
}
Index: head/sys/dev/uart/uart_dev_snps.c
===================================================================
--- head/sys/dev/uart/uart_dev_snps.c (revision 327172)
+++ head/sys/dev/uart/uart_dev_snps.c (revision 327173)
@@ -1,285 +1,283 @@
/*-
* Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <dev/uart/uart.h>
#include <dev/uart/uart_bus.h>
#include <dev/uart/uart_cpu_fdt.h>
#include <dev/uart/uart_dev_ns8250.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#ifdef EXT_RESOURCES
#include <dev/extres/clk/clk.h>
#include <dev/extres/hwreset/hwreset.h>
#endif
#include "uart_if.h"
struct snps_softc {
struct ns8250_softc ns8250;
#ifdef EXT_RESOURCES
clk_t baudclk;
clk_t apb_pclk;
hwreset_t reset;
#endif
};
static int
snps_uart_attach(struct uart_softc *uart_sc)
{
struct snps_softc *sc;
sc = (struct snps_softc *)uart_sc;
/* The UART requires the USR register to be read when IIR reports busy */
sc->ns8250.busy_detect = 1;
return (ns8250_bus_attach(uart_sc));
}
static kobj_method_t snps_methods[] = {
KOBJMETHOD(uart_probe, ns8250_bus_probe),
KOBJMETHOD(uart_attach, snps_uart_attach),
KOBJMETHOD(uart_detach, ns8250_bus_detach),
KOBJMETHOD(uart_flush, ns8250_bus_flush),
KOBJMETHOD(uart_getsig, ns8250_bus_getsig),
KOBJMETHOD(uart_ioctl, ns8250_bus_ioctl),
KOBJMETHOD(uart_ipend, ns8250_bus_ipend),
KOBJMETHOD(uart_param, ns8250_bus_param),
KOBJMETHOD(uart_receive, ns8250_bus_receive),
KOBJMETHOD(uart_setsig, ns8250_bus_setsig),
KOBJMETHOD(uart_transmit, ns8250_bus_transmit),
KOBJMETHOD(uart_grab, ns8250_bus_grab),
KOBJMETHOD(uart_ungrab, ns8250_bus_ungrab),
KOBJMETHOD_END
};
struct uart_class uart_snps_class = {
"snps",
snps_methods,
sizeof(struct snps_softc),
.uc_ops = &uart_ns8250_ops,
.uc_range = 8,
.uc_rclk = 0,
};
static struct ofw_compat_data compat_data[] = {
{ "snps,dw-apb-uart", (uintptr_t)&uart_snps_class },
{ NULL, (uintptr_t)NULL }
};
UART_FDT_CLASS(compat_data);
#ifdef EXT_RESOURCES
static int
snps_get_clocks(device_t dev, clk_t *baudclk, clk_t *apb_pclk)
{
- struct snps_softc *sc;
- sc = device_get_softc(dev);
*baudclk = NULL;
*apb_pclk = NULL;
/*
* The baud clock is either named "baudclk", or there is a single
* unnamed clock.
*/
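/*
* A hypothetical node could express this either as
*   clocks = <&baud>;
* or as
*   clocks = <&baud>, <&apb>; clock-names = "baudclk", "apb_pclk";
* and both forms are handled below.
*/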
if (clk_get_by_ofw_name(dev, 0, "baudclk", baudclk) != 0 &&
clk_get_by_ofw_index(dev, 0, 0, baudclk) != 0)
return (ENOENT);
/* APB peripheral clock is optional */
(void)clk_get_by_ofw_name(dev, 0, "apb_pclk", apb_pclk);
return (0);
}
#endif
static int
snps_probe(device_t dev)
{
struct snps_softc *sc;
struct uart_class *uart_class;
phandle_t node;
uint32_t shift, iowidth, clock;
uint64_t freq;
int error;
#ifdef EXT_RESOURCES
clk_t baudclk, apb_pclk;
hwreset_t reset;
#endif
if (!ofw_bus_status_okay(dev))
return (ENXIO);
uart_class = (struct uart_class *)ofw_bus_search_compatible(dev,
compat_data)->ocd_data;
if (uart_class == NULL)
return (ENXIO);
freq = 0;
sc = device_get_softc(dev);
sc->ns8250.base.sc_class = uart_class;
node = ofw_bus_get_node(dev);
if (OF_getencprop(node, "reg-shift", &shift, sizeof(shift)) <= 0)
shift = 0;
if (OF_getencprop(node, "reg-io-width", &iowidth, sizeof(iowidth)) <= 0)
iowidth = 1;
if (OF_getencprop(node, "clock-frequency", &clock, sizeof(clock)) <= 0)
clock = 0;
#ifdef EXT_RESOURCES
if (hwreset_get_by_ofw_idx(dev, 0, 0, &reset) == 0) {
error = hwreset_deassert(reset);
if (error != 0) {
device_printf(dev, "cannot de-assert reset\n");
return (error);
}
}
if (snps_get_clocks(dev, &baudclk, &apb_pclk) == 0) {
error = clk_enable(baudclk);
if (error != 0) {
device_printf(dev, "cannot enable baud clock\n");
return (error);
}
if (apb_pclk != NULL) {
error = clk_enable(apb_pclk);
if (error != 0) {
device_printf(dev,
"cannot enable peripheral clock\n");
return (error);
}
}
if (clock == 0) {
error = clk_get_freq(baudclk, &freq);
if (error != 0) {
device_printf(dev, "cannot get frequency\n");
return (error);
}
clock = (uint32_t)freq;
}
}
#endif
if (bootverbose && clock == 0)
device_printf(dev, "could not determine frequency\n");
error = uart_bus_probe(dev, (int)shift, (int)iowidth, (int)clock, 0, 0);
if (error != 0)
return (error);
#ifdef EXT_RESOURCES
/* XXX uart_bus_probe has changed the softc, so refresh it */
sc = device_get_softc(dev);
/* Store clock and reset handles for detach */
sc->baudclk = baudclk;
sc->apb_pclk = apb_pclk;
sc->reset = reset;
#endif
return (0);
}
static int
snps_detach(device_t dev)
{
#ifdef EXT_RESOURCES
struct snps_softc *sc;
clk_t baudclk, apb_pclk;
hwreset_t reset;
#endif
int error;
#ifdef EXT_RESOURCES
sc = device_get_softc(dev);
baudclk = sc->baudclk;
apb_pclk = sc->apb_pclk;
reset = sc->reset;
#endif
error = uart_bus_detach(dev);
if (error != 0)
return (error);
#ifdef EXT_RESOURCES
if (reset != NULL) {
error = hwreset_assert(reset);
if (error != 0) {
device_printf(dev, "cannot assert reset\n");
return (error);
}
hwreset_release(reset);
}
if (apb_pclk != NULL) {
error = clk_release(apb_pclk);
if (error != 0) {
device_printf(dev, "cannot release peripheral clock\n");
return (error);
}
}
if (baudclk != NULL) {
error = clk_release(baudclk);
if (error != 0) {
device_printf(dev, "cannot release baud clock\n");
return (error);
}
}
#endif
return (0);
}
static device_method_t snps_bus_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, snps_probe),
DEVMETHOD(device_attach, uart_bus_attach),
DEVMETHOD(device_detach, snps_detach),
DEVMETHOD_END
};
static driver_t snps_uart_driver = {
uart_driver_name,
snps_bus_methods,
sizeof(struct snps_softc)
};
DRIVER_MODULE(uart_snps, simplebus, snps_uart_driver, uart_devclass, 0, 0);
Index: head/sys/dev/usb/controller/dwc_otg_fdt.c
===================================================================
--- head/sys/dev/usb/controller/dwc_otg_fdt.c (revision 327172)
+++ head/sys/dev/usb/controller/dwc_otg_fdt.c (revision 327173)
@@ -1,226 +1,225 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/rman.h>
#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <dev/usb/controller/dwc_otg.h>
#include <dev/usb/controller/dwc_otg_fdt.h>
static device_probe_t dwc_otg_probe;
static struct ofw_compat_data compat_data[] = {
{ "synopsys,designware-hs-otg2", 1 },
{ "snps,dwc2", 1 },
{ NULL, 0 }
};
static int
dwc_otg_probe(device_t dev)
{
if (!ofw_bus_status_okay(dev))
return (ENXIO);
if (!ofw_bus_search_compatible(dev, compat_data)->ocd_data)
return (ENXIO);
device_set_desc(dev, "DWC OTG 2.0 integrated USB controller");
return (BUS_PROBE_DEFAULT);
}
int
dwc_otg_attach(device_t dev)
{
struct dwc_otg_fdt_softc *sc = device_get_softc(dev);
char usb_mode[24];
int err;
int rid;
/* initialise some bus fields */
sc->sc_otg.sc_bus.parent = dev;
sc->sc_otg.sc_bus.devices = sc->sc_otg.sc_devices;
sc->sc_otg.sc_bus.devices_max = DWC_OTG_MAX_DEVICES;
sc->sc_otg.sc_bus.dma_bits = 32;
/* get USB mode, if any */
if (OF_getprop(ofw_bus_get_node(dev), "dr_mode",
&usb_mode, sizeof(usb_mode)) > 0) {
/* ensure proper zero termination */
usb_mode[sizeof(usb_mode) - 1] = 0;
if (strcasecmp(usb_mode, "host") == 0)
sc->sc_otg.sc_mode = DWC_MODE_HOST;
else if (strcasecmp(usb_mode, "peripheral") == 0)
sc->sc_otg.sc_mode = DWC_MODE_DEVICE;
else if (strcasecmp(usb_mode, "otg") != 0) {
device_printf(dev, "Invalid FDT dr_mode: %s\n",
usb_mode);
}
}
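/*
 * If the FDT omits "dr_mode" or sets it to "otg", sc_mode is left at
 * its zeroed softc default, which selects the driver's dual-role
 * (OTG) operation.
 */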
/* get all DMA memory */
if (usb_bus_mem_alloc_all(&sc->sc_otg.sc_bus,
USB_GET_DMA_TAG(dev), NULL)) {
return (ENOMEM);
}
rid = 0;
sc->sc_otg.sc_io_res =
bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (!(sc->sc_otg.sc_io_res)) {
err = ENOMEM;
goto error;
}
sc->sc_otg.sc_io_tag = rman_get_bustag(sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_hdl = rman_get_bushandle(sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_size = rman_get_size(sc->sc_otg.sc_io_res);
/*
* the brcm,bcm2708-usb FDT node provides two interrupts;
* we need only the second one (VC_USB)
*/
rid = ofw_bus_is_compatible(dev, "brcm,bcm2708-usb") ? 1 : 0;
sc->sc_otg.sc_irq_res =
bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE);
if (sc->sc_otg.sc_irq_res == NULL)
goto error;
sc->sc_otg.sc_bus.bdev = device_add_child(dev, "usbus", -1);
if (sc->sc_otg.sc_bus.bdev == NULL)
goto error;
device_set_ivars(sc->sc_otg.sc_bus.bdev, &sc->sc_otg.sc_bus);
err = bus_setup_intr(dev, sc->sc_otg.sc_irq_res, INTR_TYPE_TTY | INTR_MPSAFE,
&dwc_otg_filter_interrupt, &dwc_otg_interrupt, sc, &sc->sc_otg.sc_intr_hdl);
if (err) {
sc->sc_otg.sc_intr_hdl = NULL;
goto error;
}
err = dwc_otg_init(&sc->sc_otg);
if (err == 0) {
err = device_probe_and_attach(sc->sc_otg.sc_bus.bdev);
}
if (err)
goto error;
return (0);
error:
dwc_otg_detach(dev);
return (ENXIO);
}
int
dwc_otg_detach(device_t dev)
{
struct dwc_otg_fdt_softc *sc = device_get_softc(dev);
- int err;
/* during module unload there are lots of leftover children */
device_delete_children(dev);
if (sc->sc_otg.sc_irq_res && sc->sc_otg.sc_intr_hdl) {
/*
* only call dwc_otg_uninit() after dwc_otg_init()
*/
dwc_otg_uninit(&sc->sc_otg);
- err = bus_teardown_intr(dev, sc->sc_otg.sc_irq_res,
+ bus_teardown_intr(dev, sc->sc_otg.sc_irq_res,
sc->sc_otg.sc_intr_hdl);
sc->sc_otg.sc_intr_hdl = NULL;
}
/* free IRQ channel, if any */
if (sc->sc_otg.sc_irq_res) {
bus_release_resource(dev, SYS_RES_IRQ, 0,
sc->sc_otg.sc_irq_res);
sc->sc_otg.sc_irq_res = NULL;
}
/* free memory resource, if any */
if (sc->sc_otg.sc_io_res) {
bus_release_resource(dev, SYS_RES_MEMORY, 0,
sc->sc_otg.sc_io_res);
sc->sc_otg.sc_io_res = NULL;
}
usb_bus_mem_free_all(&sc->sc_otg.sc_bus, NULL);
return (0);
}
static device_method_t dwc_otg_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, dwc_otg_probe),
DEVMETHOD(device_attach, dwc_otg_attach),
DEVMETHOD(device_detach, dwc_otg_detach),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, bus_generic_resume),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD_END
};
driver_t dwc_otg_driver = {
.name = "dwcotg",
.methods = dwc_otg_methods,
.size = sizeof(struct dwc_otg_fdt_softc),
};
static devclass_t dwc_otg_devclass;
DRIVER_MODULE(dwcotg, simplebus, dwc_otg_driver, dwc_otg_devclass, 0, 0);
MODULE_DEPEND(dwcotg, usb, 1, 1, 1);
Index: head/sys/dev/usb/controller/ehci.c
===================================================================
--- head/sys/dev/usb/controller/ehci.c (revision 327172)
+++ head/sys/dev/usb/controller/ehci.c (revision 327173)
@@ -1,3975 +1,3971 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
* Copyright (c) 2004 The NetBSD Foundation, Inc. All rights reserved.
* Copyright (c) 2004 Lennart Augustsson. All rights reserved.
* Copyright (c) 2004 Charles M. Hannum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB Enhanced Host Controller Driver, a.k.a. USB 2.0 controller.
*
* The EHCI 0.96 spec can be found at
* http://developer.intel.com/technology/usb/download/ehci-r096.pdf
* The EHCI 1.0 spec can be found at
* http://developer.intel.com/technology/usb/download/ehci-r10.pdf
* and the USB 2.0 spec at
* http://www.usb.org/developers/docs/usb_20.zip
*
*/
/*
* TODO:
* 1) command failures are not recovered correctly
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR ehcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/ehci.h>
#include <dev/usb/controller/ehcireg.h>
#define EHCI_BUS2SC(bus) \
((ehci_softc_t *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((ehci_softc_t *)0)->sc_bus))))
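/*
 * EHCI_BUS2SC() recovers the softc from a pointer to its embedded
 * "sc_bus" member; it is, in effect, the container-of idiom:
 *
 * (ehci_softc_t *)((char *)(bus) - offsetof(ehci_softc_t, sc_bus))
 */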
#ifdef USB_DEBUG
static int ehcidebug = 0;
static int ehcinohighspeed = 0;
static int ehciiaadbug = 0;
static int ehcilostintrbug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, ehci, CTLFLAG_RW, 0, "USB ehci");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, debug, CTLFLAG_RWTUN,
&ehcidebug, 0, "Debug level");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, no_hs, CTLFLAG_RWTUN,
&ehcinohighspeed, 0, "Disable High Speed USB");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, iaadbug, CTLFLAG_RWTUN,
&ehciiaadbug, 0, "Enable doorbell bug workaround");
SYSCTL_INT(_hw_usb_ehci, OID_AUTO, lostintrbug, CTLFLAG_RWTUN,
&ehcilostintrbug, 0, "Enable lost interrupt bug workaround");
static void ehci_dump_regs(ehci_softc_t *sc);
static void ehci_dump_sqh(ehci_softc_t *sc, ehci_qh_t *sqh);
#endif
#define EHCI_INTR_ENDPT 1
static const struct usb_bus_methods ehci_bus_methods;
static const struct usb_pipe_methods ehci_device_bulk_methods;
static const struct usb_pipe_methods ehci_device_ctrl_methods;
static const struct usb_pipe_methods ehci_device_intr_methods;
static const struct usb_pipe_methods ehci_device_isoc_fs_methods;
static const struct usb_pipe_methods ehci_device_isoc_hs_methods;
static void ehci_do_poll(struct usb_bus *);
static void ehci_device_done(struct usb_xfer *, usb_error_t);
static uint8_t ehci_check_transfer(struct usb_xfer *);
static void ehci_timeout(void *);
static void ehci_poll_timeout(void *);
static void ehci_root_intr(ehci_softc_t *sc);
struct ehci_std_temp {
ehci_softc_t *sc;
struct usb_page_cache *pc;
ehci_qtd_t *td;
ehci_qtd_t *td_next;
uint32_t average;
uint32_t qtd_status;
uint32_t len;
uint16_t max_frame_size;
uint8_t shortpkt;
uint8_t auto_data_toggle;
uint8_t setup_alt_next;
uint8_t last_frame;
};
void
ehci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
uint32_t i;
cb(bus, &sc->sc_hw.pframes_pc, &sc->sc_hw.pframes_pg,
sizeof(uint32_t) * EHCI_FRAMELIST_COUNT, EHCI_FRAMELIST_ALIGN);
cb(bus, &sc->sc_hw.terminate_pc, &sc->sc_hw.terminate_pg,
sizeof(struct ehci_qh_sub), EHCI_QH_ALIGN);
cb(bus, &sc->sc_hw.async_start_pc, &sc->sc_hw.async_start_pg,
sizeof(ehci_qh_t), EHCI_QH_ALIGN);
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.intr_start_pc + i,
sc->sc_hw.intr_start_pg + i,
sizeof(ehci_qh_t), EHCI_QH_ALIGN);
}
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.isoc_hs_start_pc + i,
sc->sc_hw.isoc_hs_start_pg + i,
sizeof(ehci_itd_t), EHCI_ITD_ALIGN);
}
for (i = 0; i != EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
cb(bus, sc->sc_hw.isoc_fs_start_pc + i,
sc->sc_hw.isoc_fs_start_pg + i,
sizeof(ehci_sitd_t), EHCI_SITD_ALIGN);
}
}
usb_error_t
ehci_reset(ehci_softc_t *sc)
{
uint32_t hcr;
int i;
EOWRITE4(sc, EHCI_USBCMD, EHCI_CMD_HCRESET);
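/*
 * Poll for HCRESET to self-clear; at 100 iterations of hz/128 ticks
 * this gives the controller roughly 0.8 seconds before the reset is
 * reported as timed out.
 */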
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBCMD) & EHCI_CMD_HCRESET;
if (!hcr) {
if (sc->sc_vendor_post_reset != NULL)
sc->sc_vendor_post_reset(sc);
return (0);
}
}
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (USB_ERR_IOERROR);
}
static usb_error_t
ehci_hcreset(ehci_softc_t *sc)
{
uint32_t hcr;
int i;
EOWRITE4(sc, EHCI_USBCMD, 0); /* Halt controller */
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBSTS) & EHCI_STS_HCH;
if (hcr)
break;
}
if (!hcr)
/*
* Fall through and try reset anyway even though
* Table 2-9 in the EHCI spec says this will result
* in undefined behavior.
*/
device_printf(sc->sc_bus.bdev, "stop timeout\n");
return (ehci_reset(sc));
}
static int
ehci_init_sub(struct ehci_softc *sc)
{
struct usb_page_search buf_res;
uint32_t cparams;
uint32_t hcr;
uint8_t i;
cparams = EREAD4(sc, EHCI_HCCPARAMS);
DPRINTF("cparams=0x%x\n", cparams);
if (EHCI_HCC_64BIT(cparams)) {
DPRINTF("HCC uses 64-bit structures\n");
/* MUST clear segment register if 64 bit capable */
EOWRITE4(sc, EHCI_CTRLDSSEGMENT, 0);
}
usbd_get_page(&sc->sc_hw.pframes_pc, 0, &buf_res);
EOWRITE4(sc, EHCI_PERIODICLISTBASE, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.async_start_pc, 0, &buf_res);
EOWRITE4(sc, EHCI_ASYNCLISTADDR, buf_res.physaddr | EHCI_LINK_QH);
/* enable interrupts */
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
/* turn on controller */
EOWRITE4(sc, EHCI_USBCMD,
EHCI_CMD_ITC_1 | /* 1 microframe interrupt delay */
(EOREAD4(sc, EHCI_USBCMD) & EHCI_CMD_FLS_M) |
EHCI_CMD_ASE |
EHCI_CMD_PSE |
EHCI_CMD_RS);
/* Take over port ownership */
EOWRITE4(sc, EHCI_CONFIGFLAG, EHCI_CONF_CF);
for (i = 0; i < 100; i++) {
usb_pause_mtx(NULL, hz / 128);
hcr = EOREAD4(sc, EHCI_USBSTS) & EHCI_STS_HCH;
if (!hcr) {
break;
}
}
if (hcr) {
device_printf(sc->sc_bus.bdev, "run timeout\n");
return (USB_ERR_IOERROR);
}
return (USB_ERR_NORMAL_COMPLETION);
}
usb_error_t
ehci_init(ehci_softc_t *sc)
{
struct usb_page_search buf_res;
uint32_t version;
uint32_t sparams;
uint16_t i;
uint16_t x;
uint16_t y;
uint16_t bit;
usb_error_t err = 0;
DPRINTF("start\n");
usb_callout_init_mtx(&sc->sc_tmo_pcd, &sc->sc_bus.bus_mtx, 0);
usb_callout_init_mtx(&sc->sc_tmo_poll, &sc->sc_bus.bus_mtx, 0);
sc->sc_offs = EHCI_CAPLENGTH(EREAD4(sc, EHCI_CAPLEN_HCIVERSION));
#ifdef USB_DEBUG
if (ehciiaadbug)
sc->sc_flags |= EHCI_SCFLG_IAADBUG;
if (ehcilostintrbug)
sc->sc_flags |= EHCI_SCFLG_LOSTINTRBUG;
if (ehcidebug > 2) {
ehci_dump_regs(sc);
}
#endif
version = EHCI_HCIVERSION(EREAD4(sc, EHCI_CAPLEN_HCIVERSION));
device_printf(sc->sc_bus.bdev, "EHCI version %x.%x\n",
version >> 8, version & 0xff);
sparams = EREAD4(sc, EHCI_HCSPARAMS);
DPRINTF("sparams=0x%x\n", sparams);
sc->sc_noport = EHCI_HCS_N_PORTS(sparams);
sc->sc_bus.usbrev = USB_REV_2_0;
if (!(sc->sc_flags & EHCI_SCFLG_DONTRESET)) {
/* Reset the controller */
DPRINTF("%s: resetting\n",
device_get_nameunit(sc->sc_bus.bdev));
err = ehci_hcreset(sc);
if (err) {
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (err);
}
}
/*
* use the current frame-list-size selection:
* 0: 1024*4 bytes, 1: 512*4 bytes, 2: 256*4 bytes, 3: unknown
*/
if (EHCI_CMD_FLS(EOREAD4(sc, EHCI_USBCMD)) == 3) {
device_printf(sc->sc_bus.bdev, "invalid frame-list-size\n");
return (USB_ERR_IOERROR);
}
/* set up the bus struct */
sc->sc_bus.methods = &ehci_bus_methods;
sc->sc_eintrs = EHCI_NORMAL_INTRS;
if (1) {
struct ehci_qh_sub *qh;
usbd_get_page(&sc->sc_hw.terminate_pc, 0, &buf_res);
qh = buf_res.buffer;
sc->sc_terminate_self = htohc32(sc, buf_res.physaddr);
/* init terminate TD */
qh->qtd_next =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qtd_altnext =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qtd_status =
htohc32(sc, EHCI_QTD_HALTED);
}
for (i = 0; i < EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
ehci_qh_t *qh;
usbd_get_page(sc->sc_hw.intr_start_pc + i, 0, &buf_res);
qh = buf_res.buffer;
/* initialize page cache pointer */
qh->page_cache = sc->sc_hw.intr_start_pc + i;
/* store a pointer to queue head */
sc->sc_intr_p_last[i] = qh;
qh->qh_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_QH);
qh->qh_endp =
htohc32(sc, EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH));
qh->qh_endphub =
htohc32(sc, EHCI_QH_SET_MULT(1));
qh->qh_curqtd = 0;
qh->qh_qtd.qtd_next =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_altnext =
htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_status =
htohc32(sc, EHCI_QTD_HALTED);
}
/*
* the QHs are arranged to give poll intervals that are
* powers of 2 times 1ms
*/
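/*
 * Each pass of the loop below links every QH whose index has "bit"
 * set to a QH in the next smaller bucket (y = (x ^ bit) | (bit / 2)),
 * so the interrupt QHs form a binary tree: the longest-interval QHs
 * feed into the half-interval ones, and everything converges on
 * sc_intr_p_last[0], the 1ms QH that terminates the schedule.
 */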
bit = EHCI_VIRTUAL_FRAMELIST_COUNT / 2;
while (bit) {
x = bit;
while (x & bit) {
ehci_qh_t *qh_x;
ehci_qh_t *qh_y;
y = (x ^ bit) | (bit / 2);
qh_x = sc->sc_intr_p_last[x];
qh_y = sc->sc_intr_p_last[y];
/*
* the next QH has half the poll interval
*/
qh_x->qh_link = qh_y->qh_self;
x++;
}
bit >>= 1;
}
if (1) {
ehci_qh_t *qh;
qh = sc->sc_intr_p_last[0];
/* the last (1ms) QH terminates */
qh->qh_link = htohc32(sc, EHCI_LINK_TERMINATE);
}
for (i = 0; i < EHCI_VIRTUAL_FRAMELIST_COUNT; i++) {
ehci_sitd_t *sitd;
ehci_itd_t *itd;
usbd_get_page(sc->sc_hw.isoc_fs_start_pc + i, 0, &buf_res);
sitd = buf_res.buffer;
/* initialize page cache pointer */
sitd->page_cache = sc->sc_hw.isoc_fs_start_pc + i;
/* store a pointer to the transfer descriptor */
sc->sc_isoc_fs_p_last[i] = sitd;
/* initialize full speed isochronous */
sitd->sitd_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_SITD);
sitd->sitd_back =
htohc32(sc, EHCI_LINK_TERMINATE);
sitd->sitd_next =
sc->sc_intr_p_last[i | (EHCI_VIRTUAL_FRAMELIST_COUNT / 2)]->qh_self;
usbd_get_page(sc->sc_hw.isoc_hs_start_pc + i, 0, &buf_res);
itd = buf_res.buffer;
/* initialize page cache pointer */
itd->page_cache = sc->sc_hw.isoc_hs_start_pc + i;
/* store a pointer to the transfer descriptor */
sc->sc_isoc_hs_p_last[i] = itd;
/* initialize high speed isochronous */
itd->itd_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_ITD);
itd->itd_next =
sitd->sitd_self;
}
usbd_get_page(&sc->sc_hw.pframes_pc, 0, &buf_res);
if (1) {
uint32_t *pframes;
pframes = buf_res.buffer;
/*
* execution order:
* pframes -> high speed isochronous ->
* full speed isochronous -> interrupt QH's
*/
for (i = 0; i < EHCI_FRAMELIST_COUNT; i++) {
pframes[i] = sc->sc_isoc_hs_p_last
[i & (EHCI_VIRTUAL_FRAMELIST_COUNT - 1)]->itd_self;
}
}
usbd_get_page(&sc->sc_hw.async_start_pc, 0, &buf_res);
if (1) {
ehci_qh_t *qh;
qh = buf_res.buffer;
/* initialize page cache pointer */
qh->page_cache = &sc->sc_hw.async_start_pc;
/* store a pointer to the queue head */
sc->sc_async_p_last = qh;
/* init dummy QH that starts the async list */
qh->qh_self =
htohc32(sc, buf_res.physaddr) |
htohc32(sc, EHCI_LINK_QH);
/* fill the QH */
qh->qh_endp =
htohc32(sc, EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH) | EHCI_QH_HRECL);
qh->qh_endphub = htohc32(sc, EHCI_QH_SET_MULT(1));
qh->qh_link = qh->qh_self;
qh->qh_curqtd = 0;
/* fill the overlay qTD */
qh->qh_qtd.qtd_next = htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_altnext = htohc32(sc, EHCI_LINK_TERMINATE);
qh->qh_qtd.qtd_status = htohc32(sc, EHCI_QTD_HALTED);
}
/* flush all cache into memory */
usb_bus_mem_flush_all(&sc->sc_bus, &ehci_iterate_hw_softc);
#ifdef USB_DEBUG
if (ehcidebug) {
ehci_dump_sqh(sc, sc->sc_async_p_last);
}
#endif
/* final setup */
err = ehci_init_sub(sc);
if (!err) {
/* catch any lost interrupts */
ehci_do_poll(&sc->sc_bus);
}
return (err);
}
/*
* shut down the controller when the system is going down
*/
void
ehci_detach(ehci_softc_t *sc)
{
USB_BUS_LOCK(&sc->sc_bus);
usb_callout_stop(&sc->sc_tmo_pcd);
usb_callout_stop(&sc->sc_tmo_poll);
EOWRITE4(sc, EHCI_USBINTR, 0);
USB_BUS_UNLOCK(&sc->sc_bus);
if (ehci_hcreset(sc)) {
DPRINTF("reset failed!\n");
}
/* XXX let stray task complete */
usb_pause_mtx(NULL, hz / 20);
usb_callout_drain(&sc->sc_tmo_pcd);
usb_callout_drain(&sc->sc_tmo_poll);
}
static void
ehci_suspend(ehci_softc_t *sc)
{
DPRINTF("stopping the HC\n");
/* reset HC */
ehci_hcreset(sc);
}
static void
ehci_resume(ehci_softc_t *sc)
{
/* reset HC */
ehci_hcreset(sc);
/* setup HC */
ehci_init_sub(sc);
/* catch any lost interrupts */
ehci_do_poll(&sc->sc_bus);
}
#ifdef USB_DEBUG
static void
ehci_dump_regs(ehci_softc_t *sc)
{
uint32_t i;
i = EOREAD4(sc, EHCI_USBCMD);
printf("cmd=0x%08x\n", i);
if (i & EHCI_CMD_ITC_1)
printf(" EHCI_CMD_ITC_1\n");
if (i & EHCI_CMD_ITC_2)
printf(" EHCI_CMD_ITC_2\n");
if (i & EHCI_CMD_ITC_4)
printf(" EHCI_CMD_ITC_4\n");
if (i & EHCI_CMD_ITC_8)
printf(" EHCI_CMD_ITC_8\n");
if (i & EHCI_CMD_ITC_16)
printf(" EHCI_CMD_ITC_16\n");
if (i & EHCI_CMD_ITC_32)
printf(" EHCI_CMD_ITC_32\n");
if (i & EHCI_CMD_ITC_64)
printf(" EHCI_CMD_ITC_64\n");
if (i & EHCI_CMD_ASPME)
printf(" EHCI_CMD_ASPME\n");
if (i & EHCI_CMD_ASPMC)
printf(" EHCI_CMD_ASPMC\n");
if (i & EHCI_CMD_LHCR)
printf(" EHCI_CMD_LHCR\n");
if (i & EHCI_CMD_IAAD)
printf(" EHCI_CMD_IAAD\n");
if (i & EHCI_CMD_ASE)
printf(" EHCI_CMD_ASE\n");
if (i & EHCI_CMD_PSE)
printf(" EHCI_CMD_PSE\n");
if (i & EHCI_CMD_FLS_M)
printf(" EHCI_CMD_FLS_M\n");
if (i & EHCI_CMD_HCRESET)
printf(" EHCI_CMD_HCRESET\n");
if (i & EHCI_CMD_RS)
printf(" EHCI_CMD_RS\n");
i = EOREAD4(sc, EHCI_USBSTS);
printf("sts=0x%08x\n", i);
if (i & EHCI_STS_ASS)
printf(" EHCI_STS_ASS\n");
if (i & EHCI_STS_PSS)
printf(" EHCI_STS_PSS\n");
if (i & EHCI_STS_REC)
printf(" EHCI_STS_REC\n");
if (i & EHCI_STS_HCH)
printf(" EHCI_STS_HCH\n");
if (i & EHCI_STS_IAA)
printf(" EHCI_STS_IAA\n");
if (i & EHCI_STS_HSE)
printf(" EHCI_STS_HSE\n");
if (i & EHCI_STS_FLR)
printf(" EHCI_STS_FLR\n");
if (i & EHCI_STS_PCD)
printf(" EHCI_STS_PCD\n");
if (i & EHCI_STS_ERRINT)
printf(" EHCI_STS_ERRINT\n");
if (i & EHCI_STS_INT)
printf(" EHCI_STS_INT\n");
printf("ien=0x%08x\n",
EOREAD4(sc, EHCI_USBINTR));
printf("frindex=0x%08x ctrdsegm=0x%08x periodic=0x%08x async=0x%08x\n",
EOREAD4(sc, EHCI_FRINDEX),
EOREAD4(sc, EHCI_CTRLDSSEGMENT),
EOREAD4(sc, EHCI_PERIODICLISTBASE),
EOREAD4(sc, EHCI_ASYNCLISTADDR));
for (i = 1; i <= sc->sc_noport; i++) {
printf("port %d status=0x%08x\n", i,
EOREAD4(sc, EHCI_PORTSC(i)));
}
}
static void
ehci_dump_link(ehci_softc_t *sc, uint32_t link, int type)
{
link = hc32toh(sc, link);
printf("0x%08x", link);
if (link & EHCI_LINK_TERMINATE)
printf("<T>");
else {
printf("<");
if (type) {
switch (EHCI_LINK_TYPE(link)) {
case EHCI_LINK_ITD:
printf("ITD");
break;
case EHCI_LINK_QH:
printf("QH");
break;
case EHCI_LINK_SITD:
printf("SITD");
break;
case EHCI_LINK_FSTN:
printf("FSTN");
break;
}
}
printf(">");
}
}
static void
ehci_dump_qtd(ehci_softc_t *sc, ehci_qtd_t *qtd)
{
uint32_t s;
printf(" next=");
ehci_dump_link(sc, qtd->qtd_next, 0);
printf(" altnext=");
ehci_dump_link(sc, qtd->qtd_altnext, 0);
printf("\n");
s = hc32toh(sc, qtd->qtd_status);
printf(" status=0x%08x: toggle=%d bytes=0x%x ioc=%d c_page=0x%x\n",
s, EHCI_QTD_GET_TOGGLE(s), EHCI_QTD_GET_BYTES(s),
EHCI_QTD_GET_IOC(s), EHCI_QTD_GET_C_PAGE(s));
printf(" cerr=%d pid=%d stat=%s%s%s%s%s%s%s%s\n",
EHCI_QTD_GET_CERR(s), EHCI_QTD_GET_PID(s),
(s & EHCI_QTD_ACTIVE) ? "ACTIVE" : "NOT_ACTIVE",
(s & EHCI_QTD_HALTED) ? "-HALTED" : "",
(s & EHCI_QTD_BUFERR) ? "-BUFERR" : "",
(s & EHCI_QTD_BABBLE) ? "-BABBLE" : "",
(s & EHCI_QTD_XACTERR) ? "-XACTERR" : "",
(s & EHCI_QTD_MISSEDMICRO) ? "-MISSED" : "",
(s & EHCI_QTD_SPLITXSTATE) ? "-SPLIT" : "",
(s & EHCI_QTD_PINGSTATE) ? "-PING" : "");
for (s = 0; s < 5; s++) {
printf(" buffer[%d]=0x%08x\n", s,
hc32toh(sc, qtd->qtd_buffer[s]));
}
for (s = 0; s < 5; s++) {
printf(" buffer_hi[%d]=0x%08x\n", s,
hc32toh(sc, qtd->qtd_buffer_hi[s]));
}
}
static uint8_t
ehci_dump_sqtd(ehci_softc_t *sc, ehci_qtd_t *sqtd)
{
uint8_t temp;
usb_pc_cpu_invalidate(sqtd->page_cache);
printf("QTD(%p) at 0x%08x:\n", sqtd, hc32toh(sc, sqtd->qtd_self));
ehci_dump_qtd(sc, sqtd);
temp = (sqtd->qtd_next & htohc32(sc, EHCI_LINK_TERMINATE)) ? 1 : 0;
return (temp);
}
static void
ehci_dump_sqtds(ehci_softc_t *sc, ehci_qtd_t *sqtd)
{
uint16_t i;
uint8_t stop;
stop = 0;
for (i = 0; sqtd && (i < 20) && !stop; sqtd = sqtd->obj_next, i++) {
stop = ehci_dump_sqtd(sc, sqtd);
}
if (sqtd) {
printf("dump aborted, too many TDs\n");
}
}
static void
ehci_dump_sqh(ehci_softc_t *sc, ehci_qh_t *qh)
{
uint32_t endp;
uint32_t endphub;
usb_pc_cpu_invalidate(qh->page_cache);
printf("QH(%p) at 0x%08x:\n", qh, hc32toh(sc, qh->qh_self) & ~0x1F);
printf(" link=");
ehci_dump_link(sc, qh->qh_link, 1);
printf("\n");
endp = hc32toh(sc, qh->qh_endp);
printf(" endp=0x%08x\n", endp);
printf(" addr=0x%02x inact=%d endpt=%d eps=%d dtc=%d hrecl=%d\n",
EHCI_QH_GET_ADDR(endp), EHCI_QH_GET_INACT(endp),
EHCI_QH_GET_ENDPT(endp), EHCI_QH_GET_EPS(endp),
EHCI_QH_GET_DTC(endp), EHCI_QH_GET_HRECL(endp));
printf(" mpl=0x%x ctl=%d nrl=%d\n",
EHCI_QH_GET_MPL(endp), EHCI_QH_GET_CTL(endp),
EHCI_QH_GET_NRL(endp));
endphub = hc32toh(sc, qh->qh_endphub);
printf(" endphub=0x%08x\n", endphub);
printf(" smask=0x%02x cmask=0x%02x huba=0x%02x port=%d mult=%d\n",
EHCI_QH_GET_SMASK(endphub), EHCI_QH_GET_CMASK(endphub),
EHCI_QH_GET_HUBA(endphub), EHCI_QH_GET_PORT(endphub),
EHCI_QH_GET_MULT(endphub));
printf(" curqtd=");
ehci_dump_link(sc, qh->qh_curqtd, 0);
printf("\n");
printf("Overlay qTD:\n");
ehci_dump_qtd(sc, (void *)&qh->qh_qtd);
}
static void
ehci_dump_sitd(ehci_softc_t *sc, ehci_sitd_t *sitd)
{
usb_pc_cpu_invalidate(sitd->page_cache);
printf("SITD(%p) at 0x%08x\n", sitd, hc32toh(sc, sitd->sitd_self) & ~0x1F);
printf(" next=0x%08x\n", hc32toh(sc, sitd->sitd_next));
printf(" portaddr=0x%08x dir=%s addr=%d endpt=0x%x port=0x%x huba=0x%x\n",
hc32toh(sc, sitd->sitd_portaddr),
(sitd->sitd_portaddr & htohc32(sc, EHCI_SITD_SET_DIR_IN))
? "in" : "out",
EHCI_SITD_GET_ADDR(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_ENDPT(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_PORT(hc32toh(sc, sitd->sitd_portaddr)),
EHCI_SITD_GET_HUBA(hc32toh(sc, sitd->sitd_portaddr)));
printf(" mask=0x%08x\n", hc32toh(sc, sitd->sitd_mask));
printf(" status=0x%08x <%s> len=0x%x\n", hc32toh(sc, sitd->sitd_status),
(sitd->sitd_status & htohc32(sc, EHCI_SITD_ACTIVE)) ? "ACTIVE" : "",
EHCI_SITD_GET_LEN(hc32toh(sc, sitd->sitd_status)));
printf(" back=0x%08x, bp=0x%08x,0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, sitd->sitd_back),
hc32toh(sc, sitd->sitd_bp[0]),
hc32toh(sc, sitd->sitd_bp[1]),
hc32toh(sc, sitd->sitd_bp_hi[0]),
hc32toh(sc, sitd->sitd_bp_hi[1]));
}
static void
ehci_dump_itd(ehci_softc_t *sc, ehci_itd_t *itd)
{
usb_pc_cpu_invalidate(itd->page_cache);
printf("ITD(%p) at 0x%08x\n", itd, hc32toh(sc, itd->itd_self) & ~0x1F);
printf(" next=0x%08x\n", hc32toh(sc, itd->itd_next));
printf(" status[0]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[0]),
(itd->itd_status[0] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[1]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[1]),
(itd->itd_status[1] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[2]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[2]),
(itd->itd_status[2] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[3]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[3]),
(itd->itd_status[3] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[4]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[4]),
(itd->itd_status[4] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[5]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[5]),
(itd->itd_status[5] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[6]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[6]),
(itd->itd_status[6] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" status[7]=0x%08x; <%s>\n", hc32toh(sc, itd->itd_status[7]),
(itd->itd_status[7] & htohc32(sc, EHCI_ITD_ACTIVE)) ? "ACTIVE" : "");
printf(" bp[0]=0x%08x\n", hc32toh(sc, itd->itd_bp[0]));
printf(" addr=0x%02x; endpt=0x%01x\n",
EHCI_ITD_GET_ADDR(hc32toh(sc, itd->itd_bp[0])),
EHCI_ITD_GET_ENDPT(hc32toh(sc, itd->itd_bp[0])));
printf(" bp[1]=0x%08x\n", hc32toh(sc, itd->itd_bp[1]));
printf(" dir=%s; mpl=0x%02x\n",
(hc32toh(sc, itd->itd_bp[1]) & EHCI_ITD_SET_DIR_IN) ? "in" : "out",
EHCI_ITD_GET_MPL(hc32toh(sc, itd->itd_bp[1])));
printf(" bp[2..6]=0x%08x,0x%08x,0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, itd->itd_bp[2]),
hc32toh(sc, itd->itd_bp[3]),
hc32toh(sc, itd->itd_bp[4]),
hc32toh(sc, itd->itd_bp[5]),
hc32toh(sc, itd->itd_bp[6]));
printf(" bp_hi=0x%08x,0x%08x,0x%08x,0x%08x,\n"
" 0x%08x,0x%08x,0x%08x\n",
hc32toh(sc, itd->itd_bp_hi[0]),
hc32toh(sc, itd->itd_bp_hi[1]),
hc32toh(sc, itd->itd_bp_hi[2]),
hc32toh(sc, itd->itd_bp_hi[3]),
hc32toh(sc, itd->itd_bp_hi[4]),
hc32toh(sc, itd->itd_bp_hi[5]),
hc32toh(sc, itd->itd_bp_hi[6]));
}
static void
ehci_dump_isoc(ehci_softc_t *sc)
{
ehci_itd_t *itd;
ehci_sitd_t *sitd;
uint16_t max = 1000;
uint16_t pos;
pos = (EOREAD4(sc, EHCI_FRINDEX) / 8) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
printf("%s: isochronous dump from frame 0x%03x:\n",
__FUNCTION__, pos);
itd = sc->sc_isoc_hs_p_last[pos];
sitd = sc->sc_isoc_fs_p_last[pos];
while (itd && max && max--) {
ehci_dump_itd(sc, itd);
itd = itd->prev;
}
while (sitd && max && max--) {
ehci_dump_sitd(sc, sitd);
sitd = sitd->prev;
}
}
#endif
static void
ehci_transfer_intr_enqueue(struct usb_xfer *xfer)
{
/* check for early completion */
if (ehci_check_transfer(xfer)) {
return;
}
/* put transfer on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0) {
usbd_transfer_timeout_ms(xfer, &ehci_timeout, xfer->timeout);
}
}
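/*
 * The _ehci_append_*() and _ehci_remove_*() helpers below maintain two
 * parallel lists: the software doubly linked list ("next"/"prev") used
 * for traversal, and the physical-address links (sitd_next, itd_next,
 * qh_link) followed by the host controller. A new element is fully
 * written and flushed before the predecessor's hardware link is made
 * to point at it.
 */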
#define EHCI_APPEND_FS_TD(std,last) (last) = _ehci_append_fs_td(std,last)
static ehci_sitd_t *
_ehci_append_fs_td(ehci_sitd_t *std, ehci_sitd_t *last)
{
DPRINTFN(11, "%p to %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->next = last->next;
std->sitd_next = last->sitd_next;
std->prev = last;
usb_pc_cpu_flush(std->page_cache);
/*
* the last->next->prev is never followed: std->next->prev = std;
*/
last->next = std;
last->sitd_next = std->sitd_self;
usb_pc_cpu_flush(last->page_cache);
return (std);
}
#define EHCI_APPEND_HS_TD(std,last) (last) = _ehci_append_hs_td(std,last)
static ehci_itd_t *
_ehci_append_hs_td(ehci_itd_t *std, ehci_itd_t *last)
{
DPRINTFN(11, "%p to %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->next = last->next;
std->itd_next = last->itd_next;
std->prev = last;
usb_pc_cpu_flush(std->page_cache);
/*
* the last->next->prev is never followed: std->next->prev = std;
*/
last->next = std;
last->itd_next = std->itd_self;
usb_pc_cpu_flush(last->page_cache);
return (std);
}
#define EHCI_APPEND_QH(sqh,last) (last) = _ehci_append_qh(sqh,last)
static ehci_qh_t *
_ehci_append_qh(ehci_qh_t *sqh, ehci_qh_t *last)
{
DPRINTFN(11, "%p to %p\n", sqh, last);
if (sqh->prev != NULL) {
/* should not happen */
DPRINTFN(0, "QH already linked!\n");
return (last);
}
/* (sc->sc_bus.mtx) must be locked */
sqh->next = last->next;
sqh->qh_link = last->qh_link;
sqh->prev = last;
usb_pc_cpu_flush(sqh->page_cache);
/*
* the last->next->prev is never followed: sqh->next->prev = sqh;
*/
last->next = sqh;
last->qh_link = sqh->qh_self;
usb_pc_cpu_flush(last->page_cache);
return (sqh);
}
#define EHCI_REMOVE_FS_TD(std,last) (last) = _ehci_remove_fs_td(std,last)
static ehci_sitd_t *
_ehci_remove_fs_td(ehci_sitd_t *std, ehci_sitd_t *last)
{
DPRINTFN(11, "%p from %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->prev->next = std->next;
std->prev->sitd_next = std->sitd_next;
usb_pc_cpu_flush(std->prev->page_cache);
if (std->next) {
std->next->prev = std->prev;
usb_pc_cpu_flush(std->next->page_cache);
}
return ((last == std) ? std->prev : last);
}
#define EHCI_REMOVE_HS_TD(std,last) (last) = _ehci_remove_hs_td(std,last)
static ehci_itd_t *
_ehci_remove_hs_td(ehci_itd_t *std, ehci_itd_t *last)
{
DPRINTFN(11, "%p from %p\n", std, last);
/* (sc->sc_bus.mtx) must be locked */
std->prev->next = std->next;
std->prev->itd_next = std->itd_next;
usb_pc_cpu_flush(std->prev->page_cache);
if (std->next) {
std->next->prev = std->prev;
usb_pc_cpu_flush(std->next->page_cache);
}
return ((last == std) ? std->prev : last);
}
#define EHCI_REMOVE_QH(sqh,last) (last) = _ehci_remove_qh(sqh,last)
static ehci_qh_t *
_ehci_remove_qh(ehci_qh_t *sqh, ehci_qh_t *last)
{
DPRINTFN(11, "%p from %p\n", sqh, last);
/* (sc->sc_bus.mtx) must be locked */
/* only remove if not removed from a queue */
if (sqh->prev) {
sqh->prev->next = sqh->next;
sqh->prev->qh_link = sqh->qh_link;
usb_pc_cpu_flush(sqh->prev->page_cache);
if (sqh->next) {
sqh->next->prev = sqh->prev;
usb_pc_cpu_flush(sqh->next->page_cache);
}
last = ((last == sqh) ? sqh->prev : last);
sqh->prev = 0;
usb_pc_cpu_flush(sqh->page_cache);
}
return (last);
}
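/*
 * The endpoint data toggle flips once for every packet on the bus, so
 * the net change is the parity of the packet count: actlen/mps full
 * packets, plus one more for a trailing short packet, a trailing zero
 * length packet, or a zero length transfer. For example, with
 * max_packet_size = 512 and actlen = xlen = 1280, three packets
 * (512 + 512 + 256) were transferred, so the toggle advances by one.
 */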
static void
ehci_data_toggle_update(struct usb_xfer *xfer, uint16_t actlen, uint16_t xlen)
{
uint16_t rem;
uint8_t dt;
/* count number of full packets */
dt = (actlen / xfer->max_packet_size) & 1;
/* compute remainder */
rem = actlen % xfer->max_packet_size;
if (rem > 0)
dt ^= 1; /* short packet at the end */
else if (actlen != xlen)
dt ^= 1; /* zero length packet at the end */
else if (xlen == 0)
dt ^= 1; /* zero length transfer */
xfer->endpoint->toggle_next ^= dt;
}
static usb_error_t
ehci_non_isoc_done_sub(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_qtd_t *td;
ehci_qtd_t *td_alt_next;
uint32_t status;
uint16_t len;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
if (xfer->aframes != xfer->nframes) {
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->qtd_status);
len = EHCI_QTD_GET_BYTES(status);
/*
* Verify the status length and
* add the length to "frlengths[]":
*/
if (len > td->len) {
/* should not happen */
DPRINTF("Invalid status length, "
"0x%04x/0x%04x bytes\n", len, td->len);
status |= EHCI_QTD_HALTED;
} else if (xfer->aframes != xfer->nframes) {
xfer->frlengths[xfer->aframes] += td->len - len;
/* manually update data toggle */
ehci_data_toggle_update(xfer, td->len - len, td->len);
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check for transfer error */
if (status & EHCI_QTD_HALTED) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (len > 0) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
#ifdef USB_DEBUG
if (status & EHCI_QTD_STATERRS) {
DPRINTFN(11, "error, addr=%d, endpt=0x%02x, frame=0x%02x"
"status=%s%s%s%s%s%s%s%s\n",
xfer->address, xfer->endpointno, xfer->aframes,
(status & EHCI_QTD_ACTIVE) ? "[ACTIVE]" : "[NOT_ACTIVE]",
(status & EHCI_QTD_HALTED) ? "[HALTED]" : "",
(status & EHCI_QTD_BUFERR) ? "[BUFERR]" : "",
(status & EHCI_QTD_BABBLE) ? "[BABBLE]" : "",
(status & EHCI_QTD_XACTERR) ? "[XACTERR]" : "",
(status & EHCI_QTD_MISSEDMICRO) ? "[MISSED]" : "",
(status & EHCI_QTD_SPLITXSTATE) ? "[SPLIT]" : "",
(status & EHCI_QTD_PINGSTATE) ? "[PING]" : "");
}
#endif
if (status & EHCI_QTD_HALTED) {
if ((xfer->xroot->udev->parent_hs_hub != NULL) ||
(xfer->xroot->udev->address != 0)) {
/* try to separate I/O errors from STALL */
if (EHCI_QTD_GET_CERR(status) == 0)
return (USB_ERR_IOERROR);
}
return (USB_ERR_STALLED);
}
return (USB_ERR_NORMAL_COMPLETION);
}
static void
ehci_non_isoc_done(struct usb_xfer *xfer)
{
- ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_qh_t *qh;
- uint32_t status;
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
#ifdef USB_DEBUG
if (ehcidebug > 10) {
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_dump_sqtds(sc, xfer->td_transfer_first);
}
#endif
/* extract data toggle directly from the QH's overlay area */
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(qh->page_cache);
-
- status = hc32toh(sc, qh->qh_qtd.qtd_status);
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
err = ehci_non_isoc_done_sub(xfer);
}
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
while (xfer->aframes != xfer->nframes) {
err = ehci_non_isoc_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
err = ehci_non_isoc_done_sub(xfer);
}
done:
ehci_device_done(xfer, err);
}
/*------------------------------------------------------------------------*
* ehci_check_transfer
*
* Return values:
* 0: USB transfer is not finished
* Else: USB transfer is finished
*------------------------------------------------------------------------*/
static uint8_t
ehci_check_transfer(struct usb_xfer *xfer)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
uint32_t status;
DPRINTFN(13, "xfer=%p checking transfer\n", xfer);
if (methods == &ehci_device_isoc_fs_methods) {
ehci_sitd_t *td;
/* isochronous full speed transfer */
td = xfer->td_transfer_last;
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->sitd_status);
/* also check if first is complete */
td = xfer->td_transfer_first;
usb_pc_cpu_invalidate(td->page_cache);
status |= hc32toh(sc, td->sitd_status);
if (!(status & EHCI_SITD_ACTIVE)) {
ehci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
goto transferred;
}
} else if (methods == &ehci_device_isoc_hs_methods) {
ehci_itd_t *td;
/* isochronous high speed transfer */
/* check last transfer */
td = xfer->td_transfer_last;
usb_pc_cpu_invalidate(td->page_cache);
status = td->itd_status[0];
status |= td->itd_status[1];
status |= td->itd_status[2];
status |= td->itd_status[3];
status |= td->itd_status[4];
status |= td->itd_status[5];
status |= td->itd_status[6];
status |= td->itd_status[7];
/* also check first transfer */
td = xfer->td_transfer_first;
usb_pc_cpu_invalidate(td->page_cache);
status |= td->itd_status[0];
status |= td->itd_status[1];
status |= td->itd_status[2];
status |= td->itd_status[3];
status |= td->itd_status[4];
status |= td->itd_status[5];
status |= td->itd_status[6];
status |= td->itd_status[7];
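/*
 * The sixteen status words are accumulated in host-controller byte
 * order and compared against htohc32(sc, EHCI_ITD_ACTIVE) below,
 * which avoids byte-swapping every word individually.
 */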
/* if no transactions are active we continue */
if (!(status & htohc32(sc, EHCI_ITD_ACTIVE))) {
ehci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
goto transferred;
}
} else {
ehci_qtd_t *td;
ehci_qh_t *qh;
/* non-isochronous transfer */
/*
* check whether there is an error somewhere in the middle,
* or whether there was a short packet (SPD and not ACTIVE)
*/
td = xfer->td_transfer_cache;
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(qh->page_cache);
status = hc32toh(sc, qh->qh_qtd.qtd_status);
if (status & EHCI_QTD_ACTIVE) {
/* transfer is pending */
goto done;
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->qtd_status);
/*
* Check if there is an active TD which
* indicates that the transfer isn't done.
*/
if (status & EHCI_QTD_ACTIVE) {
/* update cache */
xfer->td_transfer_cache = td;
goto done;
}
/*
* last transfer descriptor makes the transfer done
*/
if (((void *)td) == xfer->td_transfer_last) {
break;
}
/*
* any kind of error makes the transfer done
*/
if (status & EHCI_QTD_HALTED) {
break;
}
/*
* if there is no alternate next transfer, a short
* packet also makes the transfer done
*/
if (EHCI_QTD_GET_BYTES(status)) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
if (td->alt_next) {
td = td->alt_next;
continue;
}
}
/* transfer is done */
break;
}
td = td->obj_next;
}
ehci_non_isoc_done(xfer);
goto transferred;
}
done:
DPRINTFN(13, "xfer=%p is still active\n", xfer);
return (0);
transferred:
return (1);
}
static void
ehci_pcd_enable(ehci_softc_t *sc)
{
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
sc->sc_eintrs |= EHCI_STS_PCD;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
/* acknowledge any PCD interrupt */
EOWRITE4(sc, EHCI_USBSTS, EHCI_STS_PCD);
ehci_root_intr(sc);
}
static void
ehci_interrupt_poll(ehci_softc_t *sc)
{
struct usb_xfer *xfer;
repeat:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/*
* check if transfer is transferred
*/
if (ehci_check_transfer(xfer)) {
/* queue has been modified */
goto repeat;
}
}
}
/*
* Some EHCI chips from VIA / ATI seem to trigger interrupts before
* writing back the qTD status, or miss signalling occasionally under
* heavy load. If the host machine is too fast, we can miss
* transaction completion - when we scan the active list the
* transaction still seems to be active. This generally exhibits
* itself as a umass stall that never recovers.
*
* We work around this behaviour by setting up this callback after any
* softintr that completes with transactions still pending, giving us
* another chance to check for completion after the writeback has
* taken place.
*/
static void
ehci_poll_timeout(void *arg)
{
ehci_softc_t *sc = arg;
DPRINTFN(3, "\n");
ehci_interrupt_poll(sc);
}
/*------------------------------------------------------------------------*
* ehci_interrupt - EHCI interrupt handler
*
* NOTE: Do not access "sc->sc_bus.bdev" inside the interrupt handler,
* because the interrupt handler may be set up before "sc->sc_bus.bdev"
* is present!
*------------------------------------------------------------------------*/
void
ehci_interrupt(ehci_softc_t *sc)
{
uint32_t status;
USB_BUS_LOCK(&sc->sc_bus);
DPRINTFN(16, "real interrupt\n");
#ifdef USB_DEBUG
if (ehcidebug > 15) {
ehci_dump_regs(sc);
}
#endif
status = EHCI_STS_INTRS(EOREAD4(sc, EHCI_USBSTS));
if (status == 0) {
/* the interrupt was not for us */
goto done;
}
if (!(status & sc->sc_eintrs)) {
goto done;
}
EOWRITE4(sc, EHCI_USBSTS, status); /* acknowledge */
status &= sc->sc_eintrs;
if (status & EHCI_STS_HSE) {
printf("%s: unrecoverable error, "
"controller halted\n", __FUNCTION__);
#ifdef USB_DEBUG
ehci_dump_regs(sc);
ehci_dump_isoc(sc);
#endif
}
if (status & EHCI_STS_PCD) {
/*
* Disable PCD interrupt for now, because it will be
* on until the port has been reset.
*/
sc->sc_eintrs &= ~EHCI_STS_PCD;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
ehci_root_intr(sc);
/* do not allow RHSC interrupts > 1 per second */
usb_callout_reset(&sc->sc_tmo_pcd, hz,
(void *)&ehci_pcd_enable, sc);
}
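/*
 * Mask off the bits that are handled elsewhere (transfer completion,
 * transfer error, port change and doorbell); anything that is still
 * set is disabled below so that a stuck status bit cannot cause an
 * interrupt storm.
 */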
status &= ~(EHCI_STS_INT | EHCI_STS_ERRINT | EHCI_STS_PCD | EHCI_STS_IAA);
if (status != 0) {
/* block unprocessed interrupts */
sc->sc_eintrs &= ~status;
EOWRITE4(sc, EHCI_USBINTR, sc->sc_eintrs);
printf("%s: blocking interrupts 0x%x\n", __FUNCTION__, status);
}
/* poll all the USB transfers */
ehci_interrupt_poll(sc);
if (sc->sc_flags & EHCI_SCFLG_LOSTINTRBUG) {
usb_callout_reset(&sc->sc_tmo_poll, hz / 128,
(void *)&ehci_poll_timeout, sc);
}
done:
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*
* called when a request does not complete
*/
static void
ehci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* transfer is transferred */
ehci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
ehci_do_poll(struct usb_bus *bus)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
ehci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
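/*
 * ehci_setup_standard_chain_sub() walks the frame twice. The first,
 * "precompute" pass only consumes TDs and length in order to find the
 * TD that follows this frame, which is needed to set up the alternate
 * next ("qtd_altnext") pointers; the length is then restored and the
 * second pass fills in the transfer descriptors for real.
 */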
static void
ehci_setup_standard_chain_sub(struct ehci_std_temp *temp)
{
struct usb_page_search buf_res;
ehci_qtd_t *td;
ehci_qtd_t *td_next;
ehci_qtd_t *td_alt_next;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint32_t terminate;
uint32_t qtd_altnext;
uint8_t shortpkt_old;
uint8_t precompute;
terminate = temp->sc->sc_terminate_self;
qtd_altnext = temp->sc->sc_terminate_self;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
precompute = 1;
restart:
td = temp->td;
td_next = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt) {
break;
}
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_frame_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL) {
panic("%s: out of EHCI transfer descriptors!", __FUNCTION__);
}
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->qtd_status =
temp->qtd_status |
htohc32(temp->sc, EHCI_QTD_IOC |
EHCI_QTD_SET_BYTES(average));
if (average == 0) {
if (temp->auto_data_toggle == 0) {
/* update data toggle, ZLP case */
temp->qtd_status ^=
htohc32(temp->sc, EHCI_QTD_TOGGLE_MASK);
}
td->len = 0;
/* properly reset reserved fields */
td->qtd_buffer[0] = 0;
td->qtd_buffer[1] = 0;
td->qtd_buffer[2] = 0;
td->qtd_buffer[3] = 0;
td->qtd_buffer[4] = 0;
td->qtd_buffer_hi[0] = 0;
td->qtd_buffer_hi[1] = 0;
td->qtd_buffer_hi[2] = 0;
td->qtd_buffer_hi[3] = 0;
td->qtd_buffer_hi[4] = 0;
} else {
uint8_t x;
if (temp->auto_data_toggle == 0) {
/* update data toggle */
if (howmany(average, temp->max_frame_size) & 1) {
temp->qtd_status ^=
htohc32(temp->sc, EHCI_QTD_TOGGLE_MASK);
}
}
td->len = average;
/* update remaining length */
temp->len -= average;
/* fill out buffer pointers */
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->qtd_buffer[0] =
htohc32(temp->sc, buf_res.physaddr);
td->qtd_buffer_hi[0] = 0;
x = 1;
while (average > EHCI_PAGE_SIZE) {
average -= EHCI_PAGE_SIZE;
buf_offset += EHCI_PAGE_SIZE;
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->qtd_buffer[x] =
htohc32(temp->sc,
buf_res.physaddr & (~0xFFF));
td->qtd_buffer_hi[x] = 0;
x++;
}
/*
* NOTE: The "average" variable is never zero after
* exiting the loop above !
*
* NOTE: We have to subtract one from the offset to
* ensure that we are computing the physical address
* of a valid page !
*/
buf_offset += average;
usbd_get_page(temp->pc, buf_offset - 1, &buf_res);
td->qtd_buffer[x] =
htohc32(temp->sc,
buf_res.physaddr & (~0xFFF));
td->qtd_buffer_hi[x] = 0;
/* properly reset reserved fields */
while (++x < EHCI_QTD_NBUFFERS) {
td->qtd_buffer[x] = 0;
td->qtd_buffer_hi[x] = 0;
}
}
if (td_next) {
/* link the current TD with the next one */
td->qtd_next = td_next->qtd_self;
}
td->qtd_altnext = qtd_altnext;
td->alt_next = td_alt_next;
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* setup alt next pointer, if any */
if (temp->last_frame) {
td_alt_next = NULL;
qtd_altnext = terminate;
} else {
/* we use this field internally */
td_alt_next = td_next;
if (temp->setup_alt_next) {
qtd_altnext = td_next->qtd_self;
} else {
qtd_altnext = terminate;
}
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
temp->td = td;
temp->td_next = td_next;
}
static void
ehci_setup_standard_chain(struct usb_xfer *xfer, ehci_qh_t **qh_last)
{
struct ehci_std_temp temp;
const struct usb_pipe_methods *methods;
ehci_qh_t *qh;
ehci_qtd_t *td;
uint32_t qh_endp;
uint32_t qh_endphub;
uint32_t x;
DPRINTFN(9, "addr=%d endpt=%d sumlen=%d speed=%d\n",
xfer->address, UE_GET_ADDR(xfer->endpointno),
xfer->sumlen, usbd_get_speed(xfer->xroot->udev));
temp.average = xfer->max_hc_frame_size;
temp.max_frame_size = xfer->max_frame_size;
temp.sc = EHCI_BUS2SC(xfer->xroot->bus);
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
temp.td = NULL;
temp.td_next = td;
temp.qtd_status = 0;
temp.last_frame = 0;
temp.setup_alt_next = xfer->flags_int.short_frames_ok;
if (xfer->flags_int.control_xfr) {
if (xfer->endpoint->toggle_next) {
/* DATA1 is next */
temp.qtd_status |=
htohc32(temp.sc, EHCI_QTD_SET_TOGGLE(1));
}
temp.auto_data_toggle = 0;
} else {
temp.auto_data_toggle = 1;
}
if ((xfer->xroot->udev->parent_hs_hub != NULL) ||
(xfer->xroot->udev->address != 0)) {
/* max 3 retries */
temp.qtd_status |=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3));
}
/* check if we should prepend a setup message */
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
xfer->endpoint->toggle_next = 0;
temp.qtd_status &=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3));
temp.qtd_status |= htohc32(temp.sc,
EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_SETUP) |
EHCI_QTD_SET_TOGGLE(0));
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
ehci_setup_standard_chain_sub(&temp);
}
x = 1;
} else {
x = 0;
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.pc = xfer->frbuffers + x;
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
} else {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
/* keep previous data toggle and error count */
temp.qtd_status &=
htohc32(temp.sc, EHCI_QTD_SET_CERR(3) |
EHCI_QTD_SET_TOGGLE(1));
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
} else {
/* regular data transfer */
temp.shortpkt = (xfer->flags.force_short_xfer) ? 0 : 1;
}
/* set endpoint direction */
temp.qtd_status |=
(UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) ?
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_IN)) :
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_OUT));
ehci_setup_standard_chain_sub(&temp);
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current endpoint
* direction.
*/
temp.qtd_status &= htohc32(temp.sc, EHCI_QTD_SET_CERR(3) |
EHCI_QTD_SET_TOGGLE(1));
temp.qtd_status |=
(UE_GET_DIR(xfer->endpointno) == UE_DIR_OUT) ?
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_IN) |
EHCI_QTD_SET_TOGGLE(1)) :
htohc32(temp.sc, EHCI_QTD_ACTIVE |
EHCI_QTD_SET_PID(EHCI_QTD_PID_OUT) |
EHCI_QTD_SET_TOGGLE(1));
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.setup_alt_next = 0;
ehci_setup_standard_chain_sub(&temp);
}
td = temp.td;
/* the last TD terminates the transfer: */
td->qtd_next = htohc32(temp.sc, EHCI_LINK_TERMINATE);
td->qtd_altnext = htohc32(temp.sc, EHCI_LINK_TERMINATE);
usb_pc_cpu_flush(td->page_cache);
/* must have at least one frame! */
xfer->td_transfer_last = td;
#ifdef USB_DEBUG
if (ehcidebug > 8) {
DPRINTF("nexttog=%d; data before transfer:\n",
xfer->endpoint->toggle_next);
ehci_dump_sqtds(temp.sc,
xfer->td_transfer_first);
}
#endif
methods = xfer->endpoint->methods;
qh = xfer->qh_start[xfer->flags_int.curr_dma_set];
/* the "qh_link" field is filled when the QH is added */
qh_endp =
(EHCI_QH_SET_ADDR(xfer->address) |
EHCI_QH_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)) |
EHCI_QH_SET_MPL(xfer->max_packet_size));
if (usbd_get_speed(xfer->xroot->udev) == USB_SPEED_HIGH) {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_HIGH);
if (methods != &ehci_device_intr_methods)
qh_endp |= EHCI_QH_SET_NRL(8);
} else {
if (usbd_get_speed(xfer->xroot->udev) == USB_SPEED_FULL) {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_FULL);
} else {
qh_endp |= EHCI_QH_SET_EPS(EHCI_QH_SPEED_LOW);
}
if (methods == &ehci_device_ctrl_methods) {
qh_endp |= EHCI_QH_CTL;
}
if (methods != &ehci_device_intr_methods) {
/* Only try one time per microframe! */
qh_endp |= EHCI_QH_SET_NRL(1);
}
}
if (temp.auto_data_toggle == 0) {
/* software computes the data toggle */
qh_endp |= EHCI_QH_DTC;
}
qh->qh_endp = htohc32(temp.sc, qh_endp);
qh_endphub =
(EHCI_QH_SET_MULT(xfer->max_packet_count & 3) |
EHCI_QH_SET_CMASK(xfer->endpoint->usb_cmask) |
EHCI_QH_SET_SMASK(xfer->endpoint->usb_smask) |
EHCI_QH_SET_HUBA(xfer->xroot->udev->hs_hub_addr) |
EHCI_QH_SET_PORT(xfer->xroot->udev->hs_port_no));
qh->qh_endphub = htohc32(temp.sc, qh_endphub);
qh->qh_curqtd = 0;
/* fill the overlay qTD */
if (temp.auto_data_toggle && xfer->endpoint->toggle_next) {
/* DATA1 is next */
qh->qh_qtd.qtd_status = htohc32(temp.sc, EHCI_QTD_SET_TOGGLE(1));
} else {
qh->qh_qtd.qtd_status = 0;
}
td = xfer->td_transfer_first;
qh->qh_qtd.qtd_next = td->qtd_self;
qh->qh_qtd.qtd_altnext =
htohc32(temp.sc, EHCI_LINK_TERMINATE);
/* properly reset reserved fields */
qh->qh_qtd.qtd_buffer[0] = 0;
qh->qh_qtd.qtd_buffer[1] = 0;
qh->qh_qtd.qtd_buffer[2] = 0;
qh->qh_qtd.qtd_buffer[3] = 0;
qh->qh_qtd.qtd_buffer[4] = 0;
qh->qh_qtd.qtd_buffer_hi[0] = 0;
qh->qh_qtd.qtd_buffer_hi[1] = 0;
qh->qh_qtd.qtd_buffer_hi[2] = 0;
qh->qh_qtd.qtd_buffer_hi[3] = 0;
qh->qh_qtd.qtd_buffer_hi[4] = 0;
usb_pc_cpu_flush(qh->page_cache);
if (xfer->xroot->udev->flags.self_suspended == 0) {
EHCI_APPEND_QH(qh, *qh_last);
}
}
static void
ehci_root_intr(ehci_softc_t *sc)
{
uint16_t i;
uint16_t m;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
/* set bits */
m = (sc->sc_noport + 1);
if (m > (8 * sizeof(sc->sc_hub_idata))) {
m = (8 * sizeof(sc->sc_hub_idata));
}
for (i = 1; i < m; i++) {
/* pick out CHANGE bits from the status register */
if (EOREAD4(sc, EHCI_PORTSC(i)) & EHCI_PS_CLEAR) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
static void
ehci_isoc_fs_done(ehci_softc_t *sc, struct usb_xfer *xfer)
{
uint32_t nframes = xfer->nframes;
uint32_t status;
uint32_t *plen = xfer->frlengths;
uint16_t len = 0;
ehci_sitd_t *td = xfer->td_transfer_first;
ehci_sitd_t **pp_last = &sc->sc_isoc_fs_p_last[xfer->qh_pos];
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_fs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_fs_p_last[0];
}
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("isoc FS-TD\n");
ehci_dump_sitd(sc, td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->sitd_status);
len = EHCI_SITD_GET_LEN(status);
DPRINTFN(2, "status=0x%08x, rem=%u\n", status, len);
if (*plen >= len) {
len = *plen - len;
} else {
len = 0;
}
*plen = len;
/* remove FS-TD from schedule */
EHCI_REMOVE_FS_TD(td, *pp_last);
pp_last++;
plen++;
td = td->obj_next;
}
xfer->aframes = xfer->nframes;
}
static void
ehci_isoc_hs_done(ehci_softc_t *sc, struct usb_xfer *xfer)
{
uint32_t nframes = xfer->nframes;
uint32_t status;
uint32_t *plen = xfer->frlengths;
uint16_t len = 0;
uint8_t td_no = 0;
ehci_itd_t *td = xfer->td_transfer_first;
ehci_itd_t **pp_last = &sc->sc_isoc_hs_p_last[xfer->qh_pos];
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
while (nframes) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_hs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_hs_p_last[0];
}
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("isoc HS-TD\n");
ehci_dump_itd(sc, td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
status = hc32toh(sc, td->itd_status[td_no]);
len = EHCI_ITD_GET_LEN(status);
DPRINTFN(2, "status=0x%08x, len=%u\n", status, len);
if (xfer->endpoint->usb_smask & (1 << td_no)) {
if (*plen >= len) {
/*
* The length is valid. NOTE: The
* complete length is written back
* into the status field, and not the
* remainder like with other transfer
* descriptor types.
*/
} else {
/* Invalid length - truncate */
len = 0;
}
*plen = len;
plen++;
nframes--;
}
td_no++;
if ((td_no == 8) || (nframes == 0)) {
/* remove HS-TD from schedule */
EHCI_REMOVE_HS_TD(td, *pp_last);
pp_last++;
td_no = 0;
td = td->obj_next;
}
}
xfer->aframes = xfer->nframes;
}
/* NOTE: "done" can be run two times in a row,
* from close and from interrupt
*/
static void
ehci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
#ifdef USB_DEBUG
if (ehcidebug > 8) {
DPRINTF("nexttog=%d; data after transfer:\n",
xfer->endpoint->toggle_next);
ehci_dump_sqtds(sc,
xfer->td_transfer_first);
}
#endif
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
/*
* Only finish isochronous transfers once which will update
* "xfer->frlengths".
*/
if (xfer->td_transfer_first &&
xfer->td_transfer_last) {
if (methods == &ehci_device_isoc_fs_methods) {
ehci_isoc_fs_done(sc, xfer);
}
if (methods == &ehci_device_isoc_hs_methods) {
ehci_isoc_hs_done(sc, xfer);
}
xfer->td_transfer_first = NULL;
xfer->td_transfer_last = NULL;
}
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* ehci bulk support
*------------------------------------------------------------------------*/
static void
ehci_device_bulk_open(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_bulk_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_bulk_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_doorbell_async(struct ehci_softc *sc)
{
uint32_t temp;
/*
* XXX Performance quirk: Some Host Controllers have too low an
* interrupt rate. Issue an IAAD to stimulate the Host
* Controller after queueing the BULK transfer.
*
* XXX Force the host controller to refresh any QH caches.
*/
temp = EOREAD4(sc, EHCI_USBCMD);
if (!(temp & EHCI_CMD_IAAD))
EOWRITE4(sc, EHCI_USBCMD, temp | EHCI_CMD_IAAD);
}
static void
ehci_device_bulk_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_async_p_last);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
/*
* XXX Certain nVidia chipsets choke when using the IAAD
* feature too frequently.
*/
if (sc->sc_flags & EHCI_SCFLG_IAADBUG)
return;
ehci_doorbell_async(sc);
}
static const struct usb_pipe_methods ehci_device_bulk_methods =
{
.open = ehci_device_bulk_open,
.close = ehci_device_bulk_close,
.enter = ehci_device_bulk_enter,
.start = ehci_device_bulk_start,
};
/*------------------------------------------------------------------------*
* ehci control support
*------------------------------------------------------------------------*/
static void
ehci_device_ctrl_open(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_ctrl_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_ctrl_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_ctrl_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_async_p_last);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_ctrl_methods =
{
.open = ehci_device_ctrl_open,
.close = ehci_device_ctrl_close,
.enter = ehci_device_ctrl_enter,
.start = ehci_device_ctrl_start,
};
/*------------------------------------------------------------------------*
* ehci interrupt support
*------------------------------------------------------------------------*/
static void
ehci_device_intr_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
uint16_t best;
uint16_t bit;
uint16_t x;
usb_hs_bandwidth_alloc(xfer);
/*
* Find the best QH position corresponding to the given interval:
*/
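/*
 * The scan below finds the largest power-of-two interval that does not
 * exceed the requested one and, within that bucket's index range
 * [bit, 2 * bit), picks the least used slot so that periodic transfers
 * are spread evenly across the QH tree built in ehci_init().
 */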
best = 0;
bit = EHCI_VIRTUAL_FRAMELIST_COUNT / 2;
while (bit) {
if (xfer->interval >= bit) {
x = bit;
best = bit;
while (x & bit) {
if (sc->sc_intr_stat[x] <
sc->sc_intr_stat[best]) {
best = x;
}
x++;
}
break;
}
bit >>= 1;
}
sc->sc_intr_stat[best]++;
xfer->qh_pos = best;
DPRINTFN(3, "best=%d interval=%d\n",
best, xfer->interval);
}
static void
ehci_device_intr_close(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
sc->sc_intr_stat[xfer->qh_pos]--;
ehci_device_done(xfer, USB_ERR_CANCELLED);
/* bandwidth must be freed after device done */
usb_hs_bandwidth_free(xfer);
}
static void
ehci_device_intr_enter(struct usb_xfer *xfer)
{
return;
}
static void
ehci_device_intr_start(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ehci_setup_standard_chain(xfer, &sc->sc_intr_p_last[xfer->qh_pos]);
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_intr_methods =
{
.open = ehci_device_intr_open,
.close = ehci_device_intr_close,
.enter = ehci_device_intr_enter,
.start = ehci_device_intr_start,
};
/*------------------------------------------------------------------------*
* ehci full speed isochronous support
*------------------------------------------------------------------------*/
static void
ehci_device_isoc_fs_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_sitd_t *td;
uint32_t sitd_portaddr;
uint8_t ds;
sitd_portaddr =
EHCI_SITD_SET_ADDR(xfer->address) |
EHCI_SITD_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)) |
EHCI_SITD_SET_HUBA(xfer->xroot->udev->hs_hub_addr) |
EHCI_SITD_SET_PORT(xfer->xroot->udev->hs_port_no);
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN)
sitd_portaddr |= EHCI_SITD_SET_DIR_IN;
sitd_portaddr = htohc32(sc, sitd_portaddr);
/* initialize all TD's */
for (ds = 0; ds != 2; ds++) {
for (td = xfer->td_start[ds]; td; td = td->obj_next) {
td->sitd_portaddr = sitd_portaddr;
/*
* TODO: make some kind of automatic
* SMASK/CMASK selection based on micro-frame
* usage
*
* micro-frame usage (8 microframes per 1ms)
*/
td->sitd_back = htohc32(sc, EHCI_LINK_TERMINATE);
usb_pc_cpu_flush(td->page_cache);
}
}
}
static void
ehci_device_isoc_fs_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ehci_device_isoc_fs_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_sitd_t *td;
ehci_sitd_t *td_last = NULL;
ehci_sitd_t **pp_last;
uint32_t *plen;
uint32_t buf_offset;
uint32_t nframes;
uint32_t temp;
uint32_t sitd_mask;
uint16_t tlen;
uint8_t sa;
uint8_t sb;
#ifdef USB_DEBUG
uint8_t once = 1;
#endif
DPRINTFN(6, "xfer=%p next=%d nframes=%d\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes);
/* get the current frame index */
nframes = EOREAD4(sc, EHCI_FRINDEX) / 8;
/*
* check if the frame index is within the window where the frames
* will be inserted
*/
buf_offset = (nframes - xfer->endpoint->isoc_next) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
if ((xfer->endpoint->is_synced == 0) ||
(buf_offset < xfer->nframes)) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = (xfer->endpoint->isoc_next - nframes) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
usb_isoc_time_expand(&sc->sc_bus, nframes) +
buf_offset + xfer->nframes;
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
pp_last = &sc->sc_isoc_fs_p_last[xfer->endpoint->isoc_next];
/* store starting position */
xfer->qh_pos = xfer->endpoint->isoc_next;
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_fs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT])
pp_last = &sc->sc_isoc_fs_p_last[0];
/* reuse sitd_portaddr and sitd_back from last transfer */
if (*plen > xfer->max_frame_size) {
#ifdef USB_DEBUG
if (once) {
once = 0;
printf("%s: frame length(%d) exceeds %d "
"bytes (frame truncated)\n",
__FUNCTION__, *plen,
xfer->max_frame_size);
}
#endif
*plen = xfer->max_frame_size;
}
/* allocate a slot */
sa = usbd_fs_isoc_schedule_alloc_slot(xfer,
xfer->isoc_time_complete - nframes - 1);
if (sa == 255) {
/*
* Schedule is FULL, set length to zero:
*/
*plen = 0;
sa = USB_FS_ISOC_UFRAME_MAX - 1;
}
if (*plen) {
/*
* only call "usbd_get_page()" when we have a
* non-zero length
*/
usbd_get_page(xfer->frbuffers, buf_offset, &buf_res);
td->sitd_bp[0] = htohc32(sc, buf_res.physaddr);
buf_offset += *plen;
/*
* NOTE: We need to subtract one from the offset so
* that we are on a valid page!
*/
usbd_get_page(xfer->frbuffers, buf_offset - 1,
&buf_res);
temp = buf_res.physaddr & ~0xFFF;
} else {
td->sitd_bp[0] = 0;
temp = 0;
}
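/*
 * Compute the split transaction schedule masks. For OUT transfers
 * one start split is budgeted per 188 bytes of payload and no
 * complete splits are used. For IN transfers a single start split
 * is issued and the complete splits begin two microframes later.
 */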
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_OUT) {
tlen = *plen;
if (tlen <= 188) {
temp |= 1; /* T-count = 1, TP = ALL */
tlen = 1;
} else {
tlen += 187;
tlen /= 188;
temp |= tlen; /* T-count = [1..6] */
temp |= 8; /* TP = Begin */
}
tlen += sa;
if (tlen >= 8) {
sb = 0;
} else {
sb = (1 << tlen);
}
sa = (1 << sa);
sa = (sb - sa) & 0x3F;
sb = 0;
} else {
sb = (-(4 << sa)) & 0xFE;
sa = (1 << sa) & 0x3F;
}
sitd_mask = (EHCI_SITD_SET_SMASK(sa) |
EHCI_SITD_SET_CMASK(sb));
td->sitd_bp[1] = htohc32(sc, temp);
td->sitd_mask = htohc32(sc, sitd_mask);
if (nframes == 0) {
td->sitd_status = htohc32(sc,
EHCI_SITD_IOC |
EHCI_SITD_ACTIVE |
EHCI_SITD_SET_LEN(*plen));
} else {
td->sitd_status = htohc32(sc,
EHCI_SITD_ACTIVE |
EHCI_SITD_SET_LEN(*plen));
}
usb_pc_cpu_flush(td->page_cache);
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("FS-TD %d\n", nframes);
ehci_dump_sitd(sc, td);
}
#endif
/* insert TD into schedule */
EHCI_APPEND_FS_TD(td, *pp_last);
pp_last++;
plen++;
td_last = td;
td = td->obj_next;
}
xfer->td_transfer_last = td_last;
/* update isoc_next */
xfer->endpoint->isoc_next = (pp_last - &sc->sc_isoc_fs_p_last[0]) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* We don't allow cancelling of the SPLIT transaction USB FULL
* speed transfer, because it disturbs the bandwidth
* computation algorithm.
*/
xfer->flags_int.can_cancel_immed = 0;
}
static void
ehci_device_isoc_fs_start(struct usb_xfer *xfer)
{
/*
* We don't allow cancelling of the SPLIT transaction USB FULL
* speed transfer, because it disturbs the bandwidth
* computation algorithm.
*/
xfer->flags_int.can_cancel_immed = 0;
/* set a default timeout */
if (xfer->timeout == 0)
xfer->timeout = 500; /* ms */
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_isoc_fs_methods =
{
.open = ehci_device_isoc_fs_open,
.close = ehci_device_isoc_fs_close,
.enter = ehci_device_isoc_fs_enter,
.start = ehci_device_isoc_fs_start,
};
/*------------------------------------------------------------------------*
* ehci high speed isochronous support
*------------------------------------------------------------------------*/
static void
ehci_device_isoc_hs_open(struct usb_xfer *xfer)
{
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_itd_t *td;
uint32_t temp;
uint8_t ds;
usb_hs_bandwidth_alloc(xfer);
/* initialize all TD's */
for (ds = 0; ds != 2; ds++) {
for (td = xfer->td_start[ds]; td; td = td->obj_next) {
/* set TD inactive */
td->itd_status[0] = 0;
td->itd_status[1] = 0;
td->itd_status[2] = 0;
td->itd_status[3] = 0;
td->itd_status[4] = 0;
td->itd_status[5] = 0;
td->itd_status[6] = 0;
td->itd_status[7] = 0;
/* set endpoint and address */
td->itd_bp[0] = htohc32(sc,
EHCI_ITD_SET_ADDR(xfer->address) |
EHCI_ITD_SET_ENDPT(UE_GET_ADDR(xfer->endpointno)));
temp =
EHCI_ITD_SET_MPL(xfer->max_packet_size & 0x7FF);
/* set direction */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp |= EHCI_ITD_SET_DIR_IN;
}
/* set maximum packet size */
td->itd_bp[1] = htohc32(sc, temp);
/* set transfer multiplier */
td->itd_bp[2] = htohc32(sc, xfer->max_packet_count & 3);
usb_pc_cpu_flush(td->page_cache);
}
}
}
static void
ehci_device_isoc_hs_close(struct usb_xfer *xfer)
{
ehci_device_done(xfer, USB_ERR_CANCELLED);
/* bandwidth must be freed after device done */
usb_hs_bandwidth_free(xfer);
}
static void
ehci_device_isoc_hs_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ehci_softc_t *sc = EHCI_BUS2SC(xfer->xroot->bus);
ehci_itd_t *td;
ehci_itd_t *td_last = NULL;
ehci_itd_t **pp_last;
bus_size_t page_addr;
uint32_t *plen;
uint32_t status;
uint32_t buf_offset;
uint32_t nframes;
uint32_t itd_offset[8 + 1];
uint8_t x;
uint8_t td_no;
uint8_t page_no;
uint8_t shift = usbd_xfer_get_fps_shift(xfer);
#ifdef USB_DEBUG
uint8_t once = 1;
#endif
DPRINTFN(6, "xfer=%p next=%d nframes=%d shift=%d\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes, (int)shift);
/* get the current frame index */
nframes = EOREAD4(sc, EHCI_FRINDEX) / 8;
/*
* check if the frame index is within the window where the frames
* will be inserted
*/
buf_offset = (nframes - xfer->endpoint->isoc_next) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
if ((xfer->endpoint->is_synced == 0) ||
(buf_offset < (((xfer->nframes << shift) + 7) / 8))) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = (xfer->endpoint->isoc_next - nframes) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
usb_isoc_time_expand(&sc->sc_bus, nframes) + buf_offset +
(((xfer->nframes << shift) + 7) / 8);
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
td_no = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
pp_last = &sc->sc_isoc_hs_p_last[xfer->endpoint->isoc_next];
/* store starting position */
xfer->qh_pos = xfer->endpoint->isoc_next;
while (nframes) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
if (pp_last >= &sc->sc_isoc_hs_p_last[EHCI_VIRTUAL_FRAMELIST_COUNT]) {
pp_last = &sc->sc_isoc_hs_p_last[0];
}
/* range check */
if (*plen > xfer->max_frame_size) {
#ifdef USB_DEBUG
if (once) {
once = 0;
printf("%s: frame length(%d) exceeds %d bytes "
"(frame truncated)\n",
__FUNCTION__, *plen, xfer->max_frame_size);
}
#endif
*plen = xfer->max_frame_size;
}
if (xfer->endpoint->usb_smask & (1 << td_no)) {
status = (EHCI_ITD_SET_LEN(*plen) |
EHCI_ITD_ACTIVE |
EHCI_ITD_SET_PG(0));
td->itd_status[td_no] = htohc32(sc, status);
itd_offset[td_no] = buf_offset;
buf_offset += *plen;
plen++;
nframes--;
} else {
td->itd_status[td_no] = 0; /* not active */
itd_offset[td_no] = buf_offset;
}
td_no++;
if ((td_no == 8) || (nframes == 0)) {
/* mark any remaining transfer slots as not active */
for (x = td_no; x != 8; x++) {
td->itd_status[x] = 0; /* not active */
}
/* check if there is any data to be transferred */
if (itd_offset[0] != buf_offset) {
page_no = 0;
itd_offset[td_no] = buf_offset;
/* get first page offset */
usbd_get_page(xfer->frbuffers, itd_offset[0], &buf_res);
/* get page address */
page_addr = buf_res.physaddr & ~0xFFF;
/* update page address */
td->itd_bp[0] &= htohc32(sc, 0xFFF);
td->itd_bp[0] |= htohc32(sc, page_addr);
for (x = 0; x != td_no; x++) {
/* set page number and page offset */
status = (EHCI_ITD_SET_PG(page_no) |
(buf_res.physaddr & 0xFFF));
td->itd_status[x] |= htohc32(sc, status);
/* get next page offset */
if (itd_offset[x + 1] == buf_offset) {
/*
* We subtract one so that
* we don't go off the last
* page!
*/
usbd_get_page(xfer->frbuffers, buf_offset - 1, &buf_res);
} else {
usbd_get_page(xfer->frbuffers, itd_offset[x + 1], &buf_res);
}
/* check if we need a new page */
if ((buf_res.physaddr ^ page_addr) & ~0xFFF) {
/* new page needed */
page_addr = buf_res.physaddr & ~0xFFF;
if (page_no == 6) {
panic("%s: too many pages\n", __FUNCTION__);
}
page_no++;
/* update page address */
td->itd_bp[page_no] &= htohc32(sc, 0xFFF);
td->itd_bp[page_no] |= htohc32(sc, page_addr);
}
}
}
/* set IOC bit if we are complete */
if (nframes == 0) {
td->itd_status[td_no - 1] |= htohc32(sc, EHCI_ITD_IOC);
}
usb_pc_cpu_flush(td->page_cache);
#ifdef USB_DEBUG
if (ehcidebug > 15) {
DPRINTF("HS-TD %d\n", nframes);
ehci_dump_itd(sc, td);
}
#endif
/* insert TD into schedule */
EHCI_APPEND_HS_TD(td, *pp_last);
pp_last++;
td_no = 0;
td_last = td;
td = td->obj_next;
}
}
xfer->td_transfer_last = td_last;
/* update isoc_next */
xfer->endpoint->isoc_next = (pp_last - &sc->sc_isoc_hs_p_last[0]) &
(EHCI_VIRTUAL_FRAMELIST_COUNT - 1);
}
static void
ehci_device_isoc_hs_start(struct usb_xfer *xfer)
{
/* put transfer on interrupt queue */
ehci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ehci_device_isoc_hs_methods =
{
.open = ehci_device_isoc_hs_open,
.close = ehci_device_isoc_hs_close,
.enter = ehci_device_isoc_hs_enter,
.start = ehci_device_isoc_hs_start,
};
/*------------------------------------------------------------------------*
* ehci root control support
*------------------------------------------------------------------------*
* Simulate a hardware hub by handling all the necessary requests.
*------------------------------------------------------------------------*/
static const
struct usb_device_descriptor ehci_devd =
{
sizeof(struct usb_device_descriptor),
UDESC_DEVICE, /* type */
{0x00, 0x02}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_HSHUBSTT, /* protocol */
64, /* max packet */
{0}, {0}, {0x00, 0x01}, /* device id */
1, 2, 0, /* string indexes */
1 /* # of configurations */
};
static const
struct usb_device_qualifier ehci_odevd =
{
sizeof(struct usb_device_qualifier),
UDESC_DEVICE_QUALIFIER, /* type */
{0x00, 0x02}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_FSHUB, /* protocol */
0, /* max packet */
0, /* # of configurations */
0
};
static const struct ehci_config_desc ehci_confd = {
.confd = {
.bLength = sizeof(struct usb_config_descriptor),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(ehci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0 /* max power */
},
.ifcd = {
.bLength = sizeof(struct usb_interface_descriptor),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(struct usb_endpoint_descriptor),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | EHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 8, /* max packet (63 ports) */
.bInterval = 255,
},
};
static const
struct usb_hub_descriptor ehci_hubd =
{
.bDescLength = 0, /* dynamic length */
.bDescriptorType = UDESC_HUB,
};
uint16_t
ehci_get_port_speed_portsc(struct ehci_softc *sc, uint16_t index)
{
uint32_t v;
v = EOREAD4(sc, EHCI_PORTSC(index));
v = (v >> EHCI_PORTSC_PSPD_SHIFT) & EHCI_PORTSC_PSPD_MASK;
if (v == EHCI_PORT_SPEED_HIGH)
return (UPS_HIGH_SPEED);
if (v == EHCI_PORT_SPEED_LOW)
return (UPS_LOW_SPEED);
return (0);
}
uint16_t
ehci_get_port_speed_hostc(struct ehci_softc *sc, uint16_t index)
{
uint32_t v;
v = EOREAD4(sc, EHCI_HOSTC(index));
v = (v >> EHCI_HOSTC_PSPD_SHIFT) & EHCI_HOSTC_PSPD_MASK;
if (v == EHCI_PORT_SPEED_HIGH)
return (UPS_HIGH_SPEED);
if (v == EHCI_PORT_SPEED_LOW)
return (UPS_LOW_SPEED);
return (0);
}
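/*
 * Hand the port over to the companion controller by setting the Port
 * Owner bit. This is used for low and full speed devices which the
 * EHCI controller cannot serve itself.
 */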
static void
ehci_disown(ehci_softc_t *sc, uint16_t index, uint8_t lowspeed)
{
uint32_t port;
uint32_t v;
DPRINTF("index=%d lowspeed=%d\n", index, lowspeed);
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
EOWRITE4(sc, port, v | EHCI_PS_PO);
}
static usb_error_t
ehci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
const char *str_ptr;
const void *ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t i;
uint16_t value;
uint16_t index;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_devd);
ptr = (const void *)&ehci_devd;
break;
/*
* We can't really operate at another speed,
* but the specification says we need this
* descriptor:
*/
case UDESC_DEVICE_QUALIFIER:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_odevd);
ptr = (const void *)&ehci_odevd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ehci_confd);
ptr = (const void *)&ehci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "EHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= EHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_addr = value;
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if ((value != 0) && (value != 1)) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n");
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_ENABLE:
EOWRITE4(sc, port, v & ~EHCI_PS_PE);
break;
case UHF_PORT_SUSPEND:
if ((v & EHCI_PS_SUSP) && (!(v & EHCI_PS_FPR))) {
/*
* Waking up a suspended high speed device requires the
* force port resume sequence started below.
*/
EOWRITE4(sc, port, v | EHCI_PS_FPR);
}
/* wait 20ms for resume sequence to complete */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50);
EOWRITE4(sc, port, v & ~(EHCI_PS_SUSP |
EHCI_PS_FPR | (3 << 10) /* High Speed */ ));
/* 4ms settle time */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250);
break;
case UHF_PORT_POWER:
EOWRITE4(sc, port, v & ~EHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "clear port test "
"%d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "clear port ind "
"%d\n", index);
EOWRITE4(sc, port, v & ~EHCI_PS_PIC);
break;
case UHF_C_PORT_CONNECTION:
EOWRITE4(sc, port, v | EHCI_PS_CSC);
break;
case UHF_C_PORT_ENABLE:
EOWRITE4(sc, port, v | EHCI_PS_PEC);
break;
case UHF_C_PORT_SUSPEND:
EOWRITE4(sc, port, v | EHCI_PS_SUSP);
break;
case UHF_C_PORT_OVER_CURRENT:
EOWRITE4(sc, port, v | EHCI_PS_OCC);
break;
case UHF_C_PORT_RESET:
sc->sc_isreset = 0;
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = EREAD4(sc, EHCI_HCSPARAMS);
sc->sc_hub_desc.hubd = ehci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
if (EHCI_HCS_PPC(v))
i = UHD_PWR_INDIVIDUAL;
else
i = UHD_PWR_NO_SWITCH;
if (EHCI_HCS_P_INDICATOR(v))
i |= UHD_PORT_IND;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i);
/* XXX can't find out? */
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 200;
/* XXX don't know if ports are removable or not */
sc->sc_hub_desc.hubd.bDescLength =
8 + ((sc->sc_noport + 7) / 8);
len = sc->sc_hub_desc.hubd.bDescLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "get port status i=%d\n",
index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = EOREAD4(sc, EHCI_PORTSC(index));
DPRINTFN(9, "port status=0x%04x\n", v);
if (sc->sc_flags & EHCI_SCFLG_TT) {
if (sc->sc_vendor_get_port_speed != NULL) {
i = sc->sc_vendor_get_port_speed(sc, index);
} else {
device_printf(sc->sc_bus.bdev,
"EHCI_SCFLG_TT quirk is set but "
"sc_vendor_get_hub_speed() is NULL\n");
i = UPS_HIGH_SPEED;
}
} else {
i = UPS_HIGH_SPEED;
}
if (v & EHCI_PS_CS)
i |= UPS_CURRENT_CONNECT_STATUS;
if (v & EHCI_PS_PE)
i |= UPS_PORT_ENABLED;
if ((v & EHCI_PS_SUSP) && !(v & EHCI_PS_FPR))
i |= UPS_SUSPEND;
if (v & EHCI_PS_OCA)
i |= UPS_OVERCURRENT_INDICATOR;
if (v & EHCI_PS_PR)
i |= UPS_RESET;
if (v & EHCI_PS_PP)
i |= UPS_PORT_POWER;
USETW(sc->sc_hub_desc.ps.wPortStatus, i);
i = 0;
if (v & EHCI_PS_CSC)
i |= UPS_C_CONNECT_STATUS;
if (v & EHCI_PS_PEC)
i |= UPS_C_PORT_ENABLED;
if (v & EHCI_PS_OCC)
i |= UPS_C_OVERCURRENT_INDICATOR;
if (v & EHCI_PS_FPR)
i |= UPS_C_SUSPEND;
if (sc->sc_isreset)
i |= UPS_C_PORT_RESET;
USETW(sc->sc_hub_desc.ps.wPortChange, i);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = EHCI_PORTSC(index);
v = EOREAD4(sc, port) & ~EHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_ENABLE:
EOWRITE4(sc, port, v | EHCI_PS_PE);
break;
case UHF_PORT_SUSPEND:
EOWRITE4(sc, port, v | EHCI_PS_SUSP);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
#ifdef USB_DEBUG
if (ehcinohighspeed) {
/*
* Connect USB device to companion
* controller.
*/
ehci_disown(sc, index, 1);
break;
}
#endif
if (EHCI_PS_IS_LOWSPEED(v) &&
(sc->sc_flags & EHCI_SCFLG_TT) == 0) {
/* Low speed device, give up ownership. */
ehci_disown(sc, index, 1);
break;
}
/* Start reset sequence. */
v &= ~(EHCI_PS_PE | EHCI_PS_PR);
EOWRITE4(sc, port, v | EHCI_PS_PR);
/* Wait for reset to complete. */
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(usb_port_root_reset_delay));
/* Terminate reset sequence. */
if (!(sc->sc_flags & EHCI_SCFLG_NORESTERM))
EOWRITE4(sc, port, v);
/* Wait for HC to complete reset. */
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(EHCI_PORT_RESET_COMPLETE));
v = EOREAD4(sc, port);
DPRINTF("ehci after reset, status=0x%08x\n", v);
if (v & EHCI_PS_PR) {
device_printf(sc->sc_bus.bdev,
"port reset timeout\n");
err = USB_ERR_TIMEOUT;
goto done;
}
if (!(v & EHCI_PS_PE) &&
(sc->sc_flags & EHCI_SCFLG_TT) == 0) {
/* Not a high speed device, give up ownership. */
ehci_disown(sc, index, 0);
break;
}
sc->sc_isreset = 1;
DPRINTF("ehci port %d reset, status = 0x%08x\n",
index, v);
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
EOWRITE4(sc, port, v | EHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "set port test %d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "set port ind %d\n", index);
EOWRITE4(sc, port, v | EHCI_PS_PIC);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER):
case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER):
case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER):
case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER):
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
ehci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
ehci_softc_t *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t nqtd;
uint32_t nqh;
uint32_t nsitd;
uint32_t nitd;
uint32_t n;
sc = EHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
nqtd = 0;
nqh = 0;
nsitd = 0;
nitd = 0;
/*
* compute maximum number of some structures
*/
if (parm->methods == &ehci_device_ctrl_methods) {
/*
* The proof for the "nqtd" formula is illustrated like
* this:
*
* +------------------------------------+
* | |
* | |remainder -> |
* | +-----+---+ |
* | | xxx | x | frm 0 |
* | +-----+---++ |
* | | xxx | xx | frm 1 |
* | +-----+----+ |
* | ... |
* +------------------------------------+
*
* "xxx" means a completely full USB transfer descriptor
*
* "x" and "xx" means a short USB packet
*
* For the remainder of an USB transfer modulo
* "max_data_length" we need two USB transfer descriptors.
* One to transfer the remaining data and one to finalise
* with a zero length packet in case the "force_short_xfer"
* flag is set. We only need two USB transfer descriptors in
* the case where the transfer length of the first one is a
* factor of "max_frame_size". The rest of the needed USB
* transfer descriptors is given by the buffer size divided
* by the maximum data payload.
*/
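/*
 * Illustrative example with hypothetical numbers: a control transfer
 * with two frames and a data buffer four times "max_hc_frame_size"
 * long yields (2 * 2) + 1 + 4 = 9 qTDs with the formula below.
 */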
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_bulk_methods) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_intr_methods) {
if (parm->speed == USB_SPEED_HIGH) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 3;
} else if (parm->speed == USB_SPEED_FULL) {
parm->hc_max_packet_size = USB_FS_BYTES_PER_HS_UFRAME;
parm->hc_max_packet_count = 1;
} else {
parm->hc_max_packet_size = USB_FS_BYTES_PER_HS_UFRAME / 8;
parm->hc_max_packet_count = 1;
}
parm->hc_max_frame_size = EHCI_QTD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nqh = 1;
nqtd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (parm->methods == &ehci_device_isoc_fs_methods) {
parm->hc_max_packet_size = 0x3FF;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = 0x3FF;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nsitd = xfer->nframes;
} else if (parm->methods == &ehci_device_isoc_hs_methods) {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 3;
parm->hc_max_frame_size = 0xC00;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
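/*
 * One iTD describes one frame (8 microframes). When the endpoint is
 * polled less often than every microframe, each iTD carries fewer
 * transfer frames, hence the additional shift by the value returned
 * from "usbd_xfer_get_fps_shift()".
 */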
nitd = ((xfer->nframes + 7) / 8) <<
usbd_xfer_get_fps_shift(xfer);
} else {
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = 0x400;
usbd_transfer_setup_sub(parm);
}
alloc_dma_set:
if (parm->err) {
return;
}
/*
* Allocate queue heads and transfer descriptors
*/
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_itd_t),
EHCI_ITD_ALIGN, nitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nitd; n++) {
ehci_itd_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->itd_self = htohc32(sc, page_info.physaddr | EHCI_LINK_ITD);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_sitd_t),
EHCI_SITD_ALIGN, nsitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nsitd; n++) {
ehci_sitd_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->sitd_self = htohc32(sc, page_info.physaddr | EHCI_LINK_SITD);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_qtd_t),
EHCI_QTD_ALIGN, nqtd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqtd; n++) {
ehci_qtd_t *qtd;
usbd_get_page(pc + n, 0, &page_info);
qtd = page_info.buffer;
/* init TD */
qtd->qtd_self = htohc32(sc, page_info.physaddr);
qtd->obj_next = last_obj;
qtd->page_cache = pc + n;
last_obj = qtd;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ehci_qh_t),
EHCI_QH_ALIGN, nqh)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqh; n++) {
ehci_qh_t *qh;
usbd_get_page(pc + n, 0, &page_info);
qh = page_info.buffer;
/* init QH */
qh->qh_self = htohc32(sc, page_info.physaddr | EHCI_LINK_QH);
qh->obj_next = last_obj;
qh->page_cache = pc + n;
last_obj = qh;
usb_pc_cpu_flush(pc + n);
}
}
xfer->qh_start[xfer->flags_int.curr_dma_set] = last_obj;
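/*
 * Each of the two DMA sets gets its own TD and QH chains; loop back
 * once to allocate the second set.
 */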
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static void
ehci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
ehci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d (%d)\n",
ep, udev->address,
edesc->bEndpointAddress, udev->flags.usb_mode,
sc->sc_addr);
if (udev->device_index != sc->sc_addr) {
if ((udev->speed != USB_SPEED_HIGH) &&
((udev->hs_hub_addr == 0) ||
(udev->hs_port_no == 0) ||
(udev->parent_hs_hub == NULL) ||
(udev->parent_hs_hub->hub == NULL))) {
/* We need a transaction translator */
goto done;
}
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
ep->methods = &ehci_device_ctrl_methods;
break;
case UE_INTERRUPT:
ep->methods = &ehci_device_intr_methods;
break;
case UE_ISOCHRONOUS:
if (udev->speed == USB_SPEED_HIGH) {
ep->methods = &ehci_device_isoc_hs_methods;
} else if (udev->speed == USB_SPEED_FULL) {
ep->methods = &ehci_device_isoc_fs_methods;
}
break;
case UE_BULK:
ep->methods = &ehci_device_bulk_methods;
break;
default:
/* do nothing */
break;
}
}
done:
return;
}
static void
ehci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until the hardware has finished any possible use of
* the transfer descriptor(s) and QH
*/
*pus = (1125); /* microseconds */
}
static void
ehci_device_resume(struct usb_device *udev)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
EHCI_APPEND_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_APPEND_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ehci_device_suspend(struct usb_device *udev)
{
ehci_softc_t *sc = EHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
if ((methods == &ehci_device_bulk_methods) ||
(methods == &ehci_device_ctrl_methods)) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_async_p_last);
}
if (methods == &ehci_device_intr_methods) {
EHCI_REMOVE_QH(xfer->qh_start[xfer->flags_int.curr_dma_set],
sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
}
static void
ehci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct ehci_softc *sc = EHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
case USB_HW_POWER_SHUTDOWN:
ehci_suspend(sc);
break;
case USB_HW_POWER_RESUME:
ehci_resume(sc);
break;
default:
break;
}
}
static void
ehci_set_hw_power(struct usb_bus *bus)
{
ehci_softc_t *sc = EHCI_BUS2SC(bus);
uint32_t temp;
uint32_t flags;
DPRINTF("\n");
USB_BUS_LOCK(bus);
flags = bus->hw_power_state;
temp = EOREAD4(sc, EHCI_USBCMD);
temp &= ~(EHCI_CMD_ASE | EHCI_CMD_PSE);
if (flags & (USB_HW_POWER_CONTROL |
USB_HW_POWER_BULK)) {
DPRINTF("Async is active\n");
temp |= EHCI_CMD_ASE;
}
if (flags & (USB_HW_POWER_INTERRUPT |
USB_HW_POWER_ISOC)) {
DPRINTF("Periodic is active\n");
temp |= EHCI_CMD_PSE;
}
EOWRITE4(sc, EHCI_USBCMD, temp);
USB_BUS_UNLOCK(bus);
return;
}
static void
ehci_start_dma_delay_second(struct usb_xfer *xfer)
{
struct ehci_softc *sc = EHCI_BUS2SC(xfer->xroot->bus);
DPRINTF("\n");
/* trigger doorbell */
ehci_doorbell_async(sc);
/* give the doorbell 4ms */
usbd_transfer_timeout_ms(xfer,
(void (*)(void *))&usb_dma_delay_done_cb, 4);
}
/*
* Ring the doorbell twice before freeing any DMA descriptors. Some host
* controllers apparently cache the QH descriptors and need a message
* that the cache needs to be discarded.
*/
static void
ehci_start_dma_delay(struct usb_xfer *xfer)
{
struct ehci_softc *sc = EHCI_BUS2SC(xfer->xroot->bus);
DPRINTF("\n");
/* trigger doorbell */
ehci_doorbell_async(sc);
/* give the doorbell 4ms */
usbd_transfer_timeout_ms(xfer,
(void (*)(void *))&ehci_start_dma_delay_second, 4);
}
static const struct usb_bus_methods ehci_bus_methods =
{
.endpoint_init = ehci_ep_init,
.xfer_setup = ehci_xfer_setup,
.xfer_unsetup = ehci_xfer_unsetup,
.get_dma_delay = ehci_get_dma_delay,
.device_resume = ehci_device_resume,
.device_suspend = ehci_device_suspend,
.set_hw_power = ehci_set_hw_power,
.set_hw_power_sleep = ehci_set_hw_power_sleep,
.roothub_exec = ehci_roothub_exec,
.xfer_poll = ehci_do_poll,
.start_dma_delay = ehci_start_dma_delay,
};
Index: head/sys/dev/usb/controller/ohci.c
===================================================================
--- head/sys/dev/usb/controller/ohci.c (revision 327172)
+++ head/sys/dev/usb/controller/ohci.c (revision 327173)
@@ -1,2736 +1,2734 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
* Copyright (c) 1998 The NetBSD Foundation, Inc. All rights reserved.
* Copyright (c) 1998 Lennart Augustsson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB Open Host Controller driver.
*
* OHCI spec: http://www.compaq.com/productinfo/development/openhci.html
* USB spec: http://www.usb.org/developers/docs/usbspec.zip
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR ohcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/ohci.h>
#include <dev/usb/controller/ohcireg.h>
#define OHCI_BUS2SC(bus) \
((ohci_softc_t *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((ohci_softc_t *)0)->sc_bus))))
#ifdef USB_DEBUG
static int ohcidebug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, ohci, CTLFLAG_RW, 0, "USB ohci");
SYSCTL_INT(_hw_usb_ohci, OID_AUTO, debug, CTLFLAG_RWTUN,
&ohcidebug, 0, "ohci debug level");
static void ohci_dumpregs(ohci_softc_t *);
static void ohci_dump_tds(ohci_td_t *);
static uint8_t ohci_dump_td(ohci_td_t *);
static void ohci_dump_ed(ohci_ed_t *);
static uint8_t ohci_dump_itd(ohci_itd_t *);
static void ohci_dump_itds(ohci_itd_t *);
#endif
#define OBARR(sc) bus_space_barrier((sc)->sc_io_tag, (sc)->sc_io_hdl, 0, (sc)->sc_io_size, \
BUS_SPACE_BARRIER_READ|BUS_SPACE_BARRIER_WRITE)
#define OWRITE1(sc, r, x) \
do { OBARR(sc); bus_space_write_1((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OWRITE2(sc, r, x) \
do { OBARR(sc); bus_space_write_2((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OWRITE4(sc, r, x) \
do { OBARR(sc); bus_space_write_4((sc)->sc_io_tag, (sc)->sc_io_hdl, (r), (x)); } while (0)
#define OREAD1(sc, r) (OBARR(sc), bus_space_read_1((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OREAD2(sc, r) (OBARR(sc), bus_space_read_2((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OREAD4(sc, r) (OBARR(sc), bus_space_read_4((sc)->sc_io_tag, (sc)->sc_io_hdl, (r)))
#define OHCI_INTR_ENDPT 1
static const struct usb_bus_methods ohci_bus_methods;
static const struct usb_pipe_methods ohci_device_bulk_methods;
static const struct usb_pipe_methods ohci_device_ctrl_methods;
static const struct usb_pipe_methods ohci_device_intr_methods;
static const struct usb_pipe_methods ohci_device_isoc_methods;
static void ohci_do_poll(struct usb_bus *bus);
static void ohci_device_done(struct usb_xfer *xfer, usb_error_t error);
static void ohci_timeout(void *arg);
static uint8_t ohci_check_transfer(struct usb_xfer *xfer);
static void ohci_root_intr(ohci_softc_t *sc);
struct ohci_std_temp {
struct usb_page_cache *pc;
ohci_td_t *td;
ohci_td_t *td_next;
uint32_t average;
uint32_t td_flags;
uint32_t len;
uint16_t max_frame_size;
uint8_t shortpkt;
uint8_t setup_alt_next;
uint8_t last_frame;
};
static struct ohci_hcca *
ohci_get_hcca(ohci_softc_t *sc)
{
usb_pc_cpu_invalidate(&sc->sc_hw.hcca_pc);
return (sc->sc_hcca_p);
}
void
ohci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
uint32_t i;
cb(bus, &sc->sc_hw.hcca_pc, &sc->sc_hw.hcca_pg,
sizeof(ohci_hcca_t), OHCI_HCCA_ALIGN);
cb(bus, &sc->sc_hw.ctrl_start_pc, &sc->sc_hw.ctrl_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
cb(bus, &sc->sc_hw.bulk_start_pc, &sc->sc_hw.bulk_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
cb(bus, &sc->sc_hw.isoc_start_pc, &sc->sc_hw.isoc_start_pg,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
for (i = 0; i != OHCI_NO_EDS; i++) {
cb(bus, sc->sc_hw.intr_start_pc + i, sc->sc_hw.intr_start_pg + i,
sizeof(ohci_ed_t), OHCI_ED_ALIGN);
}
}
static usb_error_t
ohci_controller_init(ohci_softc_t *sc, int do_suspend)
{
struct usb_page_search buf_res;
uint32_t i;
uint32_t ctl;
uint32_t ival;
uint32_t hcr;
uint32_t fm;
uint32_t per;
uint32_t desca;
/* Determine in what context we are running. */
ctl = OREAD4(sc, OHCI_CONTROL);
if (ctl & OHCI_IR) {
/* SMM active, request change */
DPRINTF("SMM active, request owner change\n");
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_OCR);
for (i = 0; (i < 100) && (ctl & OHCI_IR); i++) {
usb_pause_mtx(NULL, hz / 1000);
ctl = OREAD4(sc, OHCI_CONTROL);
}
if (ctl & OHCI_IR) {
device_printf(sc->sc_bus.bdev,
"SMM does not respond, resetting\n");
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
goto reset;
}
} else {
DPRINTF("cold started\n");
reset:
/* controller was cold started */
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(USB_BUS_RESET_DELAY));
}
/*
* This reset should not be necessary according to the OHCI spec, but
* without it some controllers do not start.
*/
DPRINTF("%s: resetting\n", device_get_nameunit(sc->sc_bus.bdev));
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(USB_BUS_RESET_DELAY));
/* we now own the host controller and the bus has been reset */
ival = OHCI_GET_IVAL(OREAD4(sc, OHCI_FM_INTERVAL));
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_HCR); /* Reset HC */
/* nominal time for a reset is 10 us */
for (i = 0; i < 10; i++) {
DELAY(10);
hcr = OREAD4(sc, OHCI_COMMAND_STATUS) & OHCI_HCR;
if (!hcr) {
break;
}
}
if (hcr) {
device_printf(sc->sc_bus.bdev, "reset timeout\n");
return (USB_ERR_IOERROR);
}
#ifdef USB_DEBUG
if (ohcidebug > 15) {
ohci_dumpregs(sc);
}
#endif
if (do_suspend) {
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_SUSPEND);
return (USB_ERR_NORMAL_COMPLETION);
}
/* The controller is now in the SUSPEND state; we have 2ms to finish. */
/* set up HC registers */
usbd_get_page(&sc->sc_hw.hcca_pc, 0, &buf_res);
OWRITE4(sc, OHCI_HCCA, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.ctrl_start_pc, 0, &buf_res);
OWRITE4(sc, OHCI_CONTROL_HEAD_ED, buf_res.physaddr);
usbd_get_page(&sc->sc_hw.bulk_start_pc, 0, &buf_res);
OWRITE4(sc, OHCI_BULK_HEAD_ED, buf_res.physaddr);
/* disable all interrupts and then switch on all desired interrupts */
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
OWRITE4(sc, OHCI_INTERRUPT_ENABLE, sc->sc_eintrs | OHCI_MIE);
/* switch on desired functional features */
ctl = OREAD4(sc, OHCI_CONTROL);
ctl &= ~(OHCI_CBSR_MASK | OHCI_LES | OHCI_HCFS_MASK | OHCI_IR);
ctl |= OHCI_PLE | OHCI_IE | OHCI_CLE | OHCI_BLE |
OHCI_RATIO_1_4 | OHCI_HCFS_OPERATIONAL;
/* And finally start it! */
OWRITE4(sc, OHCI_CONTROL, ctl);
/*
* The controller is now OPERATIONAL. Set some final
* registers that should be set earlier, but that the
* controller ignores when in the SUSPEND state.
*/
fm = (OREAD4(sc, OHCI_FM_INTERVAL) & OHCI_FIT) ^ OHCI_FIT;
fm |= OHCI_FSMPS(ival) | ival;
OWRITE4(sc, OHCI_FM_INTERVAL, fm);
per = OHCI_PERIODIC(ival); /* 90% periodic */
OWRITE4(sc, OHCI_PERIODIC_START, per);
/* Fiddle the No OverCurrent Protection bit to avoid a chip bug. */
desca = OREAD4(sc, OHCI_RH_DESCRIPTOR_A);
OWRITE4(sc, OHCI_RH_DESCRIPTOR_A, desca | OHCI_NOCP);
OWRITE4(sc, OHCI_RH_STATUS, OHCI_LPSC); /* Enable port power */
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(OHCI_ENABLE_POWER_DELAY));
OWRITE4(sc, OHCI_RH_DESCRIPTOR_A, desca);
/*
* The AMD756 requires a delay before re-reading the register,
* otherwise it will occasionally report 0 ports.
*/
sc->sc_noport = 0;
for (i = 0; (i < 10) && (sc->sc_noport == 0); i++) {
usb_pause_mtx(NULL,
USB_MS_TO_TICKS(OHCI_READ_DESC_DELAY));
sc->sc_noport = OHCI_GET_NDP(OREAD4(sc, OHCI_RH_DESCRIPTOR_A));
}
#ifdef USB_DEBUG
if (ohcidebug > 5) {
ohci_dumpregs(sc);
}
#endif
return (USB_ERR_NORMAL_COMPLETION);
}
static struct ohci_ed *
ohci_init_ed(struct usb_page_cache *pc)
{
struct usb_page_search buf_res;
struct ohci_ed *ed;
usbd_get_page(pc, 0, &buf_res);
ed = buf_res.buffer;
ed->ed_self = htole32(buf_res.physaddr);
ed->ed_flags = htole32(OHCI_ED_SKIP);
ed->page_cache = pc;
return (ed);
}
usb_error_t
ohci_init(ohci_softc_t *sc)
{
struct usb_page_search buf_res;
uint16_t i;
uint16_t bit;
uint16_t x;
uint16_t y;
DPRINTF("start\n");
sc->sc_eintrs = OHCI_NORMAL_INTRS;
/*
* Setup all ED's
*/
sc->sc_ctrl_p_last =
ohci_init_ed(&sc->sc_hw.ctrl_start_pc);
sc->sc_bulk_p_last =
ohci_init_ed(&sc->sc_hw.bulk_start_pc);
sc->sc_isoc_p_last =
ohci_init_ed(&sc->sc_hw.isoc_start_pc);
for (i = 0; i != OHCI_NO_EDS; i++) {
sc->sc_intr_p_last[i] =
ohci_init_ed(sc->sc_hw.intr_start_pc + i);
}
/*
* the QHs are arranged to give poll intervals that are
* powers of 2 times 1ms
*/
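/*
 * Each interrupt ED at a given level links to an ED with half the
 * poll interval, so the EDs form a binary tree that funnels into the
 * 1ms ED at index 0 below.
 */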
bit = OHCI_NO_EDS / 2;
while (bit) {
x = bit;
while (x & bit) {
ohci_ed_t *ed_x;
ohci_ed_t *ed_y;
y = (x ^ bit) | (bit / 2);
/*
* the next QH has half the poll interval
*/
ed_x = sc->sc_intr_p_last[x];
ed_y = sc->sc_intr_p_last[y];
ed_x->next = NULL;
ed_x->ed_next = ed_y->ed_self;
x++;
}
bit >>= 1;
}
if (1) {
ohci_ed_t *ed_int;
ohci_ed_t *ed_isc;
ed_int = sc->sc_intr_p_last[0];
ed_isc = sc->sc_isoc_p_last;
/* the last (1ms) QH */
ed_int->next = ed_isc;
ed_int->ed_next = ed_isc->ed_self;
}
usbd_get_page(&sc->sc_hw.hcca_pc, 0, &buf_res);
sc->sc_hcca_p = buf_res.buffer;
/*
* Fill HCCA interrupt table. The bit reversal is to get
* the tree set up properly to spread the interrupts.
*/
for (i = 0; i != OHCI_NO_INTRS; i++) {
sc->sc_hcca_p->hcca_interrupt_table[i] =
sc->sc_intr_p_last[i | (OHCI_NO_EDS / 2)]->ed_self;
}
/* flush all cache into memory */
usb_bus_mem_flush_all(&sc->sc_bus, &ohci_iterate_hw_softc);
/* set up the bus struct */
sc->sc_bus.methods = &ohci_bus_methods;
usb_callout_init_mtx(&sc->sc_tmo_rhsc, &sc->sc_bus.bus_mtx, 0);
#ifdef USB_DEBUG
if (ohcidebug > 15) {
for (i = 0; i != OHCI_NO_EDS; i++) {
printf("ed#%d ", i);
ohci_dump_ed(sc->sc_intr_p_last[i]);
}
printf("iso ");
ohci_dump_ed(sc->sc_isoc_p_last);
}
#endif
sc->sc_bus.usbrev = USB_REV_1_0;
if (ohci_controller_init(sc, 0) != 0)
return (USB_ERR_INVAL);
/* catch any lost interrupts */
ohci_do_poll(&sc->sc_bus);
return (USB_ERR_NORMAL_COMPLETION);
}
/*
* shut down the controller when the system is going down
*/
void
ohci_detach(struct ohci_softc *sc)
{
USB_BUS_LOCK(&sc->sc_bus);
usb_callout_stop(&sc->sc_tmo_rhsc);
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS);
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
USB_BUS_UNLOCK(&sc->sc_bus);
/* XXX let stray task complete */
usb_pause_mtx(NULL, hz / 20);
usb_callout_drain(&sc->sc_tmo_rhsc);
}
static void
ohci_suspend(ohci_softc_t *sc)
{
DPRINTF("\n");
#ifdef USB_DEBUG
if (ohcidebug > 2)
ohci_dumpregs(sc);
#endif
/* reset HC and leave it suspended */
ohci_controller_init(sc, 1);
}
static void
ohci_resume(ohci_softc_t *sc)
{
DPRINTF("\n");
#ifdef USB_DEBUG
if (ohcidebug > 2)
ohci_dumpregs(sc);
#endif
/* some broken BIOSes never initialize the Controller chip */
ohci_controller_init(sc, 0);
/* catch any lost interrupts */
ohci_do_poll(&sc->sc_bus);
}
#ifdef USB_DEBUG
static void
ohci_dumpregs(ohci_softc_t *sc)
{
struct ohci_hcca *hcca;
DPRINTF("ohci_dumpregs: rev=0x%08x control=0x%08x command=0x%08x\n",
OREAD4(sc, OHCI_REVISION),
OREAD4(sc, OHCI_CONTROL),
OREAD4(sc, OHCI_COMMAND_STATUS));
DPRINTF(" intrstat=0x%08x intre=0x%08x intrd=0x%08x\n",
OREAD4(sc, OHCI_INTERRUPT_STATUS),
OREAD4(sc, OHCI_INTERRUPT_ENABLE),
OREAD4(sc, OHCI_INTERRUPT_DISABLE));
DPRINTF(" hcca=0x%08x percur=0x%08x ctrlhd=0x%08x\n",
OREAD4(sc, OHCI_HCCA),
OREAD4(sc, OHCI_PERIOD_CURRENT_ED),
OREAD4(sc, OHCI_CONTROL_HEAD_ED));
DPRINTF(" ctrlcur=0x%08x bulkhd=0x%08x bulkcur=0x%08x\n",
OREAD4(sc, OHCI_CONTROL_CURRENT_ED),
OREAD4(sc, OHCI_BULK_HEAD_ED),
OREAD4(sc, OHCI_BULK_CURRENT_ED));
DPRINTF(" done=0x%08x fmival=0x%08x fmrem=0x%08x\n",
OREAD4(sc, OHCI_DONE_HEAD),
OREAD4(sc, OHCI_FM_INTERVAL),
OREAD4(sc, OHCI_FM_REMAINING));
DPRINTF(" fmnum=0x%08x perst=0x%08x lsthrs=0x%08x\n",
OREAD4(sc, OHCI_FM_NUMBER),
OREAD4(sc, OHCI_PERIODIC_START),
OREAD4(sc, OHCI_LS_THRESHOLD));
DPRINTF(" desca=0x%08x descb=0x%08x stat=0x%08x\n",
OREAD4(sc, OHCI_RH_DESCRIPTOR_A),
OREAD4(sc, OHCI_RH_DESCRIPTOR_B),
OREAD4(sc, OHCI_RH_STATUS));
DPRINTF(" port1=0x%08x port2=0x%08x\n",
OREAD4(sc, OHCI_RH_PORT_STATUS(1)),
OREAD4(sc, OHCI_RH_PORT_STATUS(2)));
hcca = ohci_get_hcca(sc);
DPRINTF(" HCCA: frame_number=0x%04x done_head=0x%08x\n",
le32toh(hcca->hcca_frame_number),
le32toh(hcca->hcca_done_head));
}
static void
ohci_dump_tds(ohci_td_t *std)
{
for (; std; std = std->obj_next) {
if (ohci_dump_td(std)) {
break;
}
}
}
static uint8_t
ohci_dump_td(ohci_td_t *std)
{
uint32_t td_flags;
uint8_t temp;
usb_pc_cpu_invalidate(std->page_cache);
td_flags = le32toh(std->td_flags);
temp = (std->td_next == 0);
printf("TD(%p) at 0x%08x: %s%s%s%s%s delay=%d ec=%d "
"cc=%d\ncbp=0x%08x next=0x%08x be=0x%08x\n",
std, le32toh(std->td_self),
(td_flags & OHCI_TD_R) ? "-R" : "",
(td_flags & OHCI_TD_OUT) ? "-OUT" : "",
(td_flags & OHCI_TD_IN) ? "-IN" : "",
((td_flags & OHCI_TD_TOGGLE_MASK) == OHCI_TD_TOGGLE_1) ? "-TOG1" : "",
((td_flags & OHCI_TD_TOGGLE_MASK) == OHCI_TD_TOGGLE_0) ? "-TOG0" : "",
OHCI_TD_GET_DI(td_flags),
OHCI_TD_GET_EC(td_flags),
OHCI_TD_GET_CC(td_flags),
le32toh(std->td_cbp),
le32toh(std->td_next),
le32toh(std->td_be));
return (temp);
}
static uint8_t
ohci_dump_itd(ohci_itd_t *sitd)
{
uint32_t itd_flags;
uint16_t i;
uint8_t temp;
usb_pc_cpu_invalidate(sitd->page_cache);
itd_flags = le32toh(sitd->itd_flags);
temp = (sitd->itd_next == 0);
printf("ITD(%p) at 0x%08x: sf=%d di=%d fc=%d cc=%d\n"
"bp0=0x%08x next=0x%08x be=0x%08x\n",
sitd, le32toh(sitd->itd_self),
OHCI_ITD_GET_SF(itd_flags),
OHCI_ITD_GET_DI(itd_flags),
OHCI_ITD_GET_FC(itd_flags),
OHCI_ITD_GET_CC(itd_flags),
le32toh(sitd->itd_bp0),
le32toh(sitd->itd_next),
le32toh(sitd->itd_be));
for (i = 0; i < OHCI_ITD_NOFFSET; i++) {
printf("offs[%d]=0x%04x ", i,
(uint32_t)le16toh(sitd->itd_offset[i]));
}
printf("\n");
return (temp);
}
static void
ohci_dump_itds(ohci_itd_t *sitd)
{
for (; sitd; sitd = sitd->obj_next) {
if (ohci_dump_itd(sitd)) {
break;
}
}
}
static void
ohci_dump_ed(ohci_ed_t *sed)
{
uint32_t ed_flags;
uint32_t ed_headp;
usb_pc_cpu_invalidate(sed->page_cache);
ed_flags = le32toh(sed->ed_flags);
ed_headp = le32toh(sed->ed_headp);
printf("ED(%p) at 0x%08x: addr=%d endpt=%d maxp=%d flags=%s%s%s%s%s\n"
"tailp=0x%08x headflags=%s%s headp=0x%08x nexted=0x%08x\n",
sed, le32toh(sed->ed_self),
OHCI_ED_GET_FA(ed_flags),
OHCI_ED_GET_EN(ed_flags),
OHCI_ED_GET_MAXP(ed_flags),
(ed_flags & OHCI_ED_DIR_OUT) ? "-OUT" : "",
(ed_flags & OHCI_ED_DIR_IN) ? "-IN" : "",
(ed_flags & OHCI_ED_SPEED) ? "-LOWSPEED" : "",
(ed_flags & OHCI_ED_SKIP) ? "-SKIP" : "",
(ed_flags & OHCI_ED_FORMAT_ISO) ? "-ISO" : "",
le32toh(sed->ed_tailp),
(ed_headp & OHCI_HALTED) ? "-HALTED" : "",
(ed_headp & OHCI_TOGGLECARRY) ? "-CARRY" : "",
le32toh(sed->ed_headp),
le32toh(sed->ed_next));
}
#endif
static void
ohci_transfer_intr_enqueue(struct usb_xfer *xfer)
{
/* check for early completion */
if (ohci_check_transfer(xfer)) {
return;
}
/* put transfer on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0) {
usbd_transfer_timeout_ms(xfer, &ohci_timeout, xfer->timeout);
}
}
#define OHCI_APPEND_QH(sed,last) (last) = _ohci_append_qh(sed,last)
static ohci_ed_t *
_ohci_append_qh(ohci_ed_t *sed, ohci_ed_t *last)
{
DPRINTFN(11, "%p to %p\n", sed, last);
if (sed->prev != NULL) {
/* should not happen */
DPRINTFN(0, "ED already linked!\n");
return (last);
}
/* (sc->sc_bus.bus_mtx) must be locked */
sed->next = last->next;
sed->ed_next = last->ed_next;
sed->ed_tailp = 0;
sed->prev = last;
usb_pc_cpu_flush(sed->page_cache);
/*
* the last->next->prev is never followed: sed->next->prev = sed;
*/
last->next = sed;
last->ed_next = sed->ed_self;
usb_pc_cpu_flush(last->page_cache);
return (sed);
}
#define OHCI_REMOVE_QH(sed,last) (last) = _ohci_remove_qh(sed,last)
static ohci_ed_t *
_ohci_remove_qh(ohci_ed_t *sed, ohci_ed_t *last)
{
DPRINTFN(11, "%p from %p\n", sed, last);
/* (sc->sc_bus.bus_mtx) must be locked */
/* only remove if not removed from a queue */
if (sed->prev) {
sed->prev->next = sed->next;
sed->prev->ed_next = sed->ed_next;
usb_pc_cpu_flush(sed->prev->page_cache);
if (sed->next) {
sed->next->prev = sed->prev;
usb_pc_cpu_flush(sed->next->page_cache);
}
last = ((last == sed) ? sed->prev : last);
sed->prev = 0;
usb_pc_cpu_flush(sed->page_cache);
}
return (last);
}
static void
ohci_isoc_done(struct usb_xfer *xfer)
{
uint8_t nframes;
uint32_t *plen = xfer->frlengths;
volatile uint16_t *olen;
uint16_t len = 0;
ohci_itd_t *td = xfer->td_transfer_first;
while (1) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
#ifdef USB_DEBUG
if (ohcidebug > 5) {
DPRINTF("isoc TD\n");
ohci_dump_itd(td);
}
#endif
usb_pc_cpu_invalidate(td->page_cache);
nframes = td->frames;
olen = &td->itd_offset[0];
if (nframes > 8) {
nframes = 8;
}
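/*
 * Each packet status word holds the completion code in its upper
 * four bits and the transferred length in the lower twelve bits.
 */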
while (nframes--) {
len = le16toh(*olen);
if ((len >> 12) == OHCI_CC_NOT_ACCESSED) {
len = 0;
} else {
len &= ((1 << 12) - 1);
}
if (len > *plen) {
len = 0;/* invalid length */
}
*plen = len;
plen++;
olen++;
}
if (((void *)td) == xfer->td_transfer_last) {
break;
}
td = td->obj_next;
}
xfer->aframes = xfer->nframes;
ohci_device_done(xfer, USB_ERR_NORMAL_COMPLETION);
}
#ifdef USB_DEBUG
static const char *const
ohci_cc_strs[] =
{
"NO_ERROR",
"CRC",
"BIT_STUFFING",
"DATA_TOGGLE_MISMATCH",
"STALL",
"DEVICE_NOT_RESPONDING",
"PID_CHECK_FAILURE",
"UNEXPECTED_PID",
"DATA_OVERRUN",
"DATA_UNDERRUN",
"BUFFER_OVERRUN",
"BUFFER_UNDERRUN",
"reserved",
"reserved",
"NOT_ACCESSED",
"NOT_ACCESSED"
};
#endif
static usb_error_t
ohci_non_isoc_done_sub(struct usb_xfer *xfer)
{
ohci_td_t *td;
ohci_td_t *td_alt_next;
uint32_t temp;
uint32_t phy_start;
uint32_t phy_end;
uint32_t td_flags;
uint16_t cc;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
td_flags = 0;
if (xfer->aframes != xfer->nframes) {
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
}
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
phy_start = le32toh(td->td_cbp);
td_flags = le32toh(td->td_flags);
cc = OHCI_TD_GET_CC(td_flags);
if (phy_start) {
/*
* short transfer - compute the number of remaining
* bytes in the hardware buffer:
*/
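/*
 * "td_cbp" and "td_be" may point into different 4K pages; in that
 * case the buffer crosses a page boundary, which the extra
 * OHCI_PAGE_SIZE term below accounts for.
 */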
phy_end = le32toh(td->td_be);
temp = (OHCI_PAGE(phy_start ^ phy_end) ?
(OHCI_PAGE_SIZE + 1) : 0x0001);
temp += OHCI_PAGE_OFFSET(phy_end);
temp -= OHCI_PAGE_OFFSET(phy_start);
if (temp > td->len) {
/* guard against corruption */
cc = OHCI_CC_STALL;
} else if (xfer->aframes != xfer->nframes) {
/*
* Sum up total transfer length
* in "frlengths[]":
*/
xfer->frlengths[xfer->aframes] += td->len - temp;
}
} else {
if (xfer->aframes != xfer->nframes) {
/* transfer was complete */
xfer->frlengths[xfer->aframes] += td->len;
}
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check transfer status */
if (cc) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (phy_start) {
if (xfer->flags_int.short_frames_ok) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
DPRINTFN(16, "error cc=%d (%s)\n",
cc, ohci_cc_strs[cc]);
return ((cc == 0) ? USB_ERR_NORMAL_COMPLETION :
(cc == OHCI_CC_STALL) ? USB_ERR_STALLED : USB_ERR_IOERROR);
}
static void
ohci_non_isoc_done(struct usb_xfer *xfer)
{
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
#ifdef USB_DEBUG
if (ohcidebug > 10) {
ohci_dump_tds(xfer->td_transfer_first);
}
#endif
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
err = ohci_non_isoc_done_sub(xfer);
}
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
while (xfer->aframes != xfer->nframes) {
err = ohci_non_isoc_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL) {
goto done;
}
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
err = ohci_non_isoc_done_sub(xfer);
}
done:
ohci_device_done(xfer, err);
}
/*------------------------------------------------------------------------*
* ohci_check_transfer_sub
*------------------------------------------------------------------------*/
static void
ohci_check_transfer_sub(struct usb_xfer *xfer)
{
ohci_td_t *td;
ohci_ed_t *ed;
uint32_t phy_start;
uint32_t td_flags;
uint32_t td_next;
uint16_t cc;
td = xfer->td_transfer_cache;
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
phy_start = le32toh(td->td_cbp);
td_flags = le32toh(td->td_flags);
td_next = le32toh(td->td_next);
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check transfer status */
cc = OHCI_TD_GET_CC(td_flags);
if (cc) {
/* the transfer is finished */
td = NULL;
break;
}
/*
* Check if we reached the last packet
* or if there is a short packet:
*/
if (((td_next & (~0xF)) == OHCI_TD_NEXT_END) || phy_start) {
/* follow alt next */
td = td->alt_next;
break;
}
td = td->obj_next;
}
/* update transfer cache */
xfer->td_transfer_cache = td;
if (td) {
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
ed->ed_headp = td->td_self;
usb_pc_cpu_flush(ed->page_cache);
DPRINTFN(13, "xfer=%p following alt next\n", xfer);
/*
* Make sure that the OHCI re-scans the schedule by
* writing the BLF and CLF bits:
*/
if (xfer->xroot->udev->flags.self_suspended) {
/* nothing to do */
} else if (xfer->endpoint->methods == &ohci_device_bulk_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
} else if (xfer->endpoint->methods == &ohci_device_ctrl_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
}
}
/*------------------------------------------------------------------------*
* ohci_check_transfer
*
* Return values:
* 0: USB transfer is not finished
* Else: USB transfer is finished
*------------------------------------------------------------------------*/
static uint8_t
ohci_check_transfer(struct usb_xfer *xfer)
{
ohci_ed_t *ed;
uint32_t ed_headp;
uint32_t ed_tailp;
DPRINTFN(13, "xfer=%p checking transfer\n", xfer);
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
usb_pc_cpu_invalidate(ed->page_cache);
ed_headp = le32toh(ed->ed_headp);
ed_tailp = le32toh(ed->ed_tailp);
if ((ed_headp & OHCI_HALTED) ||
(((ed_headp ^ ed_tailp) & (~0xF)) == 0)) {
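/*
 * The endpoint is considered done when the HC has halted it or
 * when the TD head pointer has caught up with the tail pointer;
 * the low bits of ed_headp carry the halted and toggle-carry
 * flags, so the low nibble is masked off before comparing.
 */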
if (xfer->endpoint->methods == &ohci_device_isoc_methods) {
/* isochronous transfer */
ohci_isoc_done(xfer);
} else {
if (xfer->flags_int.short_frames_ok) {
ohci_check_transfer_sub(xfer);
if (xfer->td_transfer_cache) {
/* not finished yet */
return (0);
}
}
/* store data-toggle */
if (ed_headp & OHCI_TOGGLECARRY) {
xfer->endpoint->toggle_next = 1;
} else {
xfer->endpoint->toggle_next = 0;
}
/* non-isochronous transfer */
ohci_non_isoc_done(xfer);
}
return (1);
}
DPRINTFN(13, "xfer=%p is still active\n", xfer);
return (0);
}
static void
ohci_rhsc_enable(ohci_softc_t *sc)
{
DPRINTFN(5, "\n");
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
sc->sc_eintrs |= OHCI_RHSC;
OWRITE4(sc, OHCI_INTERRUPT_ENABLE, OHCI_RHSC);
/* acknowledge any RHSC interrupt */
OWRITE4(sc, OHCI_INTERRUPT_STATUS, OHCI_RHSC);
ohci_root_intr(sc);
}
static void
ohci_interrupt_poll(ohci_softc_t *sc)
{
struct usb_xfer *xfer;
repeat:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/*
* check if transfer is transferred
*/
if (ohci_check_transfer(xfer)) {
/* queue has been modified */
goto repeat;
}
}
}
/*------------------------------------------------------------------------*
* ohci_interrupt - OHCI interrupt handler
*
* NOTE: Do not access "sc->sc_bus.bdev" inside the interrupt handler,
* because the interrupt handler is set up before "sc->sc_bus.bdev"
* is present!
*------------------------------------------------------------------------*/
void
ohci_interrupt(ohci_softc_t *sc)
{
struct ohci_hcca *hcca;
uint32_t status;
uint32_t done;
USB_BUS_LOCK(&sc->sc_bus);
hcca = ohci_get_hcca(sc);
DPRINTFN(16, "real interrupt\n");
#ifdef USB_DEBUG
if (ohcidebug > 15) {
ohci_dumpregs(sc);
}
#endif
done = le32toh(hcca->hcca_done_head);
/*
* The LSb of done is used to inform the HC Driver that an interrupt
* condition exists for both the Done list and for another event
* recorded in HcInterruptStatus. On an interrupt from the HC, the
* HC Driver checks the HccaDoneHead Value. If this value is 0, then
* the interrupt was caused by other than the HccaDoneHead update
* and the HcInterruptStatus register needs to be accessed to
determine the exact interrupt cause. If HccaDoneHead is nonzero,
* then a Done list update interrupt is indicated and if the LSb of
* done is nonzero, then an additional interrupt event is indicated
* and HcInterruptStatus should be checked to determine its cause.
*/
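/*
 * Illustration (example values only): done == 0x00ABC001 means
 * the done queue starts at physical address 0x00ABC000 and the
 * set LSb indicates that HcInterruptStatus must be read as well.
 */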
if (done != 0) {
status = 0;
if (done & ~OHCI_DONE_INTRS) {
status |= OHCI_WDH;
}
if (done & OHCI_DONE_INTRS) {
status |= OREAD4(sc, OHCI_INTERRUPT_STATUS);
}
hcca->hcca_done_head = 0;
usb_pc_cpu_flush(&sc->sc_hw.hcca_pc);
} else {
status = OREAD4(sc, OHCI_INTERRUPT_STATUS) & ~OHCI_WDH;
}
status &= ~OHCI_MIE;
if (status == 0) {
/*
* nothing to be done (PCI shared
* interrupt)
*/
goto done;
}
OWRITE4(sc, OHCI_INTERRUPT_STATUS, status); /* Acknowledge */
status &= sc->sc_eintrs;
if (status == 0) {
goto done;
}
if (status & (OHCI_SO | OHCI_RD | OHCI_UE | OHCI_RHSC)) {
#if 0
if (status & OHCI_SO) {
/* XXX do what */
}
#endif
if (status & OHCI_RD) {
printf("%s: resume detect\n", __FUNCTION__);
/* XXX process resume detect */
}
if (status & OHCI_UE) {
printf("%s: unrecoverable error, "
"controller halted\n", __FUNCTION__);
OWRITE4(sc, OHCI_CONTROL, OHCI_HCFS_RESET);
/* XXX what else */
}
if (status & OHCI_RHSC) {
/*
* Disable RHSC interrupt for now, because it will be
* on until the port has been reset.
*/
sc->sc_eintrs &= ~OHCI_RHSC;
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, OHCI_RHSC);
ohci_root_intr(sc);
/* do not allow RHSC interrupts > 1 per second */
usb_callout_reset(&sc->sc_tmo_rhsc, hz,
(void *)&ohci_rhsc_enable, sc);
}
}
status &= ~(OHCI_RHSC | OHCI_WDH | OHCI_SO);
if (status != 0) {
/* Block unprocessed interrupts. XXX */
OWRITE4(sc, OHCI_INTERRUPT_DISABLE, status);
sc->sc_eintrs &= ~status;
printf("%s: blocking intrs 0x%x\n",
__FUNCTION__, status);
}
/* poll all the USB transfers */
ohci_interrupt_poll(sc);
done:
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*
* called when a request does not complete
*/
static void
ohci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* transfer is transferred */
ohci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
ohci_do_poll(struct usb_bus *bus)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
ohci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
static void
ohci_setup_standard_chain_sub(struct ohci_std_temp *temp)
{
struct usb_page_search buf_res;
ohci_td_t *td;
ohci_td_t *td_next;
ohci_td_t *td_alt_next;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint8_t shortpkt_old;
uint8_t precompute;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
precompute = 1;
/* software is used to detect short incoming transfers */
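/*
 * I.e. the bufferRounding (R) bit is set on IN transfers so a
 * short packet is accepted without halting the endpoint; the
 * driver then detects the short transfer from the remaining
 * buffer pointer when the TD completes.
 */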
if ((temp->td_flags & htole32(OHCI_TD_DP_MASK)) == htole32(OHCI_TD_IN)) {
temp->td_flags |= htole32(OHCI_TD_R);
} else {
temp->td_flags &= ~htole32(OHCI_TD_R);
}
restart:
td = temp->td;
td_next = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt) {
break;
}
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_frame_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL) {
panic("%s: out of OHCI transfer descriptors!", __FUNCTION__);
}
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->td_flags = temp->td_flags;
/* the next TD uses TOGGLE_CARRY */
temp->td_flags &= ~htole32(OHCI_TD_TOGGLE_MASK);
if (average == 0) {
/*
* The buffer start and end phys addresses should be
* 0x0 for a zero length packet.
*/
td->td_cbp = 0;
td->td_be = 0;
td->len = 0;
} else {
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->td_cbp = htole32(buf_res.physaddr);
buf_offset += (average - 1);
usbd_get_page(temp->pc, buf_offset, &buf_res);
td->td_be = htole32(buf_res.physaddr);
buf_offset++;
td->len = average;
/* update remaining length */
temp->len -= average;
}
if ((td_next == td_alt_next) && temp->setup_alt_next) {
/* we need to receive these frames one by one! */
td->td_flags &= htole32(~OHCI_TD_INTR_MASK);
td->td_flags |= htole32(OHCI_TD_SET_DI(1));
td->td_next = htole32(OHCI_TD_NEXT_END);
} else {
if (td_next) {
/* link the current TD with the next one */
td->td_next = td_next->td_self;
}
}
td->alt_next = td_alt_next;
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* setup alt next pointer, if any */
if (temp->last_frame) {
/* no alternate next */
td_alt_next = NULL;
} else {
/* we use this field internally */
td_alt_next = td_next;
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
temp->td = td;
temp->td_next = td_next;
}
static void
ohci_setup_standard_chain(struct usb_xfer *xfer, ohci_ed_t **ed_last)
{
struct ohci_std_temp temp;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
ohci_td_t *td;
uint32_t ed_flags;
uint32_t x;
DPRINTFN(9, "addr=%d endpt=%d sumlen=%d speed=%d\n",
xfer->address, UE_GET_ADDR(xfer->endpointno),
xfer->sumlen, usbd_get_speed(xfer->xroot->udev));
temp.average = xfer->max_hc_frame_size;
temp.max_frame_size = xfer->max_frame_size;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
temp.td = NULL;
temp.td_next = td;
temp.last_frame = 0;
temp.setup_alt_next = xfer->flags_int.short_frames_ok;
methods = xfer->endpoint->methods;
/* check if we should prepend a setup message */
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr) {
temp.td_flags = htole32(OHCI_TD_SETUP | OHCI_TD_NOCC |
OHCI_TD_TOGGLE_0 | OHCI_TD_NOINTR);
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
ohci_setup_standard_chain_sub(&temp);
/*
* XXX assume that the setup message is
* contained within one USB packet:
*/
xfer->endpoint->toggle_next = 1;
}
x = 1;
} else {
x = 0;
}
temp.td_flags = htole32(OHCI_TD_NOCC | OHCI_TD_NOINTR);
/* set data toggle */
if (xfer->endpoint->toggle_next) {
temp.td_flags |= htole32(OHCI_TD_TOGGLE_1);
} else {
temp.td_flags |= htole32(OHCI_TD_TOGGLE_0);
}
/* set endpoint direction */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp.td_flags |= htole32(OHCI_TD_IN);
} else {
temp.td_flags |= htole32(OHCI_TD_OUT);
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.pc = xfer->frbuffers + x;
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act) {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
} else {
temp.last_frame = 1;
temp.setup_alt_next = 0;
}
}
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
} else {
/* regular data transfer */
temp.shortpkt = (xfer->flags.force_short_xfer) ? 0 : 1;
}
ohci_setup_standard_chain_sub(&temp);
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current endpoint
* direction.
*/
/* set endpoint direction and data toggle */
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN) {
temp.td_flags = htole32(OHCI_TD_OUT |
OHCI_TD_NOCC | OHCI_TD_TOGGLE_1 | OHCI_TD_SET_DI(1));
} else {
temp.td_flags = htole32(OHCI_TD_IN |
OHCI_TD_NOCC | OHCI_TD_TOGGLE_1 | OHCI_TD_SET_DI(1));
}
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.setup_alt_next = 0;
ohci_setup_standard_chain_sub(&temp);
}
td = temp.td;
/* Ensure that last TD is terminating: */
td->td_next = htole32(OHCI_TD_NEXT_END);
td->td_flags &= ~htole32(OHCI_TD_INTR_MASK);
td->td_flags |= htole32(OHCI_TD_SET_DI(1));
usb_pc_cpu_flush(td->page_cache);
/* must have at least one frame! */
xfer->td_transfer_last = td;
#ifdef USB_DEBUG
if (ohcidebug > 8) {
DPRINTF("nexttog=%d; data before transfer:\n",
xfer->endpoint->toggle_next);
ohci_dump_tds(xfer->td_transfer_first);
}
#endif
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
ed_flags = (OHCI_ED_SET_FA(xfer->address) |
OHCI_ED_SET_EN(UE_GET_ADDR(xfer->endpointno)) |
OHCI_ED_SET_MAXP(xfer->max_frame_size));
ed_flags |= (OHCI_ED_FORMAT_GEN | OHCI_ED_DIR_TD);
if (xfer->xroot->udev->speed == USB_SPEED_LOW) {
ed_flags |= OHCI_ED_SPEED;
}
ed->ed_flags = htole32(ed_flags);
td = xfer->td_transfer_first;
ed->ed_headp = td->td_self;
if (xfer->xroot->udev->flags.self_suspended == 0) {
/* the append function will flush the endpoint descriptor */
OHCI_APPEND_QH(ed, *ed_last);
if (methods == &ohci_device_bulk_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
}
if (methods == &ohci_device_ctrl_methods) {
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
} else {
usb_pc_cpu_flush(ed->page_cache);
}
}
static void
ohci_root_intr(ohci_softc_t *sc)
{
uint32_t hstatus;
uint16_t i;
uint16_t m;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
hstatus = OREAD4(sc, OHCI_RH_STATUS);
DPRINTF("sc=%p hstatus=0x%08x\n",
sc, hstatus);
/* set bits */
m = (sc->sc_noport + 1);
if (m > (8 * sizeof(sc->sc_hub_idata))) {
m = (8 * sizeof(sc->sc_hub_idata));
}
for (i = 1; i < m; i++) {
/* pick out CHANGE bits from the status register */
if (OREAD4(sc, OHCI_RH_PORT_STATUS(i)) >> 16) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
/* NOTE: "done" can be run two times in a row,
* from close and from interrupt
*/
static void
ohci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
const struct usb_pipe_methods *methods = xfer->endpoint->methods;
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
ohci_ed_t *ed;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (ed) {
usb_pc_cpu_invalidate(ed->page_cache);
}
if (methods == &ohci_device_bulk_methods) {
OHCI_REMOVE_QH(ed, sc->sc_bulk_p_last);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_REMOVE_QH(ed, sc->sc_ctrl_p_last);
}
if (methods == &ohci_device_intr_methods) {
OHCI_REMOVE_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
if (methods == &ohci_device_isoc_methods) {
OHCI_REMOVE_QH(ed, sc->sc_isoc_p_last);
}
xfer->td_transfer_first = NULL;
xfer->td_transfer_last = NULL;
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* ohci bulk support
*------------------------------------------------------------------------*/
static void
ohci_device_bulk_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_bulk_close(struct usb_xfer *xfer)
{
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_bulk_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_bulk_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_bulk_p_last);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_bulk_methods =
{
.open = ohci_device_bulk_open,
.close = ohci_device_bulk_close,
.enter = ohci_device_bulk_enter,
.start = ohci_device_bulk_start,
};
/*------------------------------------------------------------------------*
* ohci control support
*------------------------------------------------------------------------*/
static void
ohci_device_ctrl_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_ctrl_close(struct usb_xfer *xfer)
{
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_ctrl_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_ctrl_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_ctrl_p_last);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_ctrl_methods =
{
.open = ohci_device_ctrl_open,
.close = ohci_device_ctrl_close,
.enter = ohci_device_ctrl_enter,
.start = ohci_device_ctrl_start,
};
/*------------------------------------------------------------------------*
* ohci interrupt support
*------------------------------------------------------------------------*/
static void
ohci_device_intr_open(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
uint16_t best;
uint16_t bit;
uint16_t x;
best = 0;
bit = OHCI_NO_EDS / 2;
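/*
 * Select the largest power-of-two polling interval that does not
 * exceed the requested interval, then pick the least loaded ED
 * slot within that interval group (slots [bit, 2*bit) of
 * sc_intr_stat[]).
 */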
while (bit) {
if (xfer->interval >= bit) {
x = bit;
best = bit;
while (x & bit) {
if (sc->sc_intr_stat[x] <
sc->sc_intr_stat[best]) {
best = x;
}
x++;
}
break;
}
bit >>= 1;
}
sc->sc_intr_stat[best]++;
xfer->qh_pos = best;
DPRINTFN(3, "best=%d interval=%d\n",
best, xfer->interval);
}
static void
ohci_device_intr_close(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
sc->sc_intr_stat[xfer->qh_pos]--;
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_intr_enter(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_intr_start(struct usb_xfer *xfer)
{
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
/* setup TD's and QH */
ohci_setup_standard_chain(xfer, &sc->sc_intr_p_last[xfer->qh_pos]);
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_intr_methods =
{
.open = ohci_device_intr_open,
.close = ohci_device_intr_close,
.enter = ohci_device_intr_enter,
.start = ohci_device_intr_start,
};
/*------------------------------------------------------------------------*
* ohci isochronous support
*------------------------------------------------------------------------*/
static void
ohci_device_isoc_open(struct usb_xfer *xfer)
{
return;
}
static void
ohci_device_isoc_close(struct usb_xfer *xfer)
{
/**/
ohci_device_done(xfer, USB_ERR_CANCELLED);
}
static void
ohci_device_isoc_enter(struct usb_xfer *xfer)
{
struct usb_page_search buf_res;
ohci_softc_t *sc = OHCI_BUS2SC(xfer->xroot->bus);
struct ohci_hcca *hcca;
uint32_t buf_offset;
uint32_t nframes;
uint32_t ed_flags;
uint32_t *plen;
uint16_t itd_offset[OHCI_ITD_NOFFSET];
uint16_t length;
uint8_t ncur;
ohci_itd_t *td;
ohci_itd_t *td_last = NULL;
ohci_ed_t *ed;
hcca = ohci_get_hcca(sc);
nframes = le32toh(hcca->hcca_frame_number);
DPRINTFN(6, "xfer=%p isoc_next=%u nframes=%u hcca_fn=%u\n",
xfer, xfer->endpoint->isoc_next, xfer->nframes, nframes);
if ((xfer->endpoint->is_synced == 0) ||
(((nframes - xfer->endpoint->isoc_next) & 0xFFFF) < xfer->nframes) ||
(((xfer->endpoint->isoc_next - nframes) & 0xFFFF) >= 128)) {
/*
* If there is data underflow or the pipe queue is empty we
* schedule the transfer a few frames ahead of the current
* frame position. Else two isochronous transfers might
* overlap.
*/
xfer->endpoint->isoc_next = (nframes + 3) & 0xFFFF;
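/* full-speed frames are 1 ms each, so this schedules roughly 3 ms ahead */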
xfer->endpoint->is_synced = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/*
* compute how many milliseconds the insertion is ahead of the
* current frame position:
*/
buf_offset = ((xfer->endpoint->isoc_next - nframes) & 0xFFFF);
/*
* pre-compute when the isochronous transfer will be finished:
*/
xfer->isoc_time_complete =
(usb_isoc_time_expand(&sc->sc_bus, nframes) + buf_offset +
xfer->nframes);
/* get the real number of frames */
nframes = xfer->nframes;
buf_offset = 0;
plen = xfer->frlengths;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
xfer->td_transfer_first = td;
ncur = 0;
length = 0;
while (nframes--) {
if (td == NULL) {
panic("%s:%d: out of TD's\n",
__FUNCTION__, __LINE__);
}
itd_offset[ncur] = length;
buf_offset += *plen;
length += *plen;
plen++;
ncur++;
if ( /* check if the ITD is full */
(ncur == OHCI_ITD_NOFFSET) ||
/* check if we have put more than 4K into the ITD */
(length & 0xF000) ||
/* check if it is the last frame */
(nframes == 0)) {
/* fill current ITD */
td->itd_flags = htole32(
OHCI_ITD_NOCC |
OHCI_ITD_SET_SF(xfer->endpoint->isoc_next) |
OHCI_ITD_NOINTR |
OHCI_ITD_SET_FC(ncur));
td->frames = ncur;
xfer->endpoint->isoc_next += ncur;
if (length == 0) {
/* all zero */
td->itd_bp0 = 0;
td->itd_be = ~0;
while (ncur--) {
td->itd_offset[ncur] =
htole16(OHCI_ITD_MK_OFFS(0));
}
} else {
usbd_get_page(xfer->frbuffers, buf_offset - length, &buf_res);
length = OHCI_PAGE_MASK(buf_res.physaddr);
buf_res.physaddr =
OHCI_PAGE(buf_res.physaddr);
td->itd_bp0 = htole32(buf_res.physaddr);
usbd_get_page(xfer->frbuffers, buf_offset - 1, &buf_res);
td->itd_be = htole32(buf_res.physaddr);
while (ncur--) {
itd_offset[ncur] += length;
itd_offset[ncur] =
OHCI_ITD_MK_OFFS(itd_offset[ncur]);
td->itd_offset[ncur] =
htole16(itd_offset[ncur]);
}
}
ncur = 0;
length = 0;
td_last = td;
td = td->obj_next;
if (td) {
/* link the last TD with the next one */
td_last->itd_next = td->itd_self;
}
usb_pc_cpu_flush(td_last->page_cache);
}
}
/* update the last TD */
td_last->itd_flags &= ~htole32(OHCI_ITD_NOINTR);
td_last->itd_flags |= htole32(OHCI_ITD_SET_DI(0));
td_last->itd_next = 0;
usb_pc_cpu_flush(td_last->page_cache);
xfer->td_transfer_last = td_last;
#ifdef USB_DEBUG
if (ohcidebug > 8) {
DPRINTF("data before transfer:\n");
ohci_dump_itds(xfer->td_transfer_first);
}
#endif
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (UE_GET_DIR(xfer->endpointno) == UE_DIR_IN)
ed_flags = (OHCI_ED_DIR_IN | OHCI_ED_FORMAT_ISO);
else
ed_flags = (OHCI_ED_DIR_OUT | OHCI_ED_FORMAT_ISO);
ed_flags |= (OHCI_ED_SET_FA(xfer->address) |
OHCI_ED_SET_EN(UE_GET_ADDR(xfer->endpointno)) |
OHCI_ED_SET_MAXP(xfer->max_frame_size));
if (xfer->xroot->udev->speed == USB_SPEED_LOW) {
ed_flags |= OHCI_ED_SPEED;
}
ed->ed_flags = htole32(ed_flags);
td = xfer->td_transfer_first;
ed->ed_headp = td->itd_self;
/* isochronous transfers are not affected by suspend / resume */
/* the append function will flush the endpoint descriptor */
OHCI_APPEND_QH(ed, sc->sc_isoc_p_last);
}
static void
ohci_device_isoc_start(struct usb_xfer *xfer)
{
/* put transfer on interrupt queue */
ohci_transfer_intr_enqueue(xfer);
}
static const struct usb_pipe_methods ohci_device_isoc_methods =
{
.open = ohci_device_isoc_open,
.close = ohci_device_isoc_close,
.enter = ohci_device_isoc_enter,
.start = ohci_device_isoc_start,
};
/*------------------------------------------------------------------------*
* ohci root control support
*------------------------------------------------------------------------*
* Simulate a hardware hub by handling all the necessary requests.
*------------------------------------------------------------------------*/
static const
struct usb_device_descriptor ohci_devd =
{
sizeof(struct usb_device_descriptor),
UDESC_DEVICE, /* type */
{0x00, 0x01}, /* USB version */
UDCLASS_HUB, /* class */
UDSUBCLASS_HUB, /* subclass */
UDPROTO_FSHUB, /* protocol */
64, /* max packet */
{0}, {0}, {0x00, 0x01}, /* device id */
1, 2, 0, /* string indexes */
1 /* # of configurations */
};
static const
struct ohci_config_desc ohci_confd =
{
.confd = {
.bLength = sizeof(struct usb_config_descriptor),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(ohci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0, /* max power */
},
.ifcd = {
.bLength = sizeof(struct usb_interface_descriptor),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(struct usb_endpoint_descriptor),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | OHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 32,/* max packet (255 ports) */
.bInterval = 255,
},
};
static const
struct usb_hub_descriptor ohci_hubd =
{
.bDescLength = 0, /* dynamic length */
.bDescriptorType = UDESC_HUB,
};
static usb_error_t
ohci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
ohci_softc_t *sc = OHCI_BUS2SC(udev->bus);
const void *ptr;
const char *str_ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t value;
uint16_t index;
uint8_t l;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc.temp;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ohci_devd);
ptr = (const void *)&ohci_devd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(ohci_confd);
ptr = (const void *)&ohci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "OHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= OHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_addr = value;
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if ((value != 0) && (value != 1)) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE "
"port=%d feature=%d\n",
index, value);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = OHCI_RH_PORT_STATUS(index);
switch (value) {
case UHF_PORT_ENABLE:
OWRITE4(sc, port, UPS_CURRENT_CONNECT_STATUS);
break;
case UHF_PORT_SUSPEND:
OWRITE4(sc, port, UPS_OVERCURRENT_INDICATOR);
break;
case UHF_PORT_POWER:
/* Yes, writing to the LOW_SPEED bit clears power. */
OWRITE4(sc, port, UPS_LOW_SPEED);
break;
case UHF_C_PORT_CONNECTION:
OWRITE4(sc, port, UPS_C_CONNECT_STATUS << 16);
break;
case UHF_C_PORT_ENABLE:
OWRITE4(sc, port, UPS_C_PORT_ENABLED << 16);
break;
case UHF_C_PORT_SUSPEND:
OWRITE4(sc, port, UPS_C_SUSPEND << 16);
break;
case UHF_C_PORT_OVER_CURRENT:
OWRITE4(sc, port, UPS_C_OVERCURRENT_INDICATOR << 16);
break;
case UHF_C_PORT_RESET:
OWRITE4(sc, port, UPS_C_PORT_RESET << 16);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
switch (value) {
case UHF_C_PORT_CONNECTION:
case UHF_C_PORT_ENABLE:
case UHF_C_PORT_SUSPEND:
case UHF_C_PORT_OVER_CURRENT:
case UHF_C_PORT_RESET:
/* enable RHSC interrupt if condition is cleared. */
if ((OREAD4(sc, port) >> 16) == 0)
ohci_rhsc_enable(sc);
break;
default:
break;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = OREAD4(sc, OHCI_RH_DESCRIPTOR_A);
sc->sc_hub_desc.hubd = ohci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics,
(v & OHCI_NPS ? UHD_PWR_NO_SWITCH :
v & OHCI_PSM ? UHD_PWR_GANGED : UHD_PWR_INDIVIDUAL)
/* XXX overcurrent */
);
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = OHCI_GET_POTPGT(v);
v = OREAD4(sc, OHCI_RH_DESCRIPTOR_B);
for (l = 0; l < sc->sc_noport; l++) {
if (v & 1) {
sc->sc_hub_desc.hubd.DeviceRemovable[l / 8] |= (1 << (l % 8));
}
v >>= 1;
}
sc->sc_hub_desc.hubd.bDescLength =
8 + ((sc->sc_noport + 7) / 8);
len = sc->sc_hub_desc.hubd.bDescLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "get port status i=%d\n",
index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = OREAD4(sc, OHCI_RH_PORT_STATUS(index));
DPRINTFN(9, "port status=0x%04x\n", v);
v &= ~UPS_PORT_MODE_DEVICE; /* force host mode */
USETW(sc->sc_hub_desc.ps.wPortStatus, v);
USETW(sc->sc_hub_desc.ps.wPortChange, v >> 16);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = OHCI_RH_PORT_STATUS(index);
switch (value) {
case UHF_PORT_ENABLE:
OWRITE4(sc, port, UPS_PORT_ENABLED);
break;
case UHF_PORT_SUSPEND:
OWRITE4(sc, port, UPS_SUSPEND);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
OWRITE4(sc, port, UPS_RESET);
for (v = 0;; v++) {
if (v < 12) {
usb_pause_mtx(&sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(usb_port_root_reset_delay));
if ((OREAD4(sc, port) & UPS_RESET) == 0) {
break;
}
} else {
err = USB_ERR_TIMEOUT;
goto done;
}
}
DPRINTFN(9, "ohci port %d reset, status = 0x%04x\n",
index, OREAD4(sc, port));
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
OWRITE4(sc, port, UPS_PORT_POWER);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
ohci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
- ohci_softc_t *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t ntd;
uint32_t nitd;
uint32_t nqh;
uint32_t n;
- sc = OHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
parm->hc_max_packet_size = 0x500;
parm->hc_max_packet_count = 1;
parm->hc_max_frame_size = OHCI_PAGE_SIZE;
/*
* calculate ntd and nqh
*/
if (parm->methods == &ohci_device_ctrl_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_bulk_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_intr_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
nqh = 1;
} else if (parm->methods == &ohci_device_isoc_methods) {
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
nitd = ((xfer->max_data_length / OHCI_PAGE_SIZE) +
howmany(xfer->nframes, OHCI_ITD_NOFFSET) +
1 /* EXTRA */ );
ntd = 0;
nqh = 1;
} else {
usbd_transfer_setup_sub(parm);
nitd = 0;
ntd = 0;
nqh = 0;
}
alloc_dma_set:
if (parm->err) {
return;
}
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_td_t),
OHCI_TD_ALIGN, ntd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != ntd; n++) {
ohci_td_t *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->td_self = htole32(page_info.physaddr);
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_itd_t),
OHCI_ITD_ALIGN, nitd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nitd; n++) {
ohci_itd_t *itd;
usbd_get_page(pc + n, 0, &page_info);
itd = page_info.buffer;
/* init ITD */
itd->itd_self = htole32(page_info.physaddr);
itd->obj_next = last_obj;
itd->page_cache = pc + n;
last_obj = itd;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(ohci_ed_t),
OHCI_ED_ALIGN, nqh)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != nqh; n++) {
ohci_ed_t *ed;
usbd_get_page(pc + n, 0, &page_info);
ed = page_info.buffer;
/* init QH */
ed->ed_self = htole32(page_info.physaddr);
ed->obj_next = last_obj;
ed->page_cache = pc + n;
last_obj = ed;
usb_pc_cpu_flush(pc + n);
}
}
xfer->qh_start[xfer->flags_int.curr_dma_set] = last_obj;
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static void
ohci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
ohci_softc_t *sc = OHCI_BUS2SC(udev->bus);
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d (%d)\n",
ep, udev->address,
edesc->bEndpointAddress, udev->flags.usb_mode,
sc->sc_addr);
if (udev->device_index != sc->sc_addr) {
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
ep->methods = &ohci_device_ctrl_methods;
break;
case UE_INTERRUPT:
ep->methods = &ohci_device_intr_methods;
break;
case UE_ISOCHRONOUS:
if (udev->speed == USB_SPEED_FULL) {
ep->methods = &ohci_device_isoc_methods;
}
break;
case UE_BULK:
ep->methods = &ohci_device_bulk_methods;
break;
default:
/* do nothing */
break;
}
}
}
static void
ohci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
ohci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until hardware has finished any possible use of the
* transfer descriptor(s) and QH
*/
*pus = (1125); /* microseconds */
}
static void
ohci_device_resume(struct usb_device *udev)
{
struct ohci_softc *sc = OHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (methods == &ohci_device_bulk_methods) {
OHCI_APPEND_QH(ed, sc->sc_bulk_p_last);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_BLF);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_APPEND_QH(ed, sc->sc_ctrl_p_last);
OWRITE4(sc, OHCI_COMMAND_STATUS, OHCI_CLF);
}
if (methods == &ohci_device_intr_methods) {
OHCI_APPEND_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ohci_device_suspend(struct usb_device *udev)
{
struct ohci_softc *sc = OHCI_BUS2SC(udev->bus);
struct usb_xfer *xfer;
const struct usb_pipe_methods *methods;
ohci_ed_t *ed;
DPRINTF("\n");
USB_BUS_LOCK(udev->bus);
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
if (xfer->xroot->udev == udev) {
methods = xfer->endpoint->methods;
ed = xfer->qh_start[xfer->flags_int.curr_dma_set];
if (methods == &ohci_device_bulk_methods) {
OHCI_REMOVE_QH(ed, sc->sc_bulk_p_last);
}
if (methods == &ohci_device_ctrl_methods) {
OHCI_REMOVE_QH(ed, sc->sc_ctrl_p_last);
}
if (methods == &ohci_device_intr_methods) {
OHCI_REMOVE_QH(ed, sc->sc_intr_p_last[xfer->qh_pos]);
}
}
}
USB_BUS_UNLOCK(udev->bus);
return;
}
static void
ohci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
case USB_HW_POWER_SHUTDOWN:
ohci_suspend(sc);
break;
case USB_HW_POWER_RESUME:
ohci_resume(sc);
break;
default:
break;
}
}
static void
ohci_set_hw_power(struct usb_bus *bus)
{
struct ohci_softc *sc = OHCI_BUS2SC(bus);
uint32_t temp;
uint32_t flags;
DPRINTF("\n");
USB_BUS_LOCK(bus);
flags = bus->hw_power_state;
temp = OREAD4(sc, OHCI_CONTROL);
temp &= ~(OHCI_PLE | OHCI_IE | OHCI_CLE | OHCI_BLE);
if (flags & USB_HW_POWER_CONTROL)
temp |= OHCI_CLE;
if (flags & USB_HW_POWER_BULK)
temp |= OHCI_BLE;
if (flags & USB_HW_POWER_INTERRUPT)
temp |= OHCI_PLE;
if (flags & USB_HW_POWER_ISOC)
temp |= OHCI_IE | OHCI_PLE;
OWRITE4(sc, OHCI_CONTROL, temp);
USB_BUS_UNLOCK(bus);
return;
}
static const struct usb_bus_methods ohci_bus_methods =
{
.endpoint_init = ohci_ep_init,
.xfer_setup = ohci_xfer_setup,
.xfer_unsetup = ohci_xfer_unsetup,
.get_dma_delay = ohci_get_dma_delay,
.device_resume = ohci_device_resume,
.device_suspend = ohci_device_suspend,
.set_hw_power = ohci_set_hw_power,
.set_hw_power_sleep = ohci_set_hw_power_sleep,
.roothub_exec = ohci_roothub_exec,
.xfer_poll = ohci_do_poll,
};
Index: head/sys/dev/usb/controller/xhci.c
===================================================================
--- head/sys/dev/usb/controller/xhci.c (revision 327172)
+++ head/sys/dev/usb/controller/xhci.c (revision 327173)
@@ -1,4361 +1,4359 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* USB eXtensible Host Controller Interface, a.k.a. USB 3.0 controller.
*
* The XHCI 1.0 spec can be found at
* http://www.intel.com/technology/usb/download/xHCI_Specification_for_USB.pdf
* and the USB 3.0 spec at
* http://www.usb.org/developers/docs/usb_30_spec_060910.zip
*/
/*
* A few words about the design: this driver emulates the concept of
* TDs found in the EHCI specification. That way the USB controller
* drivers look similar to each other, which makes the code easier to
* understand.
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#define USB_DEBUG_VAR xhcidebug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_transfer.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_hub.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#include <dev/usb/controller/xhci.h>
#include <dev/usb/controller/xhcireg.h>
#define XHCI_BUS2SC(bus) \
((struct xhci_softc *)(((uint8_t *)(bus)) - \
((uint8_t *)&(((struct xhci_softc *)0)->sc_bus))))
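/*
 * containerof()-style conversion: "bus" points at the sc_bus
 * member embedded in a struct xhci_softc, so subtracting the
 * member offset recovers the softc, e.g.:
 *
 *	struct xhci_softc *sc = XHCI_BUS2SC(bus);
 */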
static SYSCTL_NODE(_hw_usb, OID_AUTO, xhci, CTLFLAG_RW, 0, "USB XHCI");
static int xhcistreams;
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, streams, CTLFLAG_RWTUN,
&xhcistreams, 0, "Set to enable streams mode support");
#ifdef USB_DEBUG
static int xhcidebug;
static int xhciroute;
static int xhcipolling;
static int xhcidma32;
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, debug, CTLFLAG_RWTUN,
&xhcidebug, 0, "Debug level");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, xhci_port_route, CTLFLAG_RWTUN,
&xhciroute, 0, "Routing bitmap for switching EHCI ports to the XHCI controller");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, use_polling, CTLFLAG_RWTUN,
&xhcipolling, 0, "Set to enable software interrupt polling for the XHCI controller");
SYSCTL_INT(_hw_usb_xhci, OID_AUTO, dma32, CTLFLAG_RWTUN,
&xhcidma32, 0, "Set to only use 32-bit DMA for the XHCI controller");
#else
#define xhciroute 0
#define xhcidma32 0
#endif
#define XHCI_INTR_ENDPT 1
struct xhci_std_temp {
struct xhci_softc *sc;
struct usb_page_cache *pc;
struct xhci_td *td;
struct xhci_td *td_next;
uint32_t len;
uint32_t offset;
uint32_t max_packet_size;
uint32_t average;
uint16_t isoc_delta;
uint16_t isoc_frame;
uint8_t shortpkt;
uint8_t multishort;
uint8_t last_frame;
uint8_t trb_type;
uint8_t direction;
uint8_t tbc;
uint8_t tlbpc;
uint8_t step_td;
uint8_t do_isoc_sync;
};
static void xhci_do_poll(struct usb_bus *);
static void xhci_device_done(struct usb_xfer *, usb_error_t);
static void xhci_root_intr(struct xhci_softc *);
static void xhci_free_device_ext(struct usb_device *);
static struct xhci_endpoint_ext *xhci_get_endpoint_ext(struct usb_device *,
struct usb_endpoint_descriptor *);
static usb_proc_callback_t xhci_configure_msg;
static usb_error_t xhci_configure_device(struct usb_device *);
static usb_error_t xhci_configure_endpoint(struct usb_device *,
struct usb_endpoint_descriptor *, struct xhci_endpoint_ext *,
uint16_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t,
uint8_t);
static usb_error_t xhci_configure_mask(struct usb_device *,
uint32_t, uint8_t);
static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *,
uint64_t, uint8_t);
static void xhci_endpoint_doorbell(struct usb_xfer *);
static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val);
static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr);
static void xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val);
#ifdef USB_DEBUG
static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr);
#endif
static const struct usb_bus_methods xhci_bus_methods;
#ifdef USB_DEBUG
static void
xhci_dump_trb(struct xhci_trb *trb)
{
DPRINTFN(5, "trb = %p\n", trb);
DPRINTFN(5, "qwTrb0 = 0x%016llx\n", (long long)le64toh(trb->qwTrb0));
DPRINTFN(5, "dwTrb2 = 0x%08x\n", le32toh(trb->dwTrb2));
DPRINTFN(5, "dwTrb3 = 0x%08x\n", le32toh(trb->dwTrb3));
}
static void
xhci_dump_endpoint(struct xhci_softc *sc, struct xhci_endp_ctx *pep)
{
DPRINTFN(5, "pep = %p\n", pep);
DPRINTFN(5, "dwEpCtx0=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx0));
DPRINTFN(5, "dwEpCtx1=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx1));
DPRINTFN(5, "qwEpCtx2=0x%016llx\n", (long long)xhci_ctx_get_le64(sc, &pep->qwEpCtx2));
DPRINTFN(5, "dwEpCtx4=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx4));
DPRINTFN(5, "dwEpCtx5=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx5));
DPRINTFN(5, "dwEpCtx6=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx6));
DPRINTFN(5, "dwEpCtx7=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx7));
}
static void
xhci_dump_device(struct xhci_softc *sc, struct xhci_slot_ctx *psl)
{
DPRINTFN(5, "psl = %p\n", psl);
DPRINTFN(5, "dwSctx0=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx0));
DPRINTFN(5, "dwSctx1=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx1));
DPRINTFN(5, "dwSctx2=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx2));
DPRINTFN(5, "dwSctx3=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx3));
}
#endif
uint8_t
xhci_use_polling(void)
{
#ifdef USB_DEBUG
return (xhcipolling != 0);
#else
return (0);
#endif
}
static void
xhci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
uint16_t i;
cb(bus, &sc->sc_hw.root_pc, &sc->sc_hw.root_pg,
sizeof(struct xhci_hw_root), XHCI_PAGE_SIZE);
cb(bus, &sc->sc_hw.ctx_pc, &sc->sc_hw.ctx_pg,
sizeof(struct xhci_dev_ctx_addr), XHCI_PAGE_SIZE);
for (i = 0; i != sc->sc_noscratch; i++) {
cb(bus, &sc->sc_hw.scratch_pc[i], &sc->sc_hw.scratch_pg[i],
XHCI_PAGE_SIZE, XHCI_PAGE_SIZE);
}
}
static void
xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
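/*
 * With 64-byte contexts each 32-byte software context occupies a
 * 64-byte hardware slot; adding the 32-byte aligned page offset
 * of "ptr" to itself below doubles that offset and so maps the
 * software layout onto the hardware layout.
 */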
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset);
}
*ptr = htole32(val);
}
static uint32_t
xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset);
}
return (le32toh(*ptr));
}
static void
xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset);
}
*ptr = htole64(val);
}
#ifdef USB_DEBUG
static uint64_t
xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr)
{
if (sc->sc_ctx_is_64_byte) {
uint32_t offset;
/* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */
/* all contexts are initially 32-bytes */
offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U));
ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset);
}
return (le64toh(*ptr));
}
#endif
static int
xhci_reset_command_queue_locked(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
DPRINTF("\n");
temp = XREAD4(sc, oper, XHCI_CRCR_LO);
if (temp & XHCI_CRCR_LO_CRR) {
DPRINTF("Command ring running\n");
temp &= ~(XHCI_CRCR_LO_CS | XHCI_CRCR_LO_CA);
/*
* Try to abort the last command as per section
* 4.6.1.2 "Aborting a Command" of the XHCI
* specification:
*/
/* stop and cancel */
XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CS);
XWRITE4(sc, oper, XHCI_CRCR_HI, 0);
XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CA);
XWRITE4(sc, oper, XHCI_CRCR_HI, 0);
/* wait 250ms */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 4);
/* check if command ring is still running */
temp = XREAD4(sc, oper, XHCI_CRCR_LO);
if (temp & XHCI_CRCR_LO_CRR) {
DPRINTF("Comand ring still running\n");
return (USB_ERR_IOERROR);
}
}
/* reset command ring */
sc->sc_command_ccs = 1;
sc->sc_command_idx = 0;
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
/* set up command ring control base address */
addr = buf_res.physaddr;
phwr = buf_res.buffer;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0];
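/*
 * offsetof()-style arithmetic: advance the physical address to
 * the hwr_commands[] array inside the hardware root page.
 */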
DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr);
memset(phwr->hwr_commands, 0, sizeof(phwr->hwr_commands));
phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr);
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS);
XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32));
return (0);
}
usb_error_t
xhci_start_controller(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
struct xhci_dev_ctx_addr *pdctxa;
usb_error_t err;
uint64_t addr;
uint32_t temp;
uint16_t i;
DPRINTF("\n");
sc->sc_event_ccs = 1;
sc->sc_event_idx = 0;
sc->sc_command_ccs = 1;
sc->sc_command_idx = 0;
err = xhci_reset_controller(sc);
if (err)
return (err);
/* set up number of device slots */
DPRINTF("CONFIG=0x%08x -> 0x%08x\n",
XREAD4(sc, oper, XHCI_CONFIG), sc->sc_noslot);
XWRITE4(sc, oper, XHCI_CONFIG, sc->sc_noslot);
temp = XREAD4(sc, oper, XHCI_USBSTS);
/* clear interrupts */
XWRITE4(sc, oper, XHCI_USBSTS, temp);
/* disable all device notifications */
XWRITE4(sc, oper, XHCI_DNCTRL, 0);
/* set up device context base address */
usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res);
pdctxa = buf_res.buffer;
memset(pdctxa, 0, sizeof(*pdctxa));
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_dev_ctx_addr *)0)->qwSpBufPtr[0];
/* slot 0 points to the table of scratchpad pointers */
pdctxa->qwBaaDevCtxAddr[0] = htole64(addr);
for (i = 0; i != sc->sc_noscratch; i++) {
struct usb_page_search buf_scp;
usbd_get_page(&sc->sc_hw.scratch_pc[i], 0, &buf_scp);
pdctxa->qwSpBufPtr[i] = htole64((uint64_t)buf_scp.physaddr);
}
addr = buf_res.physaddr;
XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr);
XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32));
XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr);
XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32));
/* set up event table size */
DPRINTF("ERSTSZ=0x%08x -> 0x%08x\n",
XREAD4(sc, runt, XHCI_ERSTSZ(0)), sc->sc_erst_max);
XWRITE4(sc, runt, XHCI_ERSTSZ(0), XHCI_ERSTS_SET(sc->sc_erst_max));
/* set up interrupt rate */
XWRITE4(sc, runt, XHCI_IMOD(0), sc->sc_imod_default);
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[0];
/* reset hardware root structure */
memset(phwr, 0, sizeof(*phwr));
phwr->hwr_ring_seg[0].qwEvrsTablePtr = htole64(addr);
phwr->hwr_ring_seg[0].dwEvrsTableSize = htole32(XHCI_MAX_EVENTS);
DPRINTF("ERDP(0)=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32));
addr = buf_res.physaddr;
DPRINTF("ERSTBA(0)=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, runt, XHCI_ERSTBA_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERSTBA_HI(0), (uint32_t)(addr >> 32));
/* set up interrupter registers */
temp = XREAD4(sc, runt, XHCI_IMAN(0));
temp |= XHCI_IMAN_INTR_ENA;
XWRITE4(sc, runt, XHCI_IMAN(0), temp);
/* set up command ring control base address */
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0];
DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr);
XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS);
XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32));
phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr);
usb_bus_mem_flush_all(&sc->sc_bus, &xhci_iterate_hw_softc);
/* Go! */
XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_RS |
XHCI_CMD_INTE | XHCI_CMD_HSEE);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH;
if (!temp)
break;
}
if (temp) {
XWRITE4(sc, oper, XHCI_USBCMD, 0);
device_printf(sc->sc_bus.parent, "Run timeout.\n");
return (USB_ERR_IOERROR);
}
/* catch any lost interrupts */
xhci_do_poll(&sc->sc_bus);
if (sc->sc_port_route != NULL) {
/* Route all ports to the XHCI by default */
sc->sc_port_route(sc->sc_bus.parent,
~xhciroute, xhciroute);
}
return (0);
}
usb_error_t
xhci_halt_controller(struct xhci_softc *sc)
{
uint32_t temp;
uint16_t i;
DPRINTF("\n");
sc->sc_capa_off = 0;
sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH);
sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0xF;
sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;
/* Halt controller */
XWRITE4(sc, oper, XHCI_USBCMD, 0);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH;
if (temp)
break;
}
if (!temp) {
device_printf(sc->sc_bus.parent, "Controller halt timeout.\n");
return (USB_ERR_IOERROR);
}
return (0);
}
usb_error_t
xhci_reset_controller(struct xhci_softc *sc)
{
uint32_t temp = 0;
uint16_t i;
DPRINTF("\n");
/* Reset controller */
XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_HCRST);
for (i = 0; i != 100; i++) {
usb_pause_mtx(NULL, hz / 100);
temp = (XREAD4(sc, oper, XHCI_USBCMD) & XHCI_CMD_HCRST) |
(XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_CNR);
if (!temp)
break;
}
if (temp) {
device_printf(sc->sc_bus.parent, "Controller "
"reset timeout.\n");
return (USB_ERR_IOERROR);
}
return (0);
}
usb_error_t
xhci_init(struct xhci_softc *sc, device_t self, uint8_t dma32)
{
uint32_t temp;
DPRINTF("\n");
/* initialize some bus fields */
sc->sc_bus.parent = self;
/* set the bus revision */
sc->sc_bus.usbrev = USB_REV_3_0;
/* set up the bus struct */
sc->sc_bus.methods = &xhci_bus_methods;
/* set up devices array */
sc->sc_bus.devices = sc->sc_devices;
sc->sc_bus.devices_max = XHCI_MAX_DEVICES;
/* set default cycle state in case of early interrupts */
sc->sc_event_ccs = 1;
sc->sc_command_ccs = 1;
/* set up bus space offsets */
sc->sc_capa_off = 0;
sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH);
sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0x1F;
sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;
DPRINTF("CAPLENGTH=0x%x\n", sc->sc_oper_off);
DPRINTF("RUNTIMEOFFSET=0x%x\n", sc->sc_runt_off);
DPRINTF("DOOROFFSET=0x%x\n", sc->sc_door_off);
DPRINTF("xHCI version = 0x%04x\n", XREAD2(sc, capa, XHCI_HCIVERSION));
if (!(XREAD4(sc, oper, XHCI_PAGESIZE) & XHCI_PAGESIZE_4K)) {
device_printf(sc->sc_bus.parent, "Controller does "
"not support 4K page size.\n");
return (ENXIO);
}
temp = XREAD4(sc, capa, XHCI_HCSPARAMS0);
DPRINTF("HCS0 = 0x%08x\n", temp);
/* set up context size */
if (XHCI_HCS0_CSZ(temp)) {
sc->sc_ctx_is_64_byte = 1;
} else {
sc->sc_ctx_is_64_byte = 0;
}
/* get DMA bits */
sc->sc_bus.dma_bits = (XHCI_HCS0_AC64(temp) &&
xhcidma32 == 0 && dma32 == 0) ? 64 : 32;
device_printf(self, "%d bytes context size, %d-bit DMA\n",
sc->sc_ctx_is_64_byte ? 64 : 32, (int)sc->sc_bus.dma_bits);
temp = XREAD4(sc, capa, XHCI_HCSPARAMS1);
/* get number of device slots */
sc->sc_noport = XHCI_HCS1_N_PORTS(temp);
if (sc->sc_noport == 0) {
device_printf(sc->sc_bus.parent, "Invalid number "
"of ports: %u\n", sc->sc_noport);
return (ENXIO);
}
sc->sc_noport = sc->sc_noport;
sc->sc_noslot = XHCI_HCS1_DEVSLOT_MAX(temp);
DPRINTF("Max slots: %u\n", sc->sc_noslot);
if (sc->sc_noslot > XHCI_MAX_DEVICES)
sc->sc_noslot = XHCI_MAX_DEVICES;
temp = XREAD4(sc, capa, XHCI_HCSPARAMS2);
DPRINTF("HCS2=0x%08x\n", temp);
/* get number of scratchpads */
sc->sc_noscratch = XHCI_HCS2_SPB_MAX(temp);
if (sc->sc_noscratch > XHCI_MAX_SCRATCHPADS) {
device_printf(sc->sc_bus.parent, "XHCI request "
"too many scratchpads\n");
return (ENOMEM);
}
DPRINTF("Max scratch: %u\n", sc->sc_noscratch);
/* get event table size */
sc->sc_erst_max = 1U << XHCI_HCS2_ERST_MAX(temp);
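/* the ERST Max field is encoded as a power-of-two exponent, hence the shift */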
if (sc->sc_erst_max > XHCI_MAX_RSEG)
sc->sc_erst_max = XHCI_MAX_RSEG;
temp = XREAD4(sc, capa, XHCI_HCSPARAMS3);
/* get maximum exit latency */
sc->sc_exit_lat_max = XHCI_HCS3_U1_DEL(temp) +
XHCI_HCS3_U2_DEL(temp) + 250 /* us */;
/* Check if we should use the default IMOD value. */
if (sc->sc_imod_default == 0)
sc->sc_imod_default = XHCI_IMOD_DEFAULT;
/* get all DMA memory */
if (usb_bus_mem_alloc_all(&sc->sc_bus,
USB_GET_DMA_TAG(self), &xhci_iterate_hw_softc)) {
return (ENOMEM);
}
/* set up command queue mutex and condition variable */
cv_init(&sc->sc_cmd_cv, "CMDQ");
sx_init(&sc->sc_cmd_sx, "CMDQ lock");
sc->sc_config_msg[0].hdr.pm_callback = &xhci_configure_msg;
sc->sc_config_msg[0].bus = &sc->sc_bus;
sc->sc_config_msg[1].hdr.pm_callback = &xhci_configure_msg;
sc->sc_config_msg[1].bus = &sc->sc_bus;
return (0);
}
void
xhci_uninit(struct xhci_softc *sc)
{
/*
* NOTE: At this point the control transfer process is gone
* and "xhci_configure_msg" is no longer called. Consequently
* waiting for the configuration messages to complete is not
* needed.
*/
usb_bus_mem_free_all(&sc->sc_bus, &xhci_iterate_hw_softc);
cv_destroy(&sc->sc_cmd_cv);
sx_destroy(&sc->sc_cmd_sx);
}
static void
xhci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
switch (state) {
case USB_HW_POWER_SUSPEND:
DPRINTF("Stopping the XHCI\n");
xhci_halt_controller(sc);
xhci_reset_controller(sc);
break;
case USB_HW_POWER_SHUTDOWN:
DPRINTF("Stopping the XHCI\n");
xhci_halt_controller(sc);
xhci_reset_controller(sc);
break;
case USB_HW_POWER_RESUME:
DPRINTF("Starting the XHCI\n");
xhci_start_controller(sc);
break;
default:
break;
}
}
static usb_error_t
xhci_generic_done_sub(struct usb_xfer *xfer)
{
struct xhci_td *td;
struct xhci_td *td_alt_next;
uint32_t len;
uint8_t status;
td = xfer->td_transfer_cache;
td_alt_next = td->alt_next;
if (xfer->aframes != xfer->nframes)
usbd_xfer_set_frame_len(xfer, xfer->aframes, 0);
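/*
* Walk the chain of TDs belonging to the current USB frame,
* adding the actual transfer length to "frlengths[]". The walk
* stops at the last TD, at the first TD reporting an error, or
* at a short packet; a change of "alt_next" marks the boundary
* between USB frames.
*/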
while (1) {
usb_pc_cpu_invalidate(td->page_cache);
status = td->status;
len = td->remainder;
DPRINTFN(4, "xfer=%p[%u/%u] rem=%u/%u status=%u\n",
xfer, (unsigned int)xfer->aframes,
(unsigned int)xfer->nframes,
(unsigned int)len, (unsigned int)td->len,
(unsigned int)status);
/*
* Verify the status length and
* add the length to "frlengths[]":
*/
if (len > td->len) {
/* should not happen */
DPRINTF("Invalid status length, "
"0x%04x/0x%04x bytes\n", len, td->len);
status = XHCI_TRB_ERROR_LENGTH;
} else if (xfer->aframes != xfer->nframes) {
xfer->frlengths[xfer->aframes] += td->len - len;
}
/* Check for last transfer */
if (((void *)td) == xfer->td_transfer_last) {
td = NULL;
break;
}
/* Check for transfer error */
if (status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS) {
/* the transfer is finished */
td = NULL;
break;
}
/* Check for short transfer */
if (len > 0) {
if (xfer->flags_int.short_frames_ok ||
xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr) {
/* follow alt next */
td = td->alt_next;
} else {
/* the transfer is finished */
td = NULL;
}
break;
}
td = td->obj_next;
if (td->alt_next != td_alt_next) {
/* this USB frame is complete */
break;
}
}
/* update transfer cache */
xfer->td_transfer_cache = td;
return ((status == XHCI_TRB_ERROR_STALL) ? USB_ERR_STALLED :
(status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS) ? USB_ERR_IOERROR :
USB_ERR_NORMAL_COMPLETION);
}
static void
xhci_generic_done(struct usb_xfer *xfer)
{
usb_error_t err = 0;
DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n",
xfer, xfer->endpoint);
/* reset scanner */
xfer->td_transfer_cache = xfer->td_transfer_first;
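/*
* For control transfers the SETUP frame is accounted for first,
* then any DATA frames, and finally the STATUS stage once
* "control_act" has been cleared.
*/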
if (xfer->flags_int.control_xfr) {
if (xfer->flags_int.control_hdr)
err = xhci_generic_done_sub(xfer);
xfer->aframes = 1;
if (xfer->td_transfer_cache == NULL)
goto done;
}
while (xfer->aframes != xfer->nframes) {
err = xhci_generic_done_sub(xfer);
xfer->aframes++;
if (xfer->td_transfer_cache == NULL)
goto done;
}
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act)
err = xhci_generic_done_sub(xfer);
done:
/* transfer is complete */
xhci_device_done(xfer, err);
}
static void
xhci_activate_transfer(struct usb_xfer *xfer)
{
struct xhci_td *td;
td = xfer->td_transfer_cache;
usb_pc_cpu_invalidate(td->page_cache);
if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) {
/* activate the transfer */
td->td_trb[0].dwTrb3 |= htole32(XHCI_TRB_3_CYCLE_BIT);
usb_pc_cpu_flush(td->page_cache);
xhci_endpoint_doorbell(xfer);
}
}
static void
xhci_skip_transfer(struct usb_xfer *xfer)
{
struct xhci_td *td;
struct xhci_td *td_last;
td = xfer->td_transfer_cache;
td_last = xfer->td_transfer_last;
td = td->alt_next;
usb_pc_cpu_invalidate(td->page_cache);
if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) {
usb_pc_cpu_invalidate(td_last->page_cache);
/* copy LINK TRB to current waiting location */
td->td_trb[0].qwTrb0 = td_last->td_trb[td_last->ntrb].qwTrb0;
td->td_trb[0].dwTrb2 = td_last->td_trb[td_last->ntrb].dwTrb2;
usb_pc_cpu_flush(td->page_cache);
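/*
* The dwTrb3 field, which holds the cycle bit, is written after
* a flush of the other fields so the controller never sees a
* half-written LINK TRB.
*/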
td->td_trb[0].dwTrb3 = td_last->td_trb[td_last->ntrb].dwTrb3;
usb_pc_cpu_flush(td->page_cache);
xhci_endpoint_doorbell(xfer);
}
}
/*------------------------------------------------------------------------*
* xhci_check_transfer
*------------------------------------------------------------------------*/
static void
xhci_check_transfer(struct xhci_softc *sc, struct xhci_trb *trb)
{
struct xhci_endpoint_ext *pepext;
int64_t offset;
uint64_t td_event;
uint32_t temp;
uint32_t remainder;
uint16_t stream_id;
uint16_t i;
uint8_t status;
uint8_t halted;
uint8_t epno;
uint8_t index;
/* decode TRB */
td_event = le64toh(trb->qwTrb0);
temp = le32toh(trb->dwTrb2);
remainder = XHCI_TRB_2_REM_GET(temp);
status = XHCI_TRB_2_ERROR_GET(temp);
stream_id = XHCI_TRB_2_STREAM_GET(temp);
temp = le32toh(trb->dwTrb3);
epno = XHCI_TRB_3_EP_GET(temp);
index = XHCI_TRB_3_SLOT_GET(temp);
/* check if error means halted */
halted = (status != XHCI_TRB_ERROR_SHORT_PKT &&
status != XHCI_TRB_ERROR_SUCCESS);
DPRINTF("slot=%u epno=%u stream=%u remainder=%u status=%u\n",
index, epno, stream_id, remainder, status);
if (index > sc->sc_noslot) {
DPRINTF("Invalid slot.\n");
return;
}
if ((epno == 0) || (epno >= XHCI_MAX_ENDPOINTS)) {
DPRINTF("Invalid endpoint.\n");
return;
}
pepext = &sc->sc_hw.devs[index].endp[epno];
if (pepext->trb_ep_mode != USB_EP_MODE_STREAMS) {
stream_id = 0;
DPRINTF("stream_id=0\n");
} else if (stream_id >= XHCI_MAX_STREAMS) {
DPRINTF("Invalid stream ID.\n");
return;
}
/* try to find the USB transfer that generated the event */
for (i = 0; i != (XHCI_MAX_TRANSFERS - 1); i++) {
struct usb_xfer *xfer;
struct xhci_td *td;
xfer = pepext->xfer[i + (XHCI_MAX_TRANSFERS * stream_id)];
if (xfer == NULL)
continue;
td = xfer->td_transfer_cache;
DPRINTFN(5, "Checking if 0x%016llx == (0x%016llx .. 0x%016llx)\n",
(long long)td_event,
(long long)td->td_self,
(long long)td->td_self + sizeof(td->td_trb));
/*
* NOTE: Some XHCI implementations might not trigger
* an event on the last LINK TRB so we need to
* consider both the last and second last event
* address as conditions for a successful transfer.
*
* NOTE: We assume that the XHCI will only trigger one
* event per chain of TRBs.
*/
offset = td_event - td->td_self;
if (offset >= 0 &&
offset < (int64_t)sizeof(td->td_trb)) {
usb_pc_cpu_invalidate(td->page_cache);
/* compute rest of remainder, if any */
for (i = (offset / 16) + 1; i < td->ntrb; i++) {
temp = le32toh(td->td_trb[i].dwTrb2);
remainder += XHCI_TRB_2_BYTES_GET(temp);
}
DPRINTFN(5, "New remainder: %u\n", remainder);
/* clear isochronous transfer errors */
if (xfer->flags_int.isochronous_xfr) {
if (halted) {
halted = 0;
status = XHCI_TRB_ERROR_SUCCESS;
remainder = td->len;
}
}
/* "td->remainder" is verified later */
td->remainder = remainder;
td->status = status;
usb_pc_cpu_flush(td->page_cache);
/*
* 1) Last transfer descriptor makes the
* transfer done
*/
if (((void *)td) == xfer->td_transfer_last) {
DPRINTF("TD is last\n");
xhci_generic_done(xfer);
break;
}
/*
* 2) Any kind of error makes the transfer
* done
*/
if (halted) {
DPRINTF("TD has I/O error\n");
xhci_generic_done(xfer);
break;
}
/*
* 3) If there is no alternate next transfer,
* a short packet also makes the transfer done
*/
if (td->remainder > 0) {
if (td->alt_next == NULL) {
DPRINTF(
"short TD has no alternate next\n");
xhci_generic_done(xfer);
break;
}
DPRINTF("TD has short pkt\n");
if (xfer->flags_int.short_frames_ok ||
xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr) {
/* follow the alt next */
xfer->td_transfer_cache = td->alt_next;
xhci_activate_transfer(xfer);
break;
}
xhci_skip_transfer(xfer);
xhci_generic_done(xfer);
break;
}
/*
* 4) Transfer complete - go to next TD
*/
DPRINTF("Following next TD\n");
xfer->td_transfer_cache = td->obj_next;
xhci_activate_transfer(xfer);
break; /* there should only be one match */
}
}
}
static int
xhci_check_command(struct xhci_softc *sc, struct xhci_trb *trb)
{
if (sc->sc_cmd_addr == trb->qwTrb0) {
DPRINTF("Received command event\n");
sc->sc_cmd_result[0] = trb->dwTrb2;
sc->sc_cmd_result[1] = trb->dwTrb3;
cv_signal(&sc->sc_cmd_cv);
return (1); /* command match */
}
return (0);
}
static int
xhci_interrupt_poll(struct xhci_softc *sc)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
int retval = 0;
uint16_t i;
uint8_t event;
uint8_t j;
uint8_t k;
uint8_t t;
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
/* Receive any events */
usb_pc_cpu_invalidate(&sc->sc_hw.root_pc);
i = sc->sc_event_idx;
j = sc->sc_event_ccs;
t = 2;
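/*
* Walk the event ring using the xHCI consumer scheme: an event
* TRB is only valid while its cycle bit matches the consumer
* cycle state "j". When the dequeue index "i" wraps around, "j"
* is toggled. The counter "t" limits the scan to two full
* passes of the ring.
*/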
while (1) {
temp = le32toh(phwr->hwr_events[i].dwTrb3);
k = (temp & XHCI_TRB_3_CYCLE_BIT) ? 1 : 0;
if (j != k)
break;
event = XHCI_TRB_3_TYPE_GET(temp);
DPRINTFN(10, "event[%u] = %u (0x%016llx 0x%08lx 0x%08lx)\n",
i, event, (long long)le64toh(phwr->hwr_events[i].qwTrb0),
(long)le32toh(phwr->hwr_events[i].dwTrb2),
(long)le32toh(phwr->hwr_events[i].dwTrb3));
switch (event) {
case XHCI_TRB_EVENT_TRANSFER:
xhci_check_transfer(sc, &phwr->hwr_events[i]);
break;
case XHCI_TRB_EVENT_CMD_COMPLETE:
retval |= xhci_check_command(sc, &phwr->hwr_events[i]);
break;
default:
DPRINTF("Unhandled event = %u\n", event);
break;
}
i++;
if (i == XHCI_MAX_EVENTS) {
i = 0;
j ^= 1;
/* check for timeout */
if (!--t)
break;
}
}
sc->sc_event_idx = i;
sc->sc_event_ccs = j;
/*
* NOTE: The Event Ring Dequeue Pointer Register is 64-bit
* latched. That means to activate the register we need to
* write both the low and high double word of the 64-bit
* register.
*/
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[i];
/* try to clear busy bit */
addr |= XHCI_ERDP_LO_BUSY;
XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr);
XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32));
return (retval);
}
static usb_error_t
xhci_do_command(struct xhci_softc *sc, struct xhci_trb *trb,
uint16_t timeout_ms)
{
struct usb_page_search buf_res;
struct xhci_hw_root *phwr;
uint64_t addr;
uint32_t temp;
uint8_t i;
uint8_t j;
uint8_t timeout = 0;
int err;
XHCI_CMD_ASSERT_LOCKED(sc);
/* get hardware root structure */
usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res);
phwr = buf_res.buffer;
/* Queue command */
USB_BUS_LOCK(&sc->sc_bus);
retry:
i = sc->sc_command_idx;
j = sc->sc_command_ccs;
DPRINTFN(10, "command[%u] = %u (0x%016llx, 0x%08lx, 0x%08lx)\n",
i, XHCI_TRB_3_TYPE_GET(le32toh(trb->dwTrb3)),
(long long)le64toh(trb->qwTrb0),
(long)le32toh(trb->dwTrb2),
(long)le32toh(trb->dwTrb3));
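/*
* Copy the command TRB into the ring. The qwTrb0 and dwTrb2
* fields are written and flushed first; dwTrb3, which carries
* the producer cycle bit, is written last so the controller
* never sees a partially written command. The last ring slot is
* reserved for a LINK TRB with the toggle-cycle bit, at which
* point the producer index wraps and the cycle state flips.
*/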
phwr->hwr_commands[i].qwTrb0 = trb->qwTrb0;
phwr->hwr_commands[i].dwTrb2 = trb->dwTrb2;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
temp = trb->dwTrb3;
if (j)
temp |= htole32(XHCI_TRB_3_CYCLE_BIT);
else
temp &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
temp &= ~htole32(XHCI_TRB_3_TC_BIT);
phwr->hwr_commands[i].dwTrb3 = temp;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
addr = buf_res.physaddr;
addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[i];
sc->sc_cmd_addr = htole64(addr);
i++;
if (i == (XHCI_MAX_COMMANDS - 1)) {
if (j) {
temp = htole32(XHCI_TRB_3_TC_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) |
XHCI_TRB_3_CYCLE_BIT);
} else {
temp = htole32(XHCI_TRB_3_TC_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
}
phwr->hwr_commands[i].dwTrb3 = temp;
usb_pc_cpu_flush(&sc->sc_hw.root_pc);
i = 0;
j ^= 1;
}
sc->sc_command_idx = i;
sc->sc_command_ccs = j;
XWRITE4(sc, door, XHCI_DOORBELL(0), 0);
err = cv_timedwait(&sc->sc_cmd_cv, &sc->sc_bus.bus_mtx,
USB_MS_TO_TICKS(timeout_ms));
/*
* In some error cases event interrupts are not generated.
* Poll one time to see if the command has completed.
*/
if (err != 0 && xhci_interrupt_poll(sc) != 0) {
DPRINTF("Command was completed when polling\n");
err = 0;
}
if (err != 0) {
DPRINTF("Command timeout!\n");
/*
* After some weeks of continuous operation, it has
* been observed that the ASMedia Technology ASM1042
* SuperSpeed USB Host Controller can suddenly stop
* accepting commands via the command queue. Try to
* first reset the command queue. If that fails do a
* host controller reset.
*/
if (timeout == 0 &&
xhci_reset_command_queue_locked(sc) == 0) {
temp = le32toh(trb->dwTrb3);
/*
* Avoid infinite XHCI reset loops if the set
* address command fails to respond due to a
* non-enumerating device:
*/
if (XHCI_TRB_3_TYPE_GET(temp) == XHCI_TRB_TYPE_ADDRESS_DEVICE &&
(temp & XHCI_TRB_3_BSR_BIT) == 0) {
DPRINTF("Set address timeout\n");
} else {
timeout = 1;
goto retry;
}
} else {
DPRINTF("Controller reset!\n");
usb_bus_reset_async_locked(&sc->sc_bus);
}
err = USB_ERR_TIMEOUT;
trb->dwTrb2 = 0;
trb->dwTrb3 = 0;
} else {
temp = le32toh(sc->sc_cmd_result[0]);
if (XHCI_TRB_2_ERROR_GET(temp) != XHCI_TRB_ERROR_SUCCESS)
err = USB_ERR_IOERROR;
trb->dwTrb2 = sc->sc_cmd_result[0];
trb->dwTrb3 = sc->sc_cmd_result[1];
}
USB_BUS_UNLOCK(&sc->sc_bus);
return (err);
}
#if 0
static usb_error_t
xhci_cmd_nop(struct xhci_softc *sc)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NOOP);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
#endif
static usb_error_t
xhci_cmd_enable_slot(struct xhci_softc *sc, uint8_t *pslot)
{
struct xhci_trb trb;
uint32_t temp;
usb_error_t err;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
trb.dwTrb3 = htole32(XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ENABLE_SLOT));
err = xhci_do_command(sc, &trb, 100 /* ms */);
if (err)
goto done;
temp = le32toh(trb.dwTrb3);
*pslot = XHCI_TRB_3_SLOT_GET(temp);
done:
return (err);
}
static usb_error_t
xhci_cmd_disable_slot(struct xhci_softc *sc, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DISABLE_SLOT) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_set_address(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t bsr, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ADDRESS_DEVICE) |
XHCI_TRB_3_SLOT_SET(slot_id);
if (bsr)
temp |= XHCI_TRB_3_BSR_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 500 /* ms */));
}
static usb_error_t
xhci_set_address(struct usb_device *udev, struct mtx *mtx, uint16_t address)
{
struct usb_page_search buf_inp;
struct usb_page_search buf_dev;
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_hw_dev *hdev;
struct xhci_dev_ctx *pdev;
struct xhci_endpoint_ext *pepext;
uint32_t temp;
uint16_t mps;
usb_error_t err;
uint8_t index;
/* the root HUB case is not handled here */
if (udev->parent_hub == NULL)
return (USB_ERR_INVAL);
index = udev->controller_slot_id;
hdev = &sc->sc_hw.devs[index];
if (mtx != NULL)
mtx_unlock(mtx);
XHCI_CMD_LOCK(sc);
switch (hdev->state) {
case XHCI_ST_DEFAULT:
case XHCI_ST_ENABLED:
hdev->state = XHCI_ST_ENABLED;
/* set configure mask to slot and EP0 */
xhci_configure_mask(udev, 3, 0);
/* configure input slot context structure */
err = xhci_configure_device(udev);
if (err != 0) {
DPRINTF("Could not configure device\n");
break;
}
/* configure input endpoint context structure */
switch (udev->speed) {
case USB_SPEED_LOW:
case USB_SPEED_FULL:
mps = 8;
break;
case USB_SPEED_HIGH:
mps = 64;
break;
default:
mps = 512;
break;
}
pepext = xhci_get_endpoint_ext(udev,
&udev->ctrl_ep_desc);
/* ensure the control endpoint is setup again */
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
err = xhci_configure_endpoint(udev,
&udev->ctrl_ep_desc, pepext,
0, 1, 1, 0, mps, mps, USB_EP_MODE_DEFAULT);
if (err != 0) {
DPRINTF("Could not configure default endpoint\n");
break;
}
/* execute set address command */
usbd_get_page(&hdev->input_pc, 0, &buf_inp);
err = xhci_cmd_set_address(sc, buf_inp.physaddr,
(address == 0), index);
if (err != 0) {
temp = le32toh(sc->sc_cmd_result[0]);
if (address == 0 && sc->sc_port_route != NULL &&
XHCI_TRB_2_ERROR_GET(temp) ==
XHCI_TRB_ERROR_PARAMETER) {
/* LynxPoint XHCI - ports are not switchable */
/* Un-route all ports from the XHCI */
sc->sc_port_route(sc->sc_bus.parent, 0, ~0);
}
DPRINTF("Could not set address "
"for slot %u.\n", index);
if (address != 0)
break;
}
/* update device address to new value */
usbd_get_page(&hdev->device_pc, 0, &buf_dev);
pdev = buf_dev.buffer;
usb_pc_cpu_invalidate(&hdev->device_pc);
temp = xhci_ctx_get_le32(sc, &pdev->ctx_slot.dwSctx3);
udev->address = XHCI_SCTX_3_DEV_ADDR_GET(temp);
/* update device state to new value */
if (address != 0)
hdev->state = XHCI_ST_ADDRESSED;
else
hdev->state = XHCI_ST_DEFAULT;
break;
default:
DPRINTF("Wrong state for set address.\n");
err = USB_ERR_IOERROR;
break;
}
XHCI_CMD_UNLOCK(sc);
if (mtx != NULL)
mtx_lock(mtx);
return (err);
}
static usb_error_t
xhci_cmd_configure_ep(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t deconfigure, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_CONFIGURE_EP) |
XHCI_TRB_3_SLOT_SET(slot_id);
if (deconfigure)
temp |= XHCI_TRB_3_DCEP_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_evaluate_ctx(struct xhci_softc *sc, uint64_t input_ctx,
uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(input_ctx);
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_EVALUATE_CTX) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_reset_ep(struct xhci_softc *sc, uint8_t preserve,
uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_EP) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
if (preserve)
temp |= XHCI_TRB_3_PRSV_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_set_tr_dequeue_ptr(struct xhci_softc *sc, uint64_t dequeue_ptr,
uint16_t stream_id, uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = htole64(dequeue_ptr);
temp = XHCI_TRB_2_STREAM_SET(stream_id);
trb.dwTrb2 = htole32(temp);
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SET_TR_DEQUEUE) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_stop_ep(struct xhci_softc *sc, uint8_t suspend,
uint8_t ep_id, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STOP_EP) |
XHCI_TRB_3_SLOT_SET(slot_id) |
XHCI_TRB_3_EP_SET(ep_id);
if (suspend)
temp |= XHCI_TRB_3_SUSP_EP_BIT;
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
static usb_error_t
xhci_cmd_reset_dev(struct xhci_softc *sc, uint8_t slot_id)
{
struct xhci_trb trb;
uint32_t temp;
DPRINTF("\n");
trb.qwTrb0 = 0;
trb.dwTrb2 = 0;
temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_DEVICE) |
XHCI_TRB_3_SLOT_SET(slot_id);
trb.dwTrb3 = htole32(temp);
return (xhci_do_command(sc, &trb, 100 /* ms */));
}
/*------------------------------------------------------------------------*
* xhci_interrupt - XHCI interrupt handler
*------------------------------------------------------------------------*/
void
xhci_interrupt(struct xhci_softc *sc)
{
uint32_t status;
uint32_t temp;
USB_BUS_LOCK(&sc->sc_bus);
status = XREAD4(sc, oper, XHCI_USBSTS);
/* acknowledge interrupts, if any */
if (status != 0) {
XWRITE4(sc, oper, XHCI_USBSTS, status);
DPRINTFN(16, "real interrupt (status=0x%08x)\n", status);
}
temp = XREAD4(sc, runt, XHCI_IMAN(0));
/* force clearing of pending interrupts */
if (temp & XHCI_IMAN_INTR_PEND)
XWRITE4(sc, runt, XHCI_IMAN(0), temp);
/* check for event(s) */
xhci_interrupt_poll(sc);
if (status & (XHCI_STS_PCD | XHCI_STS_HCH |
XHCI_STS_HSE | XHCI_STS_HCE)) {
if (status & XHCI_STS_PCD) {
xhci_root_intr(sc);
}
if (status & XHCI_STS_HCH) {
printf("%s: host controller halted\n",
__FUNCTION__);
}
if (status & XHCI_STS_HSE) {
printf("%s: host system error\n",
__FUNCTION__);
}
if (status & XHCI_STS_HCE) {
printf("%s: host controller error\n",
__FUNCTION__);
}
}
USB_BUS_UNLOCK(&sc->sc_bus);
}
/*------------------------------------------------------------------------*
* xhci_timeout - XHCI timeout handler
*------------------------------------------------------------------------*/
static void
xhci_timeout(void *arg)
{
struct usb_xfer *xfer = arg;
DPRINTF("xfer=%p\n", xfer);
USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED);
/* the transfer has timed out */
xhci_device_done(xfer, USB_ERR_TIMEOUT);
}
static void
xhci_do_poll(struct usb_bus *bus)
{
struct xhci_softc *sc = XHCI_BUS2SC(bus);
USB_BUS_LOCK(&sc->sc_bus);
xhci_interrupt_poll(sc);
USB_BUS_UNLOCK(&sc->sc_bus);
}
static void
xhci_setup_generic_chain_sub(struct xhci_std_temp *temp)
{
struct usb_page_search buf_res;
struct xhci_td *td;
struct xhci_td *td_next;
struct xhci_td *td_alt_next;
struct xhci_td *td_first;
uint32_t buf_offset;
uint32_t average;
uint32_t len_old;
uint32_t npkt_off;
uint32_t dword;
uint8_t shortpkt_old;
uint8_t precompute;
uint8_t x;
td_alt_next = NULL;
buf_offset = 0;
shortpkt_old = temp->shortpkt;
len_old = temp->len;
npkt_off = 0;
precompute = 1;
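/*
* The TD chain is built in two passes. The first pass
* ("precompute") only determines how the transfer length is
* split across the available TDs. The second pass, entered by
* jumping back to "restart" with the original length restored,
* fills out the actual TRBs.
*/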
restart:
td = temp->td;
td_next = td_first = temp->td_next;
while (1) {
if (temp->len == 0) {
if (temp->shortpkt)
break;
/* send a Zero Length Packet, ZLP, last */
temp->shortpkt = 1;
average = 0;
} else {
average = temp->average;
if (temp->len < average) {
if (temp->len % temp->max_packet_size) {
temp->shortpkt = 1;
}
average = temp->len;
}
}
if (td_next == NULL)
panic("%s: out of XHCI transfer descriptors!", __FUNCTION__);
/* get next TD */
td = td_next;
td_next = td->obj_next;
/* check if we are pre-computing */
if (precompute) {
/* update remaining length */
temp->len -= average;
continue;
}
/* fill out current TD */
td->len = average;
td->remainder = 0;
td->status = 0;
/* update remaining length */
temp->len -= average;
/* reset TRB index */
x = 0;
if (temp->trb_type == XHCI_TRB_TYPE_SETUP_STAGE) {
/* immediate data */
if (average > 8)
average = 8;
td->td_trb[0].qwTrb0 = 0;
usbd_copy_out(temp->pc, temp->offset + buf_offset,
(uint8_t *)(uintptr_t)&td->td_trb[0].qwTrb0,
average);
dword = XHCI_TRB_2_BYTES_SET(8) |
XHCI_TRB_2_TDSZ_SET(0) |
XHCI_TRB_2_IRQ_SET(0);
td->td_trb[0].dwTrb2 = htole32(dword);
dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SETUP_STAGE) |
XHCI_TRB_3_IDT_BIT | XHCI_TRB_3_CYCLE_BIT;
/* check wLength */
if (td->td_trb[0].qwTrb0 &
htole64(XHCI_TRB_0_WLENGTH_MASK)) {
if (td->td_trb[0].qwTrb0 &
htole64(XHCI_TRB_0_DIR_IN_MASK))
dword |= XHCI_TRB_3_TRT_IN;
else
dword |= XHCI_TRB_3_TRT_OUT;
}
td->td_trb[0].dwTrb3 = htole32(dword);
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
x++;
} else do {
uint32_t npkt;
/* fill out buffer pointers */
if (average == 0) {
memset(&buf_res, 0, sizeof(buf_res));
} else {
usbd_get_page(temp->pc, temp->offset +
buf_offset, &buf_res);
/* get length to end of page */
if (buf_res.length > average)
buf_res.length = average;
/* check for maximum length */
if (buf_res.length > XHCI_TD_PAGE_SIZE)
buf_res.length = XHCI_TD_PAGE_SIZE;
npkt_off += buf_res.length;
}
/* set up npkt */
npkt = howmany(len_old - npkt_off,
temp->max_packet_size);
if (npkt == 0)
npkt = 1;
else if (npkt > 31)
npkt = 31;
/* fill out TRBs */
td->td_trb[x].qwTrb0 =
htole64((uint64_t)buf_res.physaddr);
dword =
XHCI_TRB_2_BYTES_SET(buf_res.length) |
XHCI_TRB_2_TDSZ_SET(npkt) |
XHCI_TRB_2_IRQ_SET(0);
td->td_trb[x].dwTrb2 = htole32(dword);
switch (temp->trb_type) {
case XHCI_TRB_TYPE_ISOCH:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TBC_SET(temp->tbc) |
XHCI_TRB_3_TLBPC_SET(temp->tlbpc);
if (td != td_first) {
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL);
} else if (temp->do_isoc_sync != 0) {
temp->do_isoc_sync = 0;
/* wait until "isoc_frame" */
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) |
XHCI_TRB_3_FRID_SET(temp->isoc_frame / 8);
} else {
/* start data transfer at next interval */
dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) |
XHCI_TRB_3_ISO_SIA_BIT;
}
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_ISP_BIT;
break;
case XHCI_TRB_TYPE_DATA_STAGE:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DATA_STAGE);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_DIR_IN | XHCI_TRB_3_ISP_BIT;
/*
* Section 3.2.9 in the XHCI
* specification about control
* transfers says that we should use a
* normal-TRB if there are more TRBs
* extending the data-stage
* TRB. Update the "trb_type".
*/
temp->trb_type = XHCI_TRB_TYPE_NORMAL;
break;
case XHCI_TRB_TYPE_STATUS_STAGE:
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STATUS_STAGE);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_DIR_IN;
break;
default: /* XHCI_TRB_TYPE_NORMAL */
dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL);
if (temp->direction == UE_DIR_IN)
dword |= XHCI_TRB_3_ISP_BIT;
break;
}
td->td_trb[x].dwTrb3 = htole32(dword);
average -= buf_res.length;
buf_offset += buf_res.length;
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
x++;
} while (average != 0);
td->td_trb[x-1].dwTrb3 |= htole32(XHCI_TRB_3_IOC_BIT);
/* store number of data TRBs */
td->ntrb = x;
DPRINTF("NTRB=%u\n", x);
/* fill out link TRB */
if (td_next != NULL) {
/* link the current TD with the next one */
td->td_trb[x].qwTrb0 = htole64((uint64_t)td_next->td_self);
DPRINTF("LINK=0x%08llx\n", (long long)td_next->td_self);
} else {
/* this field will get updated later */
DPRINTF("NOLINK\n");
}
dword = XHCI_TRB_2_IRQ_SET(0);
td->td_trb[x].dwTrb2 = htole32(dword);
dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) |
XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_IOC_BIT |
/*
* CHAIN-BIT: Ensure that a multi-TRB IN-endpoint
* frame only receives a single short packet event
* by setting the CHAIN bit in the LINK field. In
* addition some XHCI controllers have problems
* sending a ZLP unless the CHAIN-BIT is set in
* the LINK TRB.
*/
XHCI_TRB_3_CHAIN_BIT;
td->td_trb[x].dwTrb3 = htole32(dword);
td->alt_next = td_alt_next;
#ifdef USB_DEBUG
xhci_dump_trb(&td->td_trb[x]);
#endif
usb_pc_cpu_flush(td->page_cache);
}
if (precompute) {
precompute = 0;
/* set up alt next pointer, if any */
if (temp->last_frame) {
td_alt_next = NULL;
} else {
/* we use this field internally */
td_alt_next = td_next;
}
/* restore */
temp->shortpkt = shortpkt_old;
temp->len = len_old;
goto restart;
}
/*
* Remove the cycle bit from the first TRB if we are stepping
* the TDs, so that each TD must be activated explicitly:
*/
if (temp->step_td != 0) {
td_first->td_trb[0].dwTrb3 &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
usb_pc_cpu_flush(td_first->page_cache);
}
/* clear the TD SIZE field because this is the last data TRB */
/* remove the chain bit because this is the last data TRB in the chain */
td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(15));
td->td_trb[td->ntrb - 1].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
/* remove CHAIN-BIT from last LINK TRB */
td->td_trb[td->ntrb].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
usb_pc_cpu_flush(td->page_cache);
temp->td = td;
temp->td_next = td_next;
}
static void
xhci_setup_generic_chain(struct usb_xfer *xfer)
{
struct xhci_std_temp temp;
struct xhci_td *td;
uint32_t x;
uint32_t y;
uint8_t mult;
temp.do_isoc_sync = 0;
temp.step_td = 0;
temp.tbc = 0;
temp.tlbpc = 0;
temp.average = xfer->max_hc_frame_size;
temp.max_packet_size = xfer->max_packet_size;
temp.sc = XHCI_BUS2SC(xfer->xroot->bus);
temp.pc = NULL;
temp.last_frame = 0;
temp.offset = 0;
temp.multishort = xfer->flags_int.isochronous_xfr ||
xfer->flags_int.control_xfr ||
xfer->flags_int.short_frames_ok;
/* toggle the DMA set we are using */
xfer->flags_int.curr_dma_set ^= 1;
/* get next DMA set */
td = xfer->td_start[xfer->flags_int.curr_dma_set];
temp.td = NULL;
temp.td_next = td;
xfer->td_transfer_first = td;
xfer->td_transfer_cache = td;
if (xfer->flags_int.isochronous_xfr) {
uint8_t shift;
/* compute multiplier for ISOCHRONOUS transfers */
mult = xfer->endpoint->ecomp ?
UE_GET_SS_ISO_MULT(xfer->endpoint->ecomp->bmAttributes)
: 0;
/* check for USB 2.0 multiplier */
if (mult == 0) {
mult = (xfer->endpoint->edesc->
wMaxPacketSize[1] >> 3) & 3;
}
/* range check */
if (mult > 2)
mult = 3;
else
mult++;
x = XREAD4(temp.sc, runt, XHCI_MFINDEX);
DPRINTF("MFINDEX=0x%08x\n", x);
switch (usbd_get_speed(xfer->xroot->udev)) {
case USB_SPEED_FULL:
shift = 3;
temp.isoc_delta = 8; /* 1ms */
x += temp.isoc_delta - 1;
x &= ~(temp.isoc_delta - 1);
break;
default:
shift = usbd_xfer_get_fps_shift(xfer);
temp.isoc_delta = 1U << shift;
x += temp.isoc_delta - 1;
x &= ~(temp.isoc_delta - 1);
/* simple frame load balancing */
x += xfer->endpoint->usb_uframe;
break;
}
y = XHCI_MFINDEX_GET(x - xfer->endpoint->isoc_next);
if ((xfer->endpoint->is_synced == 0) ||
(y < (xfer->nframes << shift)) ||
(XHCI_MFINDEX_GET(-y) >= (128 * 8))) {
/*
* If there is data underflow or the pipe
* queue is empty we schedule the transfer a
* few frames ahead of the current frame
* position. Else two isochronous transfers
* might overlap.
*/
xfer->endpoint->isoc_next = XHCI_MFINDEX_GET(x + (3 * 8));
xfer->endpoint->is_synced = 1;
temp.do_isoc_sync = 1;
DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next);
}
/* compute isochronous completion time */
y = XHCI_MFINDEX_GET(xfer->endpoint->isoc_next - (x & ~7));
xfer->isoc_time_complete =
usb_isoc_time_expand(&temp.sc->sc_bus, x / 8) +
(y / 8) + (((xfer->nframes << shift) + 7) / 8);
x = 0;
temp.isoc_frame = xfer->endpoint->isoc_next;
temp.trb_type = XHCI_TRB_TYPE_ISOCH;
xfer->endpoint->isoc_next += xfer->nframes << shift;
} else if (xfer->flags_int.control_xfr) {
/* check if we should prepend a setup message */
if (xfer->flags_int.control_hdr) {
temp.len = xfer->frlengths[0];
temp.pc = xfer->frbuffers + 0;
temp.shortpkt = temp.len ? 1 : 0;
temp.trb_type = XHCI_TRB_TYPE_SETUP_STAGE;
temp.direction = 0;
/* check for last frame */
if (xfer->nframes == 1) {
/* no STATUS stage yet, SETUP is last */
if (xfer->flags_int.control_act)
temp.last_frame = 1;
}
xhci_setup_generic_chain_sub(&temp);
}
x = 1;
mult = 1;
temp.isoc_delta = 0;
temp.isoc_frame = 0;
temp.trb_type = xfer->flags_int.control_did_data ?
XHCI_TRB_TYPE_NORMAL : XHCI_TRB_TYPE_DATA_STAGE;
} else {
x = 0;
mult = 1;
temp.isoc_delta = 0;
temp.isoc_frame = 0;
temp.trb_type = XHCI_TRB_TYPE_NORMAL;
}
if (x != xfer->nframes) {
/* set up page_cache pointer */
temp.pc = xfer->frbuffers + x;
/* set endpoint direction */
temp.direction = UE_GET_DIR(xfer->endpointno);
}
while (x != xfer->nframes) {
/* DATA0 / DATA1 message */
temp.len = xfer->frlengths[x];
temp.step_td = ((xfer->endpointno & UE_DIR_IN) &&
x != 0 && temp.multishort == 0);
x++;
if (x == xfer->nframes) {
if (xfer->flags_int.control_xfr) {
/* no STATUS stage yet, DATA is last */
if (xfer->flags_int.control_act)
temp.last_frame = 1;
} else {
temp.last_frame = 1;
}
}
if (temp.len == 0) {
/* make sure that we send a USB packet */
temp.shortpkt = 0;
temp.tbc = 0;
temp.tlbpc = mult - 1;
} else if (xfer->flags_int.isochronous_xfr) {
uint8_t tdpc;
/*
* Isochronous transfers don't have short
* packet termination:
*/
temp.shortpkt = 1;
/* isochronous transfers have a transfer limit */
if (temp.len > xfer->max_frame_size)
temp.len = xfer->max_frame_size;
/* compute TD packet count */
tdpc = howmany(temp.len, xfer->max_packet_size);
temp.tbc = howmany(tdpc, mult) - 1;
temp.tlbpc = (tdpc % mult);
if (temp.tlbpc == 0)
temp.tlbpc = mult - 1;
else
temp.tlbpc--;
} else {
/* regular data transfer */
temp.shortpkt = xfer->flags.force_short_xfer ? 0 : 1;
}
xhci_setup_generic_chain_sub(&temp);
if (xfer->flags_int.isochronous_xfr) {
temp.offset += xfer->frlengths[x - 1];
temp.isoc_frame += temp.isoc_delta;
} else {
/* get next Page Cache pointer */
temp.pc = xfer->frbuffers + x;
}
}
/* check if we should append a status stage */
if (xfer->flags_int.control_xfr &&
!xfer->flags_int.control_act) {
/*
* Send a DATA1 message and invert the current
* endpoint direction.
*/
#ifdef XHCI_STEP_STATUS_STAGE
temp.step_td = (xfer->nframes != 0);
#else
temp.step_td = 0;
#endif
temp.direction = UE_GET_DIR(xfer->endpointno) ^ UE_DIR_IN;
temp.len = 0;
temp.pc = NULL;
temp.shortpkt = 0;
temp.last_frame = 1;
temp.trb_type = XHCI_TRB_TYPE_STATUS_STAGE;
xhci_setup_generic_chain_sub(&temp);
}
td = temp.td;
/* must have at least one frame! */
xfer->td_transfer_last = td;
DPRINTF("first=%p last=%p\n", xfer->td_transfer_first, td);
}
static void
xhci_set_slot_pointer(struct xhci_softc *sc, uint8_t index, uint64_t dev_addr)
{
struct usb_page_search buf_res;
struct xhci_dev_ctx_addr *pdctxa;
usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res);
pdctxa = buf_res.buffer;
DPRINTF("addr[%u]=0x%016llx\n", index, (long long)dev_addr);
pdctxa->qwBaaDevCtxAddr[index] = htole64(dev_addr);
usb_pc_cpu_flush(&sc->sc_hw.ctx_pc);
}
static usb_error_t
xhci_configure_mask(struct usb_device *udev, uint32_t mask, uint8_t drop)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
struct xhci_input_dev_ctx *pinp;
uint32_t temp;
uint8_t index;
uint8_t x;
index = udev->controller_slot_id;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
pinp = buf_inp.buffer;
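/*
* In the xHCI input control context, "dwInCtx0" is the drop
* context mask and "dwInCtx1" is the add context mask. When
* adding endpoints the non-control contexts are also dropped
* first, to force re-initialisation, and the slot context's
* context entries field is updated to cover the highest
* endpoint in use.
*/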
if (drop) {
mask &= XHCI_INCTX_NON_CTRL_MASK;
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask);
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, 0);
} else {
/*
* Some hardware requires that we drop the endpoint
* context before adding it again:
*/
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0,
mask & XHCI_INCTX_NON_CTRL_MASK);
/* Add new endpoint context */
xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, mask);
/* find most significant set bit */
for (x = 31; x != 1; x--) {
if (mask & (1 << x))
break;
}
/* adjust */
x--;
/* figure out the maximum number of contexts */
if (x > sc->sc_hw.devs[index].context_num)
sc->sc_hw.devs[index].context_num = x;
else
x = sc->sc_hw.devs[index].context_num;
/* update number of contexts */
temp = xhci_ctx_get_le32(sc, &pinp->ctx_slot.dwSctx0);
temp &= ~XHCI_SCTX_0_CTX_NUM_SET(31);
temp |= XHCI_SCTX_0_CTX_NUM_SET(x + 1);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp);
}
usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc);
return (0);
}
static usb_error_t
xhci_configure_endpoint(struct usb_device *udev,
struct usb_endpoint_descriptor *edesc, struct xhci_endpoint_ext *pepext,
uint16_t interval, uint8_t max_packet_count,
uint8_t mult, uint8_t fps_shift, uint16_t max_packet_size,
uint16_t max_frame_size, uint8_t ep_mode)
{
struct usb_page_search buf_inp;
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_input_dev_ctx *pinp;
uint64_t ring_addr = pepext->physaddr;
uint32_t temp;
uint8_t index;
uint8_t epno;
uint8_t type;
index = udev->controller_slot_id;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
pinp = buf_inp.buffer;
epno = edesc->bEndpointAddress;
type = edesc->bmAttributes & UE_XFERTYPE;
if (type == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
if (epno == 0)
return (USB_ERR_NO_PIPE); /* invalid */
if (max_packet_count == 0)
return (USB_ERR_BAD_BUFSIZE);
max_packet_count--;
if (mult == 0)
return (USB_ERR_BAD_BUFSIZE);
/* store endpoint mode */
pepext->trb_ep_mode = ep_mode;
/* store bMaxPacketSize for control endpoints */
pepext->trb_ep_maxp = edesc->wMaxPacketSize[0];
usb_pc_cpu_flush(pepext->page_cache);
if (ep_mode == USB_EP_MODE_STREAMS) {
temp = XHCI_EPCTX_0_EPSTATE_SET(0) |
XHCI_EPCTX_0_MAXP_STREAMS_SET(XHCI_MAX_STREAMS_LOG - 1) |
XHCI_EPCTX_0_LSA_SET(1);
ring_addr += sizeof(struct xhci_trb) *
XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS;
} else {
temp = XHCI_EPCTX_0_EPSTATE_SET(0) |
XHCI_EPCTX_0_MAXP_STREAMS_SET(0) |
XHCI_EPCTX_0_LSA_SET(0);
ring_addr |= XHCI_EPCTX_2_DCS_SET(1);
}
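/*
* For regular endpoints the TR dequeue pointer carries the
* dequeue cycle state (DCS) in its low bit. For stream
* endpoints the pointer instead refers to the stream context
* array located after the per-stream transfer rings.
*/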
switch (udev->speed) {
case USB_SPEED_FULL:
case USB_SPEED_LOW:
/* 1ms -> 125us */
fps_shift += 3;
break;
default:
break;
}
switch (type) {
case UE_INTERRUPT:
if (fps_shift > 3)
fps_shift--;
temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift);
break;
case UE_ISOCHRONOUS:
temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift);
switch (udev->speed) {
case USB_SPEED_SUPER:
if (mult > 3)
mult = 3;
temp |= XHCI_EPCTX_0_MULT_SET(mult - 1);
max_packet_count /= mult;
break;
default:
break;
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx0, temp);
temp =
XHCI_EPCTX_1_HID_SET(0) |
XHCI_EPCTX_1_MAXB_SET(max_packet_count) |
XHCI_EPCTX_1_MAXP_SIZE_SET(max_packet_size);
/*
* Always enable the "three strikes and you are gone" feature
* except for ISOCHRONOUS endpoints. This is suggested by
* section 4.3.3 in the XHCI specification about device slot
* initialisation.
*/
if (type != UE_ISOCHRONOUS)
temp |= XHCI_EPCTX_1_CERR_SET(3);
switch (type) {
case UE_CONTROL:
temp |= XHCI_EPCTX_1_EPTYPE_SET(4);
break;
case UE_ISOCHRONOUS:
temp |= XHCI_EPCTX_1_EPTYPE_SET(1);
break;
case UE_BULK:
temp |= XHCI_EPCTX_1_EPTYPE_SET(2);
break;
default:
temp |= XHCI_EPCTX_1_EPTYPE_SET(3);
break;
}
/* check for IN direction */
if (epno & 1)
temp |= XHCI_EPCTX_1_EPTYPE_SET(4);
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx1, temp);
xhci_ctx_set_le64(sc, &pinp->ctx_ep[epno - 1].qwEpCtx2, ring_addr);
switch (edesc->bmAttributes & UE_XFERTYPE) {
case UE_INTERRUPT:
case UE_ISOCHRONOUS:
temp = XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(max_frame_size) |
XHCI_EPCTX_4_AVG_TRB_LEN_SET(MIN(XHCI_PAGE_SIZE,
max_frame_size));
break;
case UE_CONTROL:
temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(8);
break;
default:
temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(XHCI_PAGE_SIZE);
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx4, temp);
#ifdef USB_DEBUG
xhci_dump_endpoint(sc, &pinp->ctx_ep[epno - 1]);
#endif
usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc);
return (0); /* success */
}
static usb_error_t
xhci_configure_endpoint_by_xfer(struct usb_xfer *xfer)
{
struct xhci_endpoint_ext *pepext;
struct usb_endpoint_ss_comp_descriptor *ecomp;
usb_stream_t x;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
ecomp = xfer->endpoint->ecomp;
for (x = 0; x != XHCI_MAX_STREAMS; x++) {
uint64_t temp;
/* halt any transfers */
pepext->trb[x * XHCI_MAX_TRANSFERS].dwTrb3 = 0;
/* compute start of TRB ring for stream "x" */
temp = pepext->physaddr +
(x * XHCI_MAX_TRANSFERS * sizeof(struct xhci_trb)) +
XHCI_SCTX_0_SCT_SEC_TR_RING;
/* make tree structure */
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].qwTrb0 = htole64(temp);
/* reserved fields */
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].dwTrb2 = 0;
pepext->trb[(XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS) + x].dwTrb3 = 0;
}
usb_pc_cpu_flush(pepext->page_cache);
return (xhci_configure_endpoint(xfer->xroot->udev,
xfer->endpoint->edesc, pepext,
xfer->interval, xfer->max_packet_count,
(ecomp != NULL) ? UE_GET_SS_ISO_MULT(ecomp->bmAttributes) + 1 : 1,
usbd_xfer_get_fps_shift(xfer), xfer->max_packet_size,
xfer->max_frame_size, xfer->endpoint->ep_mode));
}
static usb_error_t
xhci_configure_device(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
struct usb_page_cache *pcinp;
struct xhci_input_dev_ctx *pinp;
struct usb_device *hubdev;
uint32_t temp;
uint32_t route;
uint32_t rh_port;
uint8_t is_hub;
uint8_t index;
uint8_t depth;
index = udev->controller_slot_id;
DPRINTF("index=%u\n", index);
pcinp = &sc->sc_hw.devs[index].input_pc;
usbd_get_page(pcinp, 0, &buf_inp);
pinp = buf_inp.buffer;
rh_port = 0;
route = 0;
/* figure out route string and root HUB port number */
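/*
* The xHCI route string packs one 4-bit port number per hub
* tier below the root hub (at most five tiers), while "rh_port"
* ends up holding the root HUB port number. Port numbers above
* 15 are clamped before being merged into the route string.
*/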
for (hubdev = udev; hubdev != NULL; hubdev = hubdev->parent_hub) {
if (hubdev->parent_hub == NULL)
break;
depth = hubdev->parent_hub->depth;
/*
* NOTE: HS/FS/LS devices and the SS root HUB can have
* more than 15 ports
*/
rh_port = hubdev->port_no;
if (depth == 0)
break;
if (rh_port > 15)
rh_port = 15;
if (depth < 6)
route |= rh_port << (4 * (depth - 1));
}
DPRINTF("Route=0x%08x\n", route);
temp = XHCI_SCTX_0_ROUTE_SET(route) |
XHCI_SCTX_0_CTX_NUM_SET(
sc->sc_hw.devs[index].context_num + 1);
switch (udev->speed) {
case USB_SPEED_LOW:
temp |= XHCI_SCTX_0_SPEED_SET(2);
if (udev->parent_hs_hub != NULL &&
udev->parent_hs_hub->ddesc.bDeviceProtocol ==
UDPROTO_HSHUBMTT) {
DPRINTF("Device inherits MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
case USB_SPEED_HIGH:
temp |= XHCI_SCTX_0_SPEED_SET(3);
if (sc->sc_hw.devs[index].nports != 0 &&
udev->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) {
DPRINTF("HUB supports MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
case USB_SPEED_FULL:
temp |= XHCI_SCTX_0_SPEED_SET(1);
if (udev->parent_hs_hub != NULL &&
udev->parent_hs_hub->ddesc.bDeviceProtocol ==
UDPROTO_HSHUBMTT) {
DPRINTF("Device inherits MTT\n");
temp |= XHCI_SCTX_0_MTT_SET(1);
}
break;
default:
temp |= XHCI_SCTX_0_SPEED_SET(4);
break;
}
is_hub = sc->sc_hw.devs[index].nports != 0 &&
(udev->speed == USB_SPEED_SUPER ||
udev->speed == USB_SPEED_HIGH);
if (is_hub)
temp |= XHCI_SCTX_0_HUB_SET(1);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp);
temp = XHCI_SCTX_1_RH_PORT_SET(rh_port);
if (is_hub) {
temp |= XHCI_SCTX_1_NUM_PORTS_SET(
sc->sc_hw.devs[index].nports);
}
switch (udev->speed) {
case USB_SPEED_SUPER:
switch (sc->sc_hw.devs[index].state) {
case XHCI_ST_ADDRESSED:
case XHCI_ST_CONFIGURED:
/* enable power save */
temp |= XHCI_SCTX_1_MAX_EL_SET(sc->sc_exit_lat_max);
break;
default:
/* disable power save */
break;
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx1, temp);
temp = XHCI_SCTX_2_IRQ_TARGET_SET(0);
if (is_hub) {
temp |= XHCI_SCTX_2_TT_THINK_TIME_SET(
sc->sc_hw.devs[index].tt);
}
hubdev = udev->parent_hs_hub;
/* check if we should activate the transaction translator */
switch (udev->speed) {
case USB_SPEED_FULL:
case USB_SPEED_LOW:
if (hubdev != NULL) {
temp |= XHCI_SCTX_2_TT_HUB_SID_SET(
hubdev->controller_slot_id);
temp |= XHCI_SCTX_2_TT_PORT_NUM_SET(
udev->hs_port_no);
}
break;
default:
break;
}
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx2, temp);
/*
* These fields should be initialized to zero, according to
* XHCI section 6.2.2 - slot context:
*/
temp = XHCI_SCTX_3_DEV_ADDR_SET(0) |
XHCI_SCTX_3_SLOT_STATE_SET(0);
xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx3, temp);
#ifdef USB_DEBUG
xhci_dump_device(sc, &pinp->ctx_slot);
#endif
usb_pc_cpu_flush(pcinp);
return (0); /* success */
}
static usb_error_t
xhci_alloc_device_ext(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_dev;
struct usb_page_search buf_ep;
struct xhci_trb *trb;
struct usb_page_cache *pc;
struct usb_page *pg;
uint64_t addr;
uint8_t index;
uint8_t i;
index = udev->controller_slot_id;
pc = &sc->sc_hw.devs[index].device_pc;
pg = &sc->sc_hw.devs[index].device_pg;
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ?
(2 * sizeof(struct xhci_dev_ctx)) :
sizeof(struct xhci_dev_ctx), XHCI_PAGE_SIZE))
goto error;
usbd_get_page(pc, 0, &buf_dev);
pc = &sc->sc_hw.devs[index].input_pc;
pg = &sc->sc_hw.devs[index].input_pg;
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ?
(2 * sizeof(struct xhci_input_dev_ctx)) :
sizeof(struct xhci_input_dev_ctx), XHCI_PAGE_SIZE)) {
goto error;
}
/* initialize all endpoint LINK TRBs */
for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) {
pc = &sc->sc_hw.devs[index].endpoint_pc[i];
pg = &sc->sc_hw.devs[index].endpoint_pg[i];
/* need to initialize the page cache */
pc->tag_parent = sc->sc_bus.dma_parent_tag;
if (usb_pc_alloc_mem(pc, pg,
sizeof(struct xhci_dev_endpoint_trbs), XHCI_TRB_ALIGN)) {
goto error;
}
/* lookup endpoint TRB ring */
usbd_get_page(pc, 0, &buf_ep);
/* get TRB pointer */
trb = buf_ep.buffer;
trb += XHCI_MAX_TRANSFERS - 1;
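/*
* The last TRB slot of the default transfer ring is initialized
* as a LINK TRB pointing back to the start of the ring, which
* makes the ring circular.
*/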
/* get TRB start address */
addr = buf_ep.physaddr;
/* create LINK TRB */
trb->qwTrb0 = htole64(addr);
trb->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
trb->dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
usb_pc_cpu_flush(pc);
}
xhci_set_slot_pointer(sc, index, buf_dev.physaddr);
return (0);
error:
xhci_free_device_ext(udev);
return (USB_ERR_NOMEM);
}
static void
xhci_free_device_ext(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t i;
index = udev->controller_slot_id;
xhci_set_slot_pointer(sc, index, 0);
usb_pc_free_mem(&sc->sc_hw.devs[index].device_pc);
usb_pc_free_mem(&sc->sc_hw.devs[index].input_pc);
for (i = 0; i != XHCI_MAX_ENDPOINTS; i++)
usb_pc_free_mem(&sc->sc_hw.devs[index].endpoint_pc[i]);
}
static struct xhci_endpoint_ext *
xhci_get_endpoint_ext(struct usb_device *udev, struct usb_endpoint_descriptor *edesc)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct xhci_endpoint_ext *pepext;
struct usb_page_cache *pc;
struct usb_page_search buf_ep;
uint8_t epno;
uint8_t index;
epno = edesc->bEndpointAddress;
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
index = udev->controller_slot_id;
pc = &sc->sc_hw.devs[index].endpoint_pc[epno];
usbd_get_page(pc, 0, &buf_ep);
pepext = &sc->sc_hw.devs[index].endp[epno];
pepext->page_cache = pc;
pepext->trb = buf_ep.buffer;
pepext->physaddr = buf_ep.physaddr;
return (pepext);
}
static void
xhci_endpoint_doorbell(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
uint8_t epno;
uint8_t index;
epno = xfer->endpointno;
if (xfer->flags_int.control_xfr)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
index = xfer->xroot->udev->controller_slot_id;
if (xfer->xroot->udev->flags.self_suspended == 0) {
XWRITE4(sc, door, XHCI_DOORBELL(index),
epno | XHCI_DB_SID_SET(xfer->stream_id));
}
}
static void
xhci_transfer_remove(struct usb_xfer *xfer, usb_error_t error)
{
struct xhci_endpoint_ext *pepext;
if (xfer->flags_int.bandwidth_reclaimed) {
xfer->flags_int.bandwidth_reclaimed = 0;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
pepext->trb_used[xfer->stream_id]--;
pepext->xfer[xfer->qh_pos] = NULL;
if (error && pepext->trb_running != 0) {
pepext->trb_halted = 1;
pepext->trb_running = 0;
}
}
}
static usb_error_t
xhci_transfer_insert(struct usb_xfer *xfer)
{
struct xhci_td *td_first;
struct xhci_td *td_last;
struct xhci_trb *trb_link;
struct xhci_endpoint_ext *pepext;
uint64_t addr;
usb_stream_t id;
uint8_t i;
uint8_t inext;
uint8_t trb_limit;
DPRINTFN(8, "\n");
id = xfer->stream_id;
/* check if already inserted */
if (xfer->flags_int.bandwidth_reclaimed) {
DPRINTFN(8, "Already in schedule\n");
return (0);
}
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
td_first = xfer->td_transfer_first;
td_last = xfer->td_transfer_last;
addr = pepext->physaddr;
switch (xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE) {
case UE_CONTROL:
case UE_INTERRUPT:
/* single buffered */
trb_limit = 1;
break;
default:
/* multi buffered */
trb_limit = (XHCI_MAX_TRANSFERS - 2);
break;
}
if (pepext->trb_used[id] >= trb_limit) {
DPRINTFN(8, "Too many TDs queued.\n");
return (USB_ERR_NOMEM);
}
/* check if bMaxPacketSize changed */
if (xfer->flags_int.control_xfr != 0 &&
pepext->trb_ep_maxp != xfer->endpoint->edesc->wMaxPacketSize[0]) {
DPRINTFN(8, "Reconfigure control endpoint\n");
/* force driver to reconfigure endpoint */
pepext->trb_halted = 1;
pepext->trb_running = 0;
}
/* check for stopped condition, after putting transfer on interrupt queue */
if (pepext->trb_running == 0) {
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
DPRINTFN(8, "Not running\n");
/* start configuration */
(void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus),
&sc->sc_config_msg[0], &sc->sc_config_msg[1]);
return (0);
}
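/*
* Insert the TD chain into the endpoint's transfer ring: the
* ring entry at the current index is rewritten as a LINK TRB
* pointing at the first TD, the chain's trailing LINK TRB is
* pointed at the next ring entry, and the cycle bit of the ring
* entry is written last to hand the chain over to the hardware.
*/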
pepext->trb_used[id]++;
/* get current TRB index */
i = pepext->trb_index[id];
/* get next TRB index */
inext = (i + 1);
/* the last entry of the ring is a hardcoded link TRB */
if (inext >= (XHCI_MAX_TRANSFERS - 1))
inext = 0;
/* store next TRB index, before stream ID offset is added */
pepext->trb_index[id] = inext;
/* offset for stream */
i += id * XHCI_MAX_TRANSFERS;
inext += id * XHCI_MAX_TRANSFERS;
/* compute terminating return address */
addr += (inext * sizeof(struct xhci_trb));
/* compute link TRB pointer */
trb_link = td_last->td_trb + td_last->ntrb;
/* update next pointer of last link TRB */
trb_link->qwTrb0 = htole64(addr);
trb_link->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
trb_link->dwTrb3 = htole32(XHCI_TRB_3_IOC_BIT |
XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
#ifdef USB_DEBUG
xhci_dump_trb(&td_last->td_trb[td_last->ntrb]);
#endif
usb_pc_cpu_flush(td_last->page_cache);
/* write ahead chain end marker */
pepext->trb[inext].qwTrb0 = 0;
pepext->trb[inext].dwTrb2 = 0;
pepext->trb[inext].dwTrb3 = 0;
/* update next pointer of link TRB */
pepext->trb[i].qwTrb0 = htole64((uint64_t)td_first->td_self);
pepext->trb[i].dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0));
#ifdef USB_DEBUG
xhci_dump_trb(&pepext->trb[i]);
#endif
usb_pc_cpu_flush(pepext->page_cache);
/* toggle cycle bit which activates the transfer chain */
pepext->trb[i].dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT |
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK));
usb_pc_cpu_flush(pepext->page_cache);
DPRINTF("qh_pos = %u\n", i);
pepext->xfer[i] = xfer;
xfer->qh_pos = i;
xfer->flags_int.bandwidth_reclaimed = 1;
xhci_endpoint_doorbell(xfer);
return (0);
}
static void
xhci_root_intr(struct xhci_softc *sc)
{
uint16_t i;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* clear any old interrupt data */
memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata));
for (i = 1; i <= sc->sc_noport; i++) {
/* pick out CHANGE bits from the status register */
if (XREAD4(sc, oper, XHCI_PORTSC(i)) & (
XHCI_PS_CSC | XHCI_PS_PEC |
XHCI_PS_OCC | XHCI_PS_WRC |
XHCI_PS_PRC | XHCI_PS_PLC |
XHCI_PS_CEC)) {
sc->sc_hub_idata[i / 8] |= 1 << (i % 8);
DPRINTF("port %d changed\n", i);
}
}
uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata,
sizeof(sc->sc_hub_idata));
}
/*------------------------------------------------------------------------*
* xhci_device_done - XHCI done handler
*
* NOTE: This function can be called twice in a row on the same
* USB transfer: once from close and once from interrupt.
*------------------------------------------------------------------------*/
static void
xhci_device_done(struct usb_xfer *xfer, usb_error_t error)
{
DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n",
xfer, xfer->endpoint, error);
/* remove transfer from HW queue */
xhci_transfer_remove(xfer, error);
/* dequeue transfer and start next transfer */
usbd_transfer_done(xfer, error);
}
/*------------------------------------------------------------------------*
* XHCI data transfer support (generic type)
*------------------------------------------------------------------------*/
static void
xhci_device_generic_open(struct usb_xfer *xfer)
{
if (xfer->flags_int.isochronous_xfr) {
switch (xfer->xroot->udev->speed) {
case USB_SPEED_FULL:
break;
default:
usb_hs_bandwidth_alloc(xfer);
break;
}
}
}
static void
xhci_device_generic_close(struct usb_xfer *xfer)
{
DPRINTF("\n");
xhci_device_done(xfer, USB_ERR_CANCELLED);
if (xfer->flags_int.isochronous_xfr) {
switch (xfer->xroot->udev->speed) {
case USB_SPEED_FULL:
break;
default:
usb_hs_bandwidth_free(xfer);
break;
}
}
}
static void
xhci_device_generic_multi_enter(struct usb_endpoint *ep,
usb_stream_t stream_id, struct usb_xfer *enter_xfer)
{
struct usb_xfer *xfer;
/* check if there is a current transfer */
xfer = ep->endpoint_q[stream_id].curr;
if (xfer == NULL)
return;
/*
* Check if the current transfer has been started and then pick
* up the next one, if any. Otherwise wait for the next start
* event, due to the block-on-failure feature.
*/
if (!xfer->flags_int.bandwidth_reclaimed)
return;
xfer = TAILQ_FIRST(&ep->endpoint_q[stream_id].head);
if (xfer == NULL) {
/*
* In case of enter we have to consider that the
* transfer is queued by the USB core after the enter
* method is called.
*/
xfer = enter_xfer;
if (xfer == NULL)
return;
}
/* try to multi buffer */
xhci_transfer_insert(xfer);
}
static void
xhci_device_generic_enter(struct usb_xfer *xfer)
{
DPRINTF("\n");
/* set up TD's and QH */
xhci_setup_generic_chain(xfer);
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, xfer);
}
static void
xhci_device_generic_start(struct usb_xfer *xfer)
{
DPRINTF("\n");
/* try to insert xfer on HW queue */
xhci_transfer_insert(xfer);
/* try to multi buffer */
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, NULL);
/* add transfer last on interrupt queue */
usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer);
/* start timeout, if any */
if (xfer->timeout != 0)
usbd_transfer_timeout_ms(xfer, &xhci_timeout, xfer->timeout);
}
static const struct usb_pipe_methods xhci_device_generic_methods =
{
.open = xhci_device_generic_open,
.close = xhci_device_generic_close,
.enter = xhci_device_generic_enter,
.start = xhci_device_generic_start,
};
/*------------------------------------------------------------------------*
* xhci root HUB support
*------------------------------------------------------------------------*
* Simulate a hardware HUB by handling all the necessary requests.
*------------------------------------------------------------------------*/
#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) }
static const
struct usb_device_descriptor xhci_devd =
{
.bLength = sizeof(xhci_devd),
.bDescriptorType = UDESC_DEVICE, /* type */
HSETW(.bcdUSB, 0x0300), /* USB version */
.bDeviceClass = UDCLASS_HUB, /* class */
.bDeviceSubClass = UDSUBCLASS_HUB, /* subclass */
.bDeviceProtocol = UDPROTO_SSHUB, /* protocol */
.bMaxPacketSize = 9, /* max packet size */
HSETW(.idVendor, 0x0000), /* vendor */
HSETW(.idProduct, 0x0000), /* product */
HSETW(.bcdDevice, 0x0100), /* device version */
.iManufacturer = 1,
.iProduct = 2,
.iSerialNumber = 0,
.bNumConfigurations = 1, /* # of configurations */
};
static const
struct xhci_bos_desc xhci_bosd = {
.bosd = {
.bLength = sizeof(xhci_bosd.bosd),
.bDescriptorType = UDESC_BOS,
HSETW(.wTotalLength, sizeof(xhci_bosd)),
.bNumDeviceCaps = 3,
},
.usb2extd = {
.bLength = sizeof(xhci_bosd.usb2extd),
.bDescriptorType = 1,
.bDevCapabilityType = 2,
.bmAttributes[0] = 2,
},
.usbdcd = {
.bLength = sizeof(xhci_bosd.usbdcd),
.bDescriptorType = UDESC_DEVICE_CAPABILITY,
.bDevCapabilityType = 3,
.bmAttributes = 0, /* XXX */
HSETW(.wSpeedsSupported, 0x000C),
.bFunctionalitySupport = 8,
.bU1DevExitLat = 255, /* dummy - not used */
.wU2DevExitLat = { 0x00, 0x08 },
},
.cidd = {
.bLength = sizeof(xhci_bosd.cidd),
.bDescriptorType = 1,
.bDevCapabilityType = 4,
.bReserved = 0,
.bContainerID = 0, /* XXX */
},
};
static const
struct xhci_config_desc xhci_confd = {
.confd = {
.bLength = sizeof(xhci_confd.confd),
.bDescriptorType = UDESC_CONFIG,
.wTotalLength[0] = sizeof(xhci_confd),
.bNumInterface = 1,
.bConfigurationValue = 1,
.iConfiguration = 0,
.bmAttributes = UC_SELF_POWERED,
.bMaxPower = 0 /* max power */
},
.ifcd = {
.bLength = sizeof(xhci_confd.ifcd),
.bDescriptorType = UDESC_INTERFACE,
.bNumEndpoints = 1,
.bInterfaceClass = UICLASS_HUB,
.bInterfaceSubClass = UISUBCLASS_HUB,
.bInterfaceProtocol = 0,
},
.endpd = {
.bLength = sizeof(xhci_confd.endpd),
.bDescriptorType = UDESC_ENDPOINT,
.bEndpointAddress = UE_DIR_IN | XHCI_INTR_ENDPT,
.bmAttributes = UE_INTERRUPT,
.wMaxPacketSize[0] = 2, /* max 15 ports */
.bInterval = 255,
},
.endpcd = {
.bLength = sizeof(xhci_confd.endpcd),
.bDescriptorType = UDESC_ENDPOINT_SS_COMP,
.bMaxBurst = 0,
.bmAttributes = 0,
},
};
static const
struct usb_hub_ss_descriptor xhci_hubd = {
.bLength = sizeof(xhci_hubd),
.bDescriptorType = UDESC_SS_HUB,
};
static usb_error_t
xhci_roothub_exec(struct usb_device *udev,
struct usb_device_request *req, const void **pptr, uint16_t *plength)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
const char *str_ptr;
const void *ptr;
uint32_t port;
uint32_t v;
uint16_t len;
uint16_t i;
uint16_t value;
uint16_t index;
uint8_t j;
usb_error_t err;
USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED);
/* buffer reset */
ptr = (const void *)&sc->sc_hub_desc;
len = 0;
err = 0;
value = UGETW(req->wValue);
index = UGETW(req->wIndex);
DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x "
"wValue=0x%04x wIndex=0x%04x\n",
req->bmRequestType, req->bRequest,
UGETW(req->wLength), value, index);
#define C(x,y) ((x) | ((y) << 8))
switch (C(req->bRequest, req->bmRequestType)) {
case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
/*
* DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops
* for the integrated root hub.
*/
break;
case C(UR_GET_CONFIG, UT_READ_DEVICE):
len = 1;
sc->sc_hub_desc.temp[0] = sc->sc_conf;
break;
case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE):
switch (value >> 8) {
case UDESC_DEVICE:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_devd);
ptr = (const void *)&xhci_devd;
break;
case UDESC_BOS:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_bosd);
ptr = (const void *)&xhci_bosd;
break;
case UDESC_CONFIG:
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
len = sizeof(xhci_confd);
ptr = (const void *)&xhci_confd;
break;
case UDESC_STRING:
switch (value & 0xff) {
case 0: /* Language table */
str_ptr = "\001";
break;
case 1: /* Vendor */
str_ptr = sc->sc_vendor;
break;
case 2: /* Product */
str_ptr = "XHCI root HUB";
break;
default:
str_ptr = "";
break;
}
len = usb_make_str_desc(
sc->sc_hub_desc.temp,
sizeof(sc->sc_hub_desc.temp),
str_ptr);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_INTERFACE, UT_READ_INTERFACE):
len = 1;
sc->sc_hub_desc.temp[0] = 0;
break;
case C(UR_GET_STATUS, UT_READ_DEVICE):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED);
break;
case C(UR_GET_STATUS, UT_READ_INTERFACE):
case C(UR_GET_STATUS, UT_READ_ENDPOINT):
len = 2;
USETW(sc->sc_hub_desc.stat.wStatus, 0);
break;
case C(UR_SET_ADDRESS, UT_WRITE_DEVICE):
if (value >= XHCI_MAX_DEVICES) {
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_SET_CONFIG, UT_WRITE_DEVICE):
if (value != 0 && value != 1) {
err = USB_ERR_IOERROR;
goto done;
}
sc->sc_conf = value;
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_DEVICE):
case C(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
break;
case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
break;
/* Hub requests */
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER):
DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n");
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTSC(index);
v = XREAD4(sc, oper, port);
i = XHCI_PS_PLS_GET(v);
v &= ~XHCI_PS_CLEAR;
switch (value) {
case UHF_C_BH_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_WRC);
break;
case UHF_C_PORT_CONFIG_ERROR:
XWRITE4(sc, oper, port, v | XHCI_PS_CEC);
break;
case UHF_C_PORT_SUSPEND:
case UHF_C_PORT_LINK_STATE:
XWRITE4(sc, oper, port, v | XHCI_PS_PLC);
break;
case UHF_C_PORT_CONNECTION:
XWRITE4(sc, oper, port, v | XHCI_PS_CSC);
break;
case UHF_C_PORT_ENABLE:
XWRITE4(sc, oper, port, v | XHCI_PS_PEC);
break;
case UHF_C_PORT_OVER_CURRENT:
XWRITE4(sc, oper, port, v | XHCI_PS_OCC);
break;
case UHF_C_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_PRC);
break;
case UHF_PORT_ENABLE:
XWRITE4(sc, oper, port, v | XHCI_PS_PED);
break;
case UHF_PORT_POWER:
XWRITE4(sc, oper, port, v & ~XHCI_PS_PP);
break;
case UHF_PORT_INDICATOR:
XWRITE4(sc, oper, port, v & ~XHCI_PS_PIC_SET(3));
break;
case UHF_PORT_SUSPEND:
/* U3 -> U15 */
if (i == 3) {
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(0xF) | XHCI_PS_LWS);
}
/* wait 20ms for resume sequence to complete */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50);
/* U0 */
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(0) | XHCI_PS_LWS);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE):
if ((value & 0xff) != 0) {
err = USB_ERR_IOERROR;
goto done;
}
v = XREAD4(sc, capa, XHCI_HCSPARAMS0);
sc->sc_hub_desc.hubd = xhci_hubd;
sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport;
if (XHCI_HCS0_PPC(v))
i = UHD_PWR_INDIVIDUAL;
else
i = UHD_PWR_GANGED;
if (XHCI_HCS0_PIND(v))
i |= UHD_PORT_IND;
i |= UHD_OC_INDIVIDUAL;
USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i);
/* see XHCI section 5.4.9: */
sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 10;
for (j = 1; j <= sc->sc_noport; j++) {
v = XREAD4(sc, oper, XHCI_PORTSC(j));
if (v & XHCI_PS_DR) {
sc->sc_hub_desc.hubd.
DeviceRemovable[j / 8] |= 1U << (j % 8);
}
}
len = sc->sc_hub_desc.hubd.bLength;
break;
case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE):
len = 16;
memset(sc->sc_hub_desc.temp, 0, 16);
break;
case C(UR_GET_STATUS, UT_READ_CLASS_OTHER):
DPRINTFN(9, "UR_GET_STATUS i=%d\n", index);
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
v = XREAD4(sc, oper, XHCI_PORTSC(index));
DPRINTFN(9, "port status=0x%08x\n", v);
i = UPS_PORT_LINK_STATE_SET(XHCI_PS_PLS_GET(v));
switch (XHCI_PS_SPEED_GET(v)) {
case 3:
i |= UPS_HIGH_SPEED;
break;
case 2:
i |= UPS_LOW_SPEED;
break;
case 1:
/* FULL speed */
break;
default:
i |= UPS_OTHER_SPEED;
break;
}
if (v & XHCI_PS_CCS)
i |= UPS_CURRENT_CONNECT_STATUS;
if (v & XHCI_PS_PED)
i |= UPS_PORT_ENABLED;
if (v & XHCI_PS_OCA)
i |= UPS_OVERCURRENT_INDICATOR;
if (v & XHCI_PS_PR)
i |= UPS_RESET;
if (v & XHCI_PS_PP) {
/*
* The USB 3.0 root hub reuses the
* USB 2.0 port power bit
*/
i |= UPS_PORT_POWER;
}
USETW(sc->sc_hub_desc.ps.wPortStatus, i);
i = 0;
if (v & XHCI_PS_CSC)
i |= UPS_C_CONNECT_STATUS;
if (v & XHCI_PS_PEC)
i |= UPS_C_PORT_ENABLED;
if (v & XHCI_PS_OCC)
i |= UPS_C_OVERCURRENT_INDICATOR;
if (v & XHCI_PS_WRC)
i |= UPS_C_BH_PORT_RESET;
if (v & XHCI_PS_PRC)
i |= UPS_C_PORT_RESET;
if (v & XHCI_PS_PLC)
i |= UPS_C_PORT_LINK_STATE;
if (v & XHCI_PS_CEC)
i |= UPS_C_PORT_CONFIG_ERROR;
USETW(sc->sc_hub_desc.ps.wPortChange, i);
len = sizeof(sc->sc_hub_desc.ps);
break;
case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE):
err = USB_ERR_IOERROR;
goto done;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE):
break;
case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER):
i = index >> 8;
index &= 0x00FF;
if ((index < 1) ||
(index > sc->sc_noport)) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTSC(index);
v = XREAD4(sc, oper, port) & ~XHCI_PS_CLEAR;
switch (value) {
case UHF_PORT_U1_TIMEOUT:
if (XHCI_PS_SPEED_GET(v) != 4) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTPMSC(index);
v = XREAD4(sc, oper, port);
v &= ~XHCI_PM3_U1TO_SET(0xFF);
v |= XHCI_PM3_U1TO_SET(i);
XWRITE4(sc, oper, port, v);
break;
case UHF_PORT_U2_TIMEOUT:
if (XHCI_PS_SPEED_GET(v) != 4) {
err = USB_ERR_IOERROR;
goto done;
}
port = XHCI_PORTPMSC(index);
v = XREAD4(sc, oper, port);
v &= ~XHCI_PM3_U2TO_SET(0xFF);
v |= XHCI_PM3_U2TO_SET(i);
XWRITE4(sc, oper, port, v);
break;
case UHF_BH_PORT_RESET:
XWRITE4(sc, oper, port, v | XHCI_PS_WPR);
break;
case UHF_PORT_LINK_STATE:
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(i) | XHCI_PS_LWS);
/* 4ms settle time */
usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250);
break;
case UHF_PORT_ENABLE:
DPRINTFN(3, "set port enable %d\n", index);
break;
case UHF_PORT_SUSPEND:
DPRINTFN(6, "suspend port %u (LPM=%u)\n", index, i);
j = XHCI_PS_SPEED_GET(v);
if ((j < 1) || (j > 3)) {
/* non-supported speed */
err = USB_ERR_IOERROR;
goto done;
}
XWRITE4(sc, oper, port, v |
XHCI_PS_PLS_SET(i ? 2 /* LPM */ : 3) | XHCI_PS_LWS);
break;
case UHF_PORT_RESET:
DPRINTFN(6, "reset port %d\n", index);
XWRITE4(sc, oper, port, v | XHCI_PS_PR);
break;
case UHF_PORT_POWER:
DPRINTFN(3, "set port power %d\n", index);
XWRITE4(sc, oper, port, v | XHCI_PS_PP);
break;
case UHF_PORT_TEST:
DPRINTFN(3, "set port test %d\n", index);
break;
case UHF_PORT_INDICATOR:
DPRINTFN(3, "set port indicator %d\n", index);
v &= ~XHCI_PS_PIC_SET(3);
v |= XHCI_PS_PIC_SET(1);
XWRITE4(sc, oper, port, v);
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
break;
case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER):
case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER):
case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER):
case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER):
break;
default:
err = USB_ERR_IOERROR;
goto done;
}
done:
*plength = len;
*pptr = ptr;
return (err);
}
static void
xhci_xfer_setup(struct usb_setup_params *parm)
{
struct usb_page_search page_info;
struct usb_page_cache *pc;
- struct xhci_softc *sc;
struct usb_xfer *xfer;
void *last_obj;
uint32_t ntd;
uint32_t n;
- sc = XHCI_BUS2SC(parm->udev->bus);
xfer = parm->curr_xfer;
/*
* The proof for the "ntd" formula is illustrated like this:
*
* +------------------------------------+
* | |
* | |remainder -> |
* | +-----+---+ |
* | | xxx | x | frm 0 |
* | +-----+---++ |
* | | xxx | xx | frm 1 |
* | +-----+----+ |
* | ... |
* +------------------------------------+
*
* "xxx" means a completely full USB transfer descriptor
*
* "x" and "xx" means a short USB packet
*
* For the remainder of a USB transfer modulo
* "max_data_length" we need two USB transfer descriptors.
* One to transfer the remaining data and one to finalise with
* a zero length packet in case the "force_short_xfer" flag is
* set. We only need two USB transfer descriptors in the case
* where the transfer length of the first one is a factor of
* "max_frame_size". The rest of the needed USB transfer
* descriptors is given by the buffer size divided by the
* maximum data payload.
*/
parm->hc_max_packet_size = 0x400;
parm->hc_max_packet_count = 16 * 3;
parm->hc_max_frame_size = XHCI_TD_PAYLOAD_MAX;
xfer->flags_int.bdma_enable = 1;
usbd_transfer_setup_sub(parm);
if (xfer->flags_int.isochronous_xfr) {
ntd = ((1 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else if (xfer->flags_int.control_xfr) {
ntd = ((2 * xfer->nframes) + 1 /* STATUS */
+ (xfer->max_data_length / xfer->max_hc_frame_size));
} else {
ntd = ((2 * xfer->nframes)
+ (xfer->max_data_length / xfer->max_hc_frame_size));
}
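/*
 * Illustrative example (added note, not part of the original source):
 * for a plain bulk transfer with nframes = 1 the formula above
 * reserves two TDs for the last frame (the remainder plus an optional
 * zero-length packet) and one TD per full max_hc_frame_size chunk of
 * the buffer, i.e. ntd = 2 * 1 + max_data_length / max_hc_frame_size.
 */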
alloc_dma_set:
if (parm->err)
return;
/*
* Allocate queue heads and transfer descriptors
*/
last_obj = NULL;
if (usbd_transfer_setup_sub_malloc(
parm, &pc, sizeof(struct xhci_td),
XHCI_TD_ALIGN, ntd)) {
parm->err = USB_ERR_NOMEM;
return;
}
if (parm->buf) {
for (n = 0; n != ntd; n++) {
struct xhci_td *td;
usbd_get_page(pc + n, 0, &page_info);
td = page_info.buffer;
/* init TD */
td->td_self = page_info.physaddr;
td->obj_next = last_obj;
td->page_cache = pc + n;
last_obj = td;
usb_pc_cpu_flush(pc + n);
}
}
xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj;
if (!xfer->flags_int.curr_dma_set) {
xfer->flags_int.curr_dma_set = 1;
goto alloc_dma_set;
}
}
static usb_error_t
xhci_configure_reset_endpoint(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
struct usb_page_search buf_inp;
struct usb_device *udev;
struct xhci_endpoint_ext *pepext;
struct usb_endpoint_descriptor *edesc;
struct usb_page_cache *pcinp;
usb_error_t err;
usb_stream_t stream_id;
uint8_t index;
uint8_t epno;
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
udev = xfer->xroot->udev;
index = udev->controller_slot_id;
pcinp = &sc->sc_hw.devs[index].input_pc;
usbd_get_page(pcinp, 0, &buf_inp);
edesc = xfer->endpoint->edesc;
epno = edesc->bEndpointAddress;
stream_id = xfer->stream_id;
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL)
epno |= UE_DIR_IN;
epno = XHCI_EPNO2EPID(epno);
if (epno == 0)
return (USB_ERR_NO_PIPE); /* invalid */
XHCI_CMD_LOCK(sc);
/* configure endpoint */
err = xhci_configure_endpoint_by_xfer(xfer);
if (err != 0) {
XHCI_CMD_UNLOCK(sc);
return (err);
}
/*
* Get the endpoint into the stopped state according to the
* endpoint context state diagram in the XHCI specification:
*/
err = xhci_cmd_stop_ep(sc, 0, epno, index);
if (err != 0)
DPRINTF("Could not stop endpoint %u\n", epno);
err = xhci_cmd_reset_ep(sc, 0, epno, index);
if (err != 0)
DPRINTF("Could not reset endpoint %u\n", epno);
err = xhci_cmd_set_tr_dequeue_ptr(sc,
(pepext->physaddr + (stream_id * sizeof(struct xhci_trb) *
XHCI_MAX_TRANSFERS)) | XHCI_EPCTX_2_DCS_SET(1),
stream_id, epno, index);
if (err != 0)
DPRINTF("Could not set dequeue ptr for endpoint %u\n", epno);
/*
* Get the endpoint into the running state according to the
* endpoint context state diagram in the XHCI specification:
*/
xhci_configure_mask(udev, (1U << epno) | 1U, 0);
if (epno > 1)
err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index);
else
err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
if (err != 0)
DPRINTF("Could not configure endpoint %u\n", epno);
XHCI_CMD_UNLOCK(sc);
return (0);
}
static void
xhci_xfer_unsetup(struct usb_xfer *xfer)
{
return;
}
static void
xhci_start_dma_delay(struct usb_xfer *xfer)
{
struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus);
/* put transfer on interrupt queue (again) */
usbd_transfer_enqueue(&sc->sc_bus.intr_q, xfer);
(void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus),
&sc->sc_config_msg[0], &sc->sc_config_msg[1]);
}
static void
xhci_configure_msg(struct usb_proc_msg *pm)
{
struct xhci_softc *sc;
struct xhci_endpoint_ext *pepext;
struct usb_xfer *xfer;
sc = XHCI_BUS2SC(((struct usb_bus_msg *)pm)->bus);
restart:
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
pepext = xhci_get_endpoint_ext(xfer->xroot->udev,
xfer->endpoint->edesc);
if ((pepext->trb_halted != 0) ||
(pepext->trb_running == 0)) {
uint16_t i;
/* clear halted and running */
pepext->trb_halted = 0;
pepext->trb_running = 0;
/* nuke remaining buffered transfers */
for (i = 0; i != (XHCI_MAX_TRANSFERS *
XHCI_MAX_STREAMS); i++) {
/*
* NOTE: We need to use the timeout
* error code here else existing
* isochronous clients can get
* confused:
*/
if (pepext->xfer[i] != NULL) {
xhci_device_done(pepext->xfer[i],
USB_ERR_TIMEOUT);
}
}
/*
* NOTE: The USB transfer cannot vanish in
* this state!
*/
USB_BUS_UNLOCK(&sc->sc_bus);
xhci_configure_reset_endpoint(xfer);
USB_BUS_LOCK(&sc->sc_bus);
/* check if halted is still cleared */
if (pepext->trb_halted == 0) {
pepext->trb_running = 1;
memset(pepext->trb_index, 0,
sizeof(pepext->trb_index));
}
goto restart;
}
if (xfer->flags_int.did_dma_delay) {
/* remove transfer from interrupt queue (again) */
usbd_transfer_dequeue(xfer);
/* we are finally done */
usb_dma_delay_done_cb(xfer);
/* queue changed - restart */
goto restart;
}
}
TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) {
/* try to insert xfer on HW queue */
xhci_transfer_insert(xfer);
/* try to multi buffer */
xhci_device_generic_multi_enter(xfer->endpoint,
xfer->stream_id, NULL);
}
}
static void
xhci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
struct usb_endpoint *ep)
{
struct xhci_endpoint_ext *pepext;
DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d\n",
ep, udev->address, edesc->bEndpointAddress, udev->flags.usb_mode);
if (udev->parent_hub == NULL) {
/* root HUB has special endpoint handling */
return;
}
ep->methods = &xhci_device_generic_methods;
pepext = xhci_get_endpoint_ext(udev, edesc);
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
}
static void
xhci_ep_uninit(struct usb_device *udev, struct usb_endpoint *ep)
{
}
static void
xhci_ep_clear_stall(struct usb_device *udev, struct usb_endpoint *ep)
{
struct xhci_endpoint_ext *pepext;
DPRINTF("\n");
if (udev->flags.usb_mode != USB_MODE_HOST) {
/* not supported */
return;
}
if (udev->parent_hub == NULL) {
/* root HUB has special endpoint handling */
return;
}
pepext = xhci_get_endpoint_ext(udev, ep->edesc);
USB_BUS_LOCK(udev->bus);
pepext->trb_halted = 1;
pepext->trb_running = 0;
USB_BUS_UNLOCK(udev->bus);
}
static usb_error_t
xhci_device_init(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
usb_error_t err;
uint8_t temp;
/* no init for root HUB */
if (udev->parent_hub == NULL)
return (0);
XHCI_CMD_LOCK(sc);
/* set invalid default */
udev->controller_slot_id = sc->sc_noslot + 1;
/* try to get a new slot ID from the XHCI */
err = xhci_cmd_enable_slot(sc, &temp);
if (err) {
XHCI_CMD_UNLOCK(sc);
return (err);
}
if (temp > sc->sc_noslot) {
XHCI_CMD_UNLOCK(sc);
return (USB_ERR_BAD_ADDRESS);
}
if (sc->sc_hw.devs[temp].state != XHCI_ST_DISABLED) {
DPRINTF("slot %u already allocated.\n", temp);
XHCI_CMD_UNLOCK(sc);
return (USB_ERR_BAD_ADDRESS);
}
/* store slot ID for later reference */
udev->controller_slot_id = temp;
/* reset data structure */
memset(&sc->sc_hw.devs[temp], 0, sizeof(sc->sc_hw.devs[0]));
/* mark the slot as allocated */
sc->sc_hw.devs[temp].state = XHCI_ST_ENABLED;
err = xhci_alloc_device_ext(udev);
XHCI_CMD_UNLOCK(sc);
/* get device into default state */
if (err == 0)
err = xhci_set_address(udev, NULL, 0);
return (err);
}
static void
xhci_device_uninit(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
/* no init for root HUB */
if (udev->parent_hub == NULL)
return;
XHCI_CMD_LOCK(sc);
index = udev->controller_slot_id;
if (index <= sc->sc_noslot) {
xhci_cmd_disable_slot(sc, index);
sc->sc_hw.devs[index].state = XHCI_ST_DISABLED;
/* free device extension */
xhci_free_device_ext(udev);
}
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_get_dma_delay(struct usb_device *udev, uint32_t *pus)
{
/*
* Wait until the hardware has finished any possible use of
* the transfer descriptor(s)
*/
*pus = 2048; /* microseconds */
}
static void
xhci_device_resume(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t n;
uint8_t p;
DPRINTF("\n");
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
XHCI_CMD_LOCK(sc);
/* blindly resume all endpoints */
USB_BUS_LOCK(udev->bus);
for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) {
for (p = 0; p != XHCI_MAX_STREAMS; p++) {
XWRITE4(sc, door, XHCI_DOORBELL(index),
n | XHCI_DB_SID_SET(p));
}
}
USB_BUS_UNLOCK(udev->bus);
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_device_suspend(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
uint8_t index;
uint8_t n;
usb_error_t err;
DPRINTF("\n");
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
XHCI_CMD_LOCK(sc);
/* blindly suspend all endpoints */
for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) {
err = xhci_cmd_stop_ep(sc, 1, n, index);
if (err != 0) {
DPRINTF("Failed to suspend endpoint "
"%u on slot %u (ignored).\n", n, index);
}
}
XHCI_CMD_UNLOCK(sc);
}
static void
xhci_set_hw_power(struct usb_bus *bus)
{
DPRINTF("\n");
}
static void
xhci_device_state_change(struct usb_device *udev)
{
struct xhci_softc *sc = XHCI_BUS2SC(udev->bus);
struct usb_page_search buf_inp;
usb_error_t err;
uint8_t index;
/* check for root HUB */
if (udev->parent_hub == NULL)
return;
index = udev->controller_slot_id;
DPRINTF("\n");
if (usb_get_device_state(udev) == USB_STATE_CONFIGURED) {
err = uhub_query_info(udev, &sc->sc_hw.devs[index].nports,
&sc->sc_hw.devs[index].tt);
if (err != 0)
sc->sc_hw.devs[index].nports = 0;
}
XHCI_CMD_LOCK(sc);
switch (usb_get_device_state(udev)) {
case USB_STATE_POWERED:
if (sc->sc_hw.devs[index].state == XHCI_ST_DEFAULT)
break;
/* set default state */
sc->sc_hw.devs[index].state = XHCI_ST_DEFAULT;
/* reset number of contexts */
sc->sc_hw.devs[index].context_num = 0;
err = xhci_cmd_reset_dev(sc, index);
if (err != 0) {
DPRINTF("Device reset failed "
"for slot %u.\n", index);
}
break;
case USB_STATE_ADDRESSED:
if (sc->sc_hw.devs[index].state == XHCI_ST_ADDRESSED)
break;
sc->sc_hw.devs[index].state = XHCI_ST_ADDRESSED;
/* set configure mask to slot only */
xhci_configure_mask(udev, 1, 0);
/* deconfigure all endpoints, except EP0 */
err = xhci_cmd_configure_ep(sc, 0, 1, index);
if (err) {
DPRINTF("Failed to deconfigure "
"slot %u.\n", index);
}
break;
case USB_STATE_CONFIGURED:
if (sc->sc_hw.devs[index].state == XHCI_ST_CONFIGURED)
break;
/* set configured state */
sc->sc_hw.devs[index].state = XHCI_ST_CONFIGURED;
/* reset number of contexts */
sc->sc_hw.devs[index].context_num = 0;
usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp);
xhci_configure_mask(udev, 3, 0);
err = xhci_configure_device(udev);
if (err != 0) {
DPRINTF("Could not configure device "
"at slot %u.\n", index);
}
err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index);
if (err != 0) {
DPRINTF("Could not evaluate device "
"context at slot %u.\n", index);
}
break;
default:
break;
}
XHCI_CMD_UNLOCK(sc);
}
static usb_error_t
xhci_set_endpoint_mode(struct usb_device *udev, struct usb_endpoint *ep,
uint8_t ep_mode)
{
switch (ep_mode) {
case USB_EP_MODE_DEFAULT:
return (0);
case USB_EP_MODE_STREAMS:
if (xhcistreams == 0 ||
(ep->edesc->bmAttributes & UE_XFERTYPE) != UE_BULK ||
udev->speed != USB_SPEED_SUPER)
return (USB_ERR_INVAL);
return (0);
default:
return (USB_ERR_INVAL);
}
}
static const struct usb_bus_methods xhci_bus_methods = {
.endpoint_init = xhci_ep_init,
.endpoint_uninit = xhci_ep_uninit,
.xfer_setup = xhci_xfer_setup,
.xfer_unsetup = xhci_xfer_unsetup,
.get_dma_delay = xhci_get_dma_delay,
.device_init = xhci_device_init,
.device_uninit = xhci_device_uninit,
.device_resume = xhci_device_resume,
.device_suspend = xhci_device_suspend,
.set_hw_power = xhci_set_hw_power,
.roothub_exec = xhci_roothub_exec,
.xfer_poll = xhci_do_poll,
.start_dma_delay = xhci_start_dma_delay,
.set_address = xhci_set_address,
.clear_stall = xhci_ep_clear_stall,
.device_state_change = xhci_device_state_change,
.set_hw_power_sleep = xhci_set_hw_power_sleep,
.set_endpoint_mode = xhci_set_endpoint_mode,
};
Index: head/sys/dev/usb/storage/umass.c
===================================================================
--- head/sys/dev/usb/storage/umass.c (revision 327172)
+++ head/sys/dev/usb/storage/umass.c (revision 327173)
@@ -1,3020 +1,3019 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1999 MAEKAWA Masahide <bishop@rr.iij4u.or.jp>,
* Nick Hibma <n_hibma@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
* $NetBSD: umass.c,v 1.28 2000/04/02 23:46:53 augustss Exp $
*/
/* Also already merged from NetBSD:
* $NetBSD: umass.c,v 1.67 2001/11/25 19:05:22 augustss Exp $
* $NetBSD: umass.c,v 1.90 2002/11/04 19:17:33 pooka Exp $
* $NetBSD: umass.c,v 1.108 2003/11/07 17:03:25 wiz Exp $
* $NetBSD: umass.c,v 1.109 2003/12/04 13:57:31 keihan Exp $
*/
/*
* Universal Serial Bus Mass Storage Class specs:
* http://www.usb.org/developers/devclass_docs/usb_msc_overview_1.2.pdf
* http://www.usb.org/developers/devclass_docs/usbmassbulk_10.pdf
* http://www.usb.org/developers/devclass_docs/usb_msc_cbi_1.1.pdf
* http://www.usb.org/developers/devclass_docs/usbmass-ufi10.pdf
*/
/*
* Ported to NetBSD by Lennart Augustsson <augustss@NetBSD.org>.
* Parts of the code written by Jason R. Thorpe <thorpej@shagadelic.org>.
*/
/*
* The driver handles 3 Wire Protocols
* - Command/Bulk/Interrupt (CBI)
* - Command/Bulk/Interrupt with Command Completion Interrupt (CBI with CCI)
* - Mass Storage Bulk-Only (BBB)
* (BBB refers to Bulk/Bulk/Bulk for Command/Data/Status phases)
*
* Over these wire protocols it handles the following command protocols
* - SCSI
* - UFI (floppy command set)
* - 8070i (ATAPI)
*
* UFI and 8070i (ATAPI) are transformed versions of the SCSI command set. The
* sc->sc_transform method is used to convert the commands into the appropriate
* format (if at all necessary). For example, UFI requires all commands to be
* 12 bytes in length amongst other things.
*
* The source code below is marked and can be split into a number of pieces
* (in this order):
*
* - probe/attach/detach
* - generic transfer routines
* - BBB
* - CBI
* - CBI_I (in addition to functions from CBI)
* - CAM (Common Access Method)
* - SCSI
* - UFI
* - 8070i (ATAPI)
*
* The protocols are implemented using a state machine, for the transfers as
* well as for the resets. The state machine is contained in umass_t_*_callback.
* The state machine is started through either umass_command_start() or
* umass_reset().
*
* The reason for doing this is a) CAM performs a lot better this way and b) it
* avoids using tsleep from interrupt context (for example after a failed
* transfer).
*/
/*
* The SCSI related part of this driver has been derived from the
* dev/ppbus/vpo.c driver, by Nicolas Souchu (nsouch@FreeBSD.org).
*
* The CAM layer uses so called actions which are messages sent to the host
* adapter for completion. The actions come in through umass_cam_action. The
* appropriate block of routines is called depending on the transport protocol
* in use. When the transfer has finished, these routines call
* umass_cam_cb again to complete the CAM command.
*/
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#include "usbdevs.h"
#include <dev/usb/quirk/usb_quirk.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/cam_periph.h>
#ifdef USB_DEBUG
#define DIF(m, x) \
do { \
if (umass_debug & (m)) { x ; } \
} while (0)
#define DPRINTF(sc, m, fmt, ...) \
do { \
if (umass_debug & (m)) { \
printf("%s:%s: " fmt, \
(sc) ? (const char *)(sc)->sc_name : \
(const char *)"umassX", \
__FUNCTION__ ,## __VA_ARGS__); \
} \
} while (0)
#define UDMASS_GEN 0x00010000 /* general */
#define UDMASS_SCSI 0x00020000 /* scsi */
#define UDMASS_UFI 0x00040000 /* ufi command set */
#define UDMASS_ATAPI 0x00080000 /* 8070i command set */
#define UDMASS_CMD (UDMASS_SCSI|UDMASS_UFI|UDMASS_ATAPI)
#define UDMASS_USB 0x00100000 /* USB general */
#define UDMASS_BBB 0x00200000 /* Bulk-Only transfers */
#define UDMASS_CBI 0x00400000 /* CBI transfers */
#define UDMASS_WIRE (UDMASS_BBB|UDMASS_CBI)
#define UDMASS_ALL 0xffff0000 /* all of the above */
static int umass_debug;
static int umass_throttle;
static SYSCTL_NODE(_hw_usb, OID_AUTO, umass, CTLFLAG_RW, 0, "USB umass");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, debug, CTLFLAG_RWTUN,
&umass_debug, 0, "umass debug level");
SYSCTL_INT(_hw_usb_umass, OID_AUTO, throttle, CTLFLAG_RWTUN,
&umass_throttle, 0, "Forced delay between commands in milliseconds");
#else
#define DIF(...) do { } while (0)
#define DPRINTF(...) do { } while (0)
#endif
#define UMASS_BULK_SIZE (1 << 17)
#define UMASS_CBI_DIAGNOSTIC_CMDLEN 12 /* bytes */
#define UMASS_MAX_CMDLEN MAX(12, CAM_MAX_CDBLEN) /* bytes */
/* USB transfer definitions */
#define UMASS_T_BBB_RESET1 0 /* Bulk-Only */
#define UMASS_T_BBB_RESET2 1
#define UMASS_T_BBB_RESET3 2
#define UMASS_T_BBB_COMMAND 3
#define UMASS_T_BBB_DATA_READ 4
#define UMASS_T_BBB_DATA_RD_CS 5
#define UMASS_T_BBB_DATA_WRITE 6
#define UMASS_T_BBB_DATA_WR_CS 7
#define UMASS_T_BBB_STATUS 8
#define UMASS_T_BBB_MAX 9
#define UMASS_T_CBI_RESET1 0 /* CBI */
#define UMASS_T_CBI_RESET2 1
#define UMASS_T_CBI_RESET3 2
#define UMASS_T_CBI_COMMAND 3
#define UMASS_T_CBI_DATA_READ 4
#define UMASS_T_CBI_DATA_RD_CS 5
#define UMASS_T_CBI_DATA_WRITE 6
#define UMASS_T_CBI_DATA_WR_CS 7
#define UMASS_T_CBI_STATUS 8
#define UMASS_T_CBI_RESET4 9
#define UMASS_T_CBI_MAX 10
#define UMASS_T_MAX MAX(UMASS_T_CBI_MAX, UMASS_T_BBB_MAX)
/* Generic definitions */
/* Direction for transfer */
#define DIR_NONE 0
#define DIR_IN 1
#define DIR_OUT 2
/* device name */
#define DEVNAME "umass"
#define DEVNAME_SIM "umass-sim"
/* Approximate maximum transfer speeds (assumes 33% overhead). */
#define UMASS_FULL_TRANSFER_SPEED 1000
#define UMASS_HIGH_TRANSFER_SPEED 40000
#define UMASS_SUPER_TRANSFER_SPEED 400000
#define UMASS_FLOPPY_TRANSFER_SPEED 20
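/*
 * Illustrative arithmetic (added note, not part of the original
 * source): full speed signals at 12 Mbit/s = 1500 kByte/s and high
 * speed at 480 Mbit/s = 60000 kByte/s; discounting roughly 33%
 * protocol overhead yields the 1000 and 40000 kByte/s figures above.
 */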
#define UMASS_TIMEOUT 5000 /* ms */
/* CAM specific definitions */
#define UMASS_SCSIID_MAX 1 /* maximum number of drives expected */
#define UMASS_SCSIID_HOST UMASS_SCSIID_MAX
/* Bulk-Only features */
#define UR_BBB_RESET 0xff /* Bulk-Only reset */
#define UR_BBB_GET_MAX_LUN 0xfe /* Get maximum lun */
/* Command Block Wrapper */
typedef struct {
uDWord dCBWSignature;
#define CBWSIGNATURE 0x43425355
uDWord dCBWTag;
uDWord dCBWDataTransferLength;
uByte bCBWFlags;
#define CBWFLAGS_OUT 0x00
#define CBWFLAGS_IN 0x80
uByte bCBWLUN;
uByte bCDBLength;
#define CBWCDBLENGTH 16
uByte CBWCDB[CBWCDBLENGTH];
} __packed umass_bbb_cbw_t;
#define UMASS_BBB_CBW_SIZE 31
/* Command Status Wrapper */
typedef struct {
uDWord dCSWSignature;
#define CSWSIGNATURE 0x53425355
#define CSWSIGNATURE_IMAGINATION_DBX1 0x43425355
#define CSWSIGNATURE_OLYMPUS_C1 0x55425355
uDWord dCSWTag;
uDWord dCSWDataResidue;
uByte bCSWStatus;
#define CSWSTATUS_GOOD 0x0
#define CSWSTATUS_FAILED 0x1
#define CSWSTATUS_PHASE 0x2
} __packed umass_bbb_csw_t;
#define UMASS_BBB_CSW_SIZE 13
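/*
 * Hedged sketch (illustration only, not part of the original source):
 * the wire sizes implied by the packed structures above are
 * 3*4 + 3 + 16 = 31 bytes for the CBW and 3*4 + 1 = 13 bytes for the
 * CSW, which can be pinned down at compile time:
 */
CTASSERT(sizeof(umass_bbb_cbw_t) == UMASS_BBB_CBW_SIZE);
CTASSERT(sizeof(umass_bbb_csw_t) == UMASS_BBB_CSW_SIZE);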
/* CBI features */
#define UR_CBI_ADSC 0x00
typedef union {
struct {
uint8_t type;
#define IDB_TYPE_CCI 0x00
uint8_t value;
#define IDB_VALUE_PASS 0x00
#define IDB_VALUE_FAIL 0x01
#define IDB_VALUE_PHASE 0x02
#define IDB_VALUE_PERSISTENT 0x03
#define IDB_VALUE_STATUS_MASK 0x03
} __packed common;
struct {
uint8_t asc;
uint8_t ascq;
} __packed ufi;
} __packed umass_cbi_sbl_t;
struct umass_softc; /* see below */
typedef void (umass_callback_t)(struct umass_softc *sc, union ccb *ccb,
uint32_t residue, uint8_t status);
#define STATUS_CMD_OK 0 /* everything ok */
#define STATUS_CMD_UNKNOWN 1 /* will have to fetch sense */
#define STATUS_CMD_FAILED 2 /* transfer was ok, command failed */
#define STATUS_WIRE_FAILED 3 /* couldn't even get command across */
typedef uint8_t (umass_transform_t)(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len);
/* Wire and command protocol */
#define UMASS_PROTO_BBB 0x0001 /* USB wire protocol */
#define UMASS_PROTO_CBI 0x0002
#define UMASS_PROTO_CBI_I 0x0004
#define UMASS_PROTO_WIRE 0x00ff /* USB wire protocol mask */
#define UMASS_PROTO_SCSI 0x0100 /* command protocol */
#define UMASS_PROTO_ATAPI 0x0200
#define UMASS_PROTO_UFI 0x0400
#define UMASS_PROTO_RBC 0x0800
#define UMASS_PROTO_COMMAND 0xff00 /* command protocol mask */
/* Device specific quirks */
#define NO_QUIRKS 0x0000
/*
* The drive does not support Test Unit Ready. Convert to Start Unit
*/
#define NO_TEST_UNIT_READY 0x0001
/*
* The drive does not reset the Unit Attention state after REQUEST
* SENSE has been sent. The INQUIRY command does not reset the UA
* either, and so CAM runs in circles trying to retrieve the initial
* INQUIRY data.
*/
#define RS_NO_CLEAR_UA 0x0002
/* The drive does not support START STOP. */
#define NO_START_STOP 0x0004
/* Don't ask for full inquiry data (255b). */
#define FORCE_SHORT_INQUIRY 0x0008
/* Needs to be initialised the Shuttle way */
#define SHUTTLE_INIT 0x0010
/* Drive needs to be switched to alternate iface 1 */
#define ALT_IFACE_1 0x0020
/* Drive does not do 1Mb/s, but just floppy speeds (20kb/s) */
#define FLOPPY_SPEED 0x0040
/* The device can't count and gets the residue of transfers wrong */
#define IGNORE_RESIDUE 0x0080
/* No GetMaxLun call */
#define NO_GETMAXLUN 0x0100
/* The device uses a weird CSWSIGNATURE. */
#define WRONG_CSWSIG 0x0200
/* Device cannot handle INQUIRY so fake a generic response */
#define NO_INQUIRY 0x0400
/* Device cannot handle INQUIRY EVPD, return CHECK CONDITION */
#define NO_INQUIRY_EVPD 0x0800
/* Pad all RBC requests to 12 bytes. */
#define RBC_PAD_TO_12 0x1000
/*
* Device reports number of sectors from READ_CAPACITY, not max
* sector number.
*/
#define READ_CAPACITY_OFFBY1 0x2000
/*
* Device cannot handle a SCSI synchronize cache command. Normally
* this quirk would be handled in the cam layer, but for IDE bridges
* we need to associate the quirk with the bridge and not the
* underlying disk device. This is handled by faking a success
* result.
*/
#define NO_SYNCHRONIZE_CACHE 0x4000
/* Device does not support 'PREVENT/ALLOW MEDIUM REMOVAL'. */
#define NO_PREVENT_ALLOW 0x8000
struct umass_softc {
struct scsi_sense cam_scsi_sense;
struct scsi_test_unit_ready cam_scsi_test_unit_ready;
struct mtx sc_mtx;
struct {
uint8_t *data_ptr;
union ccb *ccb;
umass_callback_t *callback;
uint32_t data_len; /* bytes */
uint32_t data_rem; /* bytes */
uint32_t data_timeout; /* ms */
uint32_t actlen; /* bytes */
uint8_t cmd_data[UMASS_MAX_CMDLEN];
uint8_t cmd_len; /* bytes */
uint8_t dir;
uint8_t lun;
} sc_transfer;
/* Bulk specific variables for transfers in progress */
umass_bbb_cbw_t cbw; /* command block wrapper */
umass_bbb_csw_t csw; /* command status wrapper */
/* CBI specific variables for transfers in progress */
umass_cbi_sbl_t sbl; /* status block */
device_t sc_dev;
struct usb_device *sc_udev;
struct cam_sim *sc_sim; /* SCSI Interface Module */
struct usb_xfer *sc_xfer[UMASS_T_MAX];
/*
* The command transform function is used to convert the SCSI
* commands into their derivatives, like UFI, ATAPI, and friends.
*/
umass_transform_t *sc_transform;
uint32_t sc_unit;
uint32_t sc_quirks; /* they got it almost right */
uint32_t sc_proto; /* wire and cmd protocol */
uint8_t sc_name[16];
uint8_t sc_iface_no; /* interface number */
uint8_t sc_maxlun; /* maximum LUN number, inclusive */
uint8_t sc_last_xfer_index;
uint8_t sc_status_try;
};
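/*
 * Hedged sketch (illustration only, not part of the original source):
 * the kind of conversion sc_transform performs for the UFI and ATAPI
 * command sets, which require fixed 12-byte command blocks -- copy the
 * SCSI CDB and zero-pad the remainder.  The helper name is
 * hypothetical; the real code lives in umass_ufi_transform() and
 * umass_atapi_transform() below.
 */
static uint8_t
example_pad_cmd_to_12(uint8_t *dst, const uint8_t *cmd, uint8_t cmd_len)
{
	if (cmd_len == 0 || cmd_len > 12)
		return (0);		/* cannot be represented */
	memcpy(dst, cmd, cmd_len);
	memset(dst + cmd_len, 0, 12 - cmd_len);
	return (1);
}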
struct umass_probe_proto {
uint32_t quirks;
uint32_t proto;
int error;
};
/* prototypes */
static device_probe_t umass_probe;
static device_attach_t umass_attach;
static device_detach_t umass_detach;
static usb_callback_t umass_tr_error;
static usb_callback_t umass_t_bbb_reset1_callback;
static usb_callback_t umass_t_bbb_reset2_callback;
static usb_callback_t umass_t_bbb_reset3_callback;
static usb_callback_t umass_t_bbb_command_callback;
static usb_callback_t umass_t_bbb_data_read_callback;
static usb_callback_t umass_t_bbb_data_rd_cs_callback;
static usb_callback_t umass_t_bbb_data_write_callback;
static usb_callback_t umass_t_bbb_data_wr_cs_callback;
static usb_callback_t umass_t_bbb_status_callback;
static usb_callback_t umass_t_cbi_reset1_callback;
static usb_callback_t umass_t_cbi_reset2_callback;
static usb_callback_t umass_t_cbi_reset3_callback;
static usb_callback_t umass_t_cbi_reset4_callback;
static usb_callback_t umass_t_cbi_command_callback;
static usb_callback_t umass_t_cbi_data_read_callback;
static usb_callback_t umass_t_cbi_data_rd_cs_callback;
static usb_callback_t umass_t_cbi_data_write_callback;
static usb_callback_t umass_t_cbi_data_wr_cs_callback;
static usb_callback_t umass_t_cbi_status_callback;
static void umass_cancel_ccb(struct umass_softc *);
static void umass_init_shuttle(struct umass_softc *);
static void umass_reset(struct umass_softc *);
static void umass_t_bbb_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static void umass_command_start(struct umass_softc *, uint8_t, void *,
uint32_t, uint32_t, umass_callback_t *, union ccb *);
static uint8_t umass_bbb_get_max_lun(struct umass_softc *);
static void umass_cbi_start_status(struct umass_softc *);
static void umass_t_cbi_data_clear_stall_callback(struct usb_xfer *,
uint8_t, uint8_t, usb_error_t);
static int umass_cam_attach_sim(struct umass_softc *);
static void umass_cam_attach(struct umass_softc *);
static void umass_cam_detach_sim(struct umass_softc *);
static void umass_cam_action(struct cam_sim *, union ccb *);
static void umass_cam_poll(struct cam_sim *);
static void umass_cam_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_sense_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static void umass_cam_quirk_cb(struct umass_softc *, union ccb *, uint32_t,
uint8_t);
static uint8_t umass_scsi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_rbc_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_ufi_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_atapi_transform(struct umass_softc *, uint8_t *,
uint8_t);
static uint8_t umass_no_transform(struct umass_softc *, uint8_t *, uint8_t);
static uint8_t umass_std_transform(struct umass_softc *, union ccb *, uint8_t
*, uint8_t);
#ifdef USB_DEBUG
static void umass_bbb_dump_cbw(struct umass_softc *, umass_bbb_cbw_t *);
static void umass_bbb_dump_csw(struct umass_softc *, umass_bbb_csw_t *);
static void umass_cbi_dump_cmd(struct umass_softc *, void *, uint8_t);
static void umass_dump_buffer(struct umass_softc *, uint8_t *, uint32_t,
uint32_t);
#endif
static struct usb_config umass_bbb_config[UMASS_T_BBB_MAX] = {
[UMASS_T_BBB_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_BBB_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_BBB_COMMAND] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = sizeof(umass_bbb_cbw_t),
.callback = &umass_t_bbb_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_bbb_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_BBB_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_bbb_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_BBB_STATUS] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = sizeof(umass_bbb_csw_t),
.flags = {.short_xfer_ok = 1,},
.callback = &umass_t_bbb_status_callback,
.timeout = 5000, /* ms */
},
};
static struct usb_config umass_cbi_config[UMASS_T_CBI_MAX] = {
[UMASS_T_CBI_RESET1] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_CBI_DIAGNOSTIC_CMDLEN),
.callback = &umass_t_cbi_reset1_callback,
.timeout = 5000, /* 5 seconds */
.interval = 500, /* 500 milliseconds */
},
[UMASS_T_CBI_RESET2] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset2_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_RESET3] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset3_callback,
.timeout = 5000, /* 5 seconds */
.interval = 50, /* 50 milliseconds */
},
[UMASS_T_CBI_COMMAND] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = (sizeof(struct usb_device_request) +
UMASS_MAX_CMDLEN),
.callback = &umass_t_cbi_command_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_READ] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_read_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_RD_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_rd_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_DATA_WRITE] = {
.type = UE_BULK,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_OUT,
.bufsize = UMASS_BULK_SIZE,
.flags = {.proxy_buffer = 1,.short_xfer_ok = 1,.ext_buffer=1,},
.callback = &umass_t_cbi_data_write_callback,
.timeout = 0, /* overwritten later */
},
[UMASS_T_CBI_DATA_WR_CS] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_data_wr_cs_callback,
.timeout = 5000, /* 5 seconds */
},
[UMASS_T_CBI_STATUS] = {
.type = UE_INTERRUPT,
.endpoint = UE_ADDR_ANY,
.direction = UE_DIR_IN,
.flags = {.short_xfer_ok = 1,.no_pipe_ok = 1,},
.bufsize = sizeof(umass_cbi_sbl_t),
.callback = &umass_t_cbi_status_callback,
.timeout = 5000, /* ms */
},
[UMASS_T_CBI_RESET4] = {
.type = UE_CONTROL,
.endpoint = 0x00, /* Control pipe */
.direction = UE_DIR_ANY,
.bufsize = sizeof(struct usb_device_request),
.callback = &umass_t_cbi_reset4_callback,
.timeout = 5000, /* ms */
},
};
/* If device cannot return valid inquiry data, fake it */
static const uint8_t fake_inq_data[SHORT_INQUIRY_LENGTH] = {
0, /* removable */ 0x80, SCSI_REV_2, SCSI_REV_2,
/* additional_length */ 31, 0, 0, 0
};
#define UFI_COMMAND_LENGTH 12 /* UFI commands are always 12 bytes */
#define ATAPI_COMMAND_LENGTH 12 /* ATAPI commands are always 12 bytes */
static devclass_t umass_devclass;
static device_method_t umass_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, umass_probe),
DEVMETHOD(device_attach, umass_attach),
DEVMETHOD(device_detach, umass_detach),
DEVMETHOD_END
};
static driver_t umass_driver = {
.name = "umass",
.methods = umass_methods,
.size = sizeof(struct umass_softc),
};
static const STRUCT_USB_HOST_ID __used umass_devs[] = {
/* generic mass storage class */
{USB_IFACE_CLASS(UICLASS_MASS),},
};
DRIVER_MODULE(umass, uhub, umass_driver, umass_devclass, NULL, 0);
MODULE_DEPEND(umass, usb, 1, 1, 1);
MODULE_DEPEND(umass, cam, 1, 1, 1);
MODULE_VERSION(umass, 1);
USB_PNP_HOST_INFO(umass_devs);
/*
* USB device probe/attach/detach
*/
static uint16_t
umass_get_proto(struct usb_interface *iface)
{
struct usb_interface_descriptor *id;
uint16_t retval;
retval = 0;
/* Check for a standards compliant device */
id = usbd_get_interface_descriptor(iface);
if ((id == NULL) ||
(id->bInterfaceClass != UICLASS_MASS)) {
goto done;
}
switch (id->bInterfaceSubClass) {
case UISUBCLASS_SCSI:
retval |= UMASS_PROTO_SCSI;
break;
case UISUBCLASS_UFI:
retval |= UMASS_PROTO_UFI;
break;
case UISUBCLASS_RBC:
retval |= UMASS_PROTO_RBC;
break;
case UISUBCLASS_SFF8020I:
case UISUBCLASS_SFF8070I:
retval |= UMASS_PROTO_ATAPI;
break;
default:
goto done;
}
switch (id->bInterfaceProtocol) {
case UIPROTO_MASS_CBI:
retval |= UMASS_PROTO_CBI;
break;
case UIPROTO_MASS_CBI_I:
retval |= UMASS_PROTO_CBI_I;
break;
case UIPROTO_MASS_BBB_OLD:
case UIPROTO_MASS_BBB:
retval |= UMASS_PROTO_BBB;
break;
default:
goto done;
}
done:
return (retval);
}
/*
* Match the device we are seeing with the devices supported.
*/
static struct umass_probe_proto
umass_probe_proto(device_t dev, struct usb_attach_arg *uaa)
{
struct umass_probe_proto ret;
uint32_t quirks = NO_QUIRKS;
uint32_t proto = umass_get_proto(uaa->iface);
memset(&ret, 0, sizeof(ret));
ret.error = BUS_PROBE_GENERIC;
/* Search for protocol enforcement */
if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_BBB)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_BBB;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_WIRE_CBI_I)) {
proto &= ~UMASS_PROTO_WIRE;
proto |= UMASS_PROTO_CBI_I;
}
if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_SCSI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_SCSI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_ATAPI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_ATAPI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_UFI)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_UFI;
} else if (usb_test_quirk(uaa, UQ_MSC_FORCE_PROTO_RBC)) {
proto &= ~UMASS_PROTO_COMMAND;
proto |= UMASS_PROTO_RBC;
}
/* Check if the protocol is invalid */
if ((proto & UMASS_PROTO_COMMAND) == 0) {
ret.error = ENXIO;
goto done;
}
if ((proto & UMASS_PROTO_WIRE) == 0) {
ret.error = ENXIO;
goto done;
}
/* Search for quirks */
if (usb_test_quirk(uaa, UQ_MSC_NO_TEST_UNIT_READY))
quirks |= NO_TEST_UNIT_READY;
if (usb_test_quirk(uaa, UQ_MSC_NO_RS_CLEAR_UA))
quirks |= RS_NO_CLEAR_UA;
if (usb_test_quirk(uaa, UQ_MSC_NO_START_STOP))
quirks |= NO_START_STOP;
if (usb_test_quirk(uaa, UQ_MSC_NO_GETMAXLUN))
quirks |= NO_GETMAXLUN;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY))
quirks |= NO_INQUIRY;
if (usb_test_quirk(uaa, UQ_MSC_NO_INQUIRY_EVPD))
quirks |= NO_INQUIRY_EVPD;
if (usb_test_quirk(uaa, UQ_MSC_NO_PREVENT_ALLOW))
quirks |= NO_PREVENT_ALLOW;
if (usb_test_quirk(uaa, UQ_MSC_NO_SYNC_CACHE))
quirks |= NO_SYNCHRONIZE_CACHE;
if (usb_test_quirk(uaa, UQ_MSC_SHUTTLE_INIT))
quirks |= SHUTTLE_INIT;
if (usb_test_quirk(uaa, UQ_MSC_ALT_IFACE_1))
quirks |= ALT_IFACE_1;
if (usb_test_quirk(uaa, UQ_MSC_FLOPPY_SPEED))
quirks |= FLOPPY_SPEED;
if (usb_test_quirk(uaa, UQ_MSC_IGNORE_RESIDUE))
quirks |= IGNORE_RESIDUE;
if (usb_test_quirk(uaa, UQ_MSC_WRONG_CSWSIG))
quirks |= WRONG_CSWSIG;
if (usb_test_quirk(uaa, UQ_MSC_RBC_PAD_TO_12))
quirks |= RBC_PAD_TO_12;
if (usb_test_quirk(uaa, UQ_MSC_READ_CAP_OFFBY1))
quirks |= READ_CAPACITY_OFFBY1;
if (usb_test_quirk(uaa, UQ_MSC_FORCE_SHORT_INQ))
quirks |= FORCE_SHORT_INQUIRY;
done:
ret.quirks = quirks;
ret.proto = proto;
return (ret);
}
static int
umass_probe(device_t dev)
{
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp;
if (uaa->usb_mode != USB_MODE_HOST) {
return (ENXIO);
}
temp = umass_probe_proto(dev, uaa);
return (temp.error);
}
static int
umass_attach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
struct usb_attach_arg *uaa = device_get_ivars(dev);
struct umass_probe_proto temp = umass_probe_proto(dev, uaa);
struct usb_interface_descriptor *id;
int err;
/*
* NOTE: the softc struct is cleared in device_set_driver.
* We can safely call umass_detach without specifically
* initializing the struct.
*/
sc->sc_dev = dev;
sc->sc_udev = uaa->device;
sc->sc_proto = temp.proto;
sc->sc_quirks = temp.quirks;
sc->sc_unit = device_get_unit(dev);
snprintf(sc->sc_name, sizeof(sc->sc_name),
"%s", device_get_nameunit(dev));
device_set_usb_desc(dev);
mtx_init(&sc->sc_mtx, device_get_nameunit(dev),
NULL, MTX_DEF | MTX_RECURSE);
/* get interface index */
id = usbd_get_interface_descriptor(uaa->iface);
if (id == NULL) {
device_printf(dev, "failed to get "
"interface number\n");
goto detach;
}
sc->sc_iface_no = id->bInterfaceNumber;
#ifdef USB_DEBUG
device_printf(dev, " ");
switch (sc->sc_proto & UMASS_PROTO_COMMAND) {
case UMASS_PROTO_SCSI:
printf("SCSI");
break;
case UMASS_PROTO_ATAPI:
printf("8070i (ATAPI)");
break;
case UMASS_PROTO_UFI:
printf("UFI");
break;
case UMASS_PROTO_RBC:
printf("RBC");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_COMMAND);
break;
}
printf(" over ");
switch (sc->sc_proto & UMASS_PROTO_WIRE) {
case UMASS_PROTO_BBB:
printf("Bulk-Only");
break;
case UMASS_PROTO_CBI: /* uses Command/Bulk pipes */
printf("CBI");
break;
case UMASS_PROTO_CBI_I: /* uses Command/Bulk/Interrupt pipes */
printf("CBI with CCI");
break;
default:
printf("(unknown 0x%02x)",
sc->sc_proto & UMASS_PROTO_WIRE);
}
printf("; quirks = 0x%04x\n", sc->sc_quirks);
#endif
if (sc->sc_quirks & ALT_IFACE_1) {
err = usbd_set_alt_interface_index
(uaa->device, uaa->info.bIfaceIndex, 1);
if (err) {
DPRINTF(sc, UDMASS_USB, "could not switch to "
"Alt Interface 1\n");
goto detach;
}
}
/* allocate all required USB transfers */
if (sc->sc_proto & UMASS_PROTO_BBB) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_bbb_config,
UMASS_T_BBB_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
} else if (sc->sc_proto & (UMASS_PROTO_CBI | UMASS_PROTO_CBI_I)) {
err = usbd_transfer_setup(uaa->device,
&uaa->info.bIfaceIndex, sc->sc_xfer, umass_cbi_config,
UMASS_T_CBI_MAX, sc, &sc->sc_mtx);
/* skip reset first time */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
} else {
err = USB_ERR_INVAL;
}
if (err) {
device_printf(dev, "could not setup required "
"transfers, %s\n", usbd_errstr(err));
goto detach;
}
#ifdef USB_DEBUG
if (umass_throttle > 0) {
uint8_t x;
int iv;
iv = umass_throttle;
if (iv < 1)
iv = 1;
else if (iv > 8000)
iv = 8000;
for (x = 0; x != UMASS_T_MAX; x++) {
if (sc->sc_xfer[x] != NULL)
usbd_xfer_set_interval(sc->sc_xfer[x], iv);
}
}
#endif
sc->sc_transform =
(sc->sc_proto & UMASS_PROTO_SCSI) ? &umass_scsi_transform :
(sc->sc_proto & UMASS_PROTO_UFI) ? &umass_ufi_transform :
(sc->sc_proto & UMASS_PROTO_ATAPI) ? &umass_atapi_transform :
(sc->sc_proto & UMASS_PROTO_RBC) ? &umass_rbc_transform :
&umass_no_transform;
/* from here onwards the device can be used. */
if (sc->sc_quirks & SHUTTLE_INIT) {
umass_init_shuttle(sc);
}
/* get the maximum LUN supported by the device */
if (((sc->sc_proto & UMASS_PROTO_WIRE) == UMASS_PROTO_BBB) &&
!(sc->sc_quirks & NO_GETMAXLUN))
sc->sc_maxlun = umass_bbb_get_max_lun(sc);
else
sc->sc_maxlun = 0;
/* Prepare the SCSI command block */
sc->cam_scsi_sense.opcode = REQUEST_SENSE;
sc->cam_scsi_test_unit_ready.opcode = TEST_UNIT_READY;
/* register the SIM */
err = umass_cam_attach_sim(sc);
if (err) {
goto detach;
}
/* scan the SIM */
umass_cam_attach(sc);
DPRINTF(sc, UDMASS_GEN, "Attach finished\n");
return (0); /* success */
detach:
umass_detach(dev);
return (ENXIO); /* failure */
}
static int
umass_detach(device_t dev)
{
struct umass_softc *sc = device_get_softc(dev);
DPRINTF(sc, UDMASS_USB, "\n");
/* tear down our state machine */
usbd_transfer_unsetup(sc->sc_xfer, UMASS_T_MAX);
mtx_lock(&sc->sc_mtx);
/* cancel any leftover CCB's */
umass_cancel_ccb(sc);
umass_cam_detach_sim(sc);
mtx_unlock(&sc->sc_mtx);
mtx_destroy(&sc->sc_mtx);
return (0); /* success */
}
static void
umass_init_shuttle(struct umass_softc *sc)
{
struct usb_device_request req;
- usb_error_t err;
uint8_t status[2] = {0, 0};
/*
* The Linux driver does this, but no one can tell us what the
* command does.
*/
req.bmRequestType = UT_READ_VENDOR_DEVICE;
req.bRequest = 1; /* XXX unknown command */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, sizeof(status));
- err = usbd_do_request(sc->sc_udev, NULL, &req, &status);
+ usbd_do_request(sc->sc_udev, NULL, &req, &status);
DPRINTF(sc, UDMASS_GEN, "Shuttle init returned 0x%02x%02x\n",
status[0], status[1]);
}
/*
* Generic functions to handle transfers
*/
static void
umass_transfer_start(struct umass_softc *sc, uint8_t xfer_index)
{
DPRINTF(sc, UDMASS_GEN, "transfer index = "
"%d\n", xfer_index);
if (sc->sc_xfer[xfer_index]) {
sc->sc_last_xfer_index = xfer_index;
usbd_transfer_start(sc->sc_xfer[xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
static void
umass_reset(struct umass_softc *sc)
{
DPRINTF(sc, UDMASS_GEN, "resetting device\n");
/*
* stop the last transfer, if not already stopped:
*/
usbd_transfer_stop(sc->sc_xfer[sc->sc_last_xfer_index]);
umass_transfer_start(sc, 0);
}
static void
umass_cancel_ccb(struct umass_softc *sc)
{
union ccb *ccb;
USB_MTX_ASSERT(&sc->sc_mtx, MA_OWNED);
ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = 0;
if (ccb) {
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_WIRE_FAILED);
}
}
static void
umass_tr_error(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
if (error != USB_ERR_CANCELLED) {
DPRINTF(sc, UDMASS_GEN, "transfer error, %s -> "
"reset\n", usbd_errstr(error));
}
umass_cancel_ccb(sc);
}
/*
* BBB protocol specific functions
*/
static void
umass_t_bbb_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_BBB_RESET2);
return;
case USB_ST_SETUP:
/*
* Reset recovery (5.3.4 in Universal Serial Bus Mass Storage Class)
*
* For Reset Recovery the host shall issue in the following order:
* a) a Bulk-Only Mass Storage Reset
* b) a Clear Feature HALT to the Bulk-In endpoint
* c) a Clear Feature HALT to the Bulk-Out endpoint
*
* This is done in 3 steps, using 3 transfers:
* UMASS_T_BBB_RESET1
* UMASS_T_BBB_RESET2
* UMASS_T_BBB_RESET3
*/
DPRINTF(sc, UDMASS_BBB, "BBB reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_BBB_RESET; /* bulk only reset */
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 0);
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frames(xfer, 1);
usbd_transfer_submit(xfer);
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
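/*
 * Hedged sketch (illustration only, not part of the original source):
 * step a) of the Reset Recovery sequence described in
 * umass_t_bbb_reset1_callback() above, issued as a single synchronous
 * control request instead of through the transfer state machine.  The
 * helper name is hypothetical.
 */
static usb_error_t
example_umass_bbb_reset(struct umass_softc *sc)
{
	struct usb_device_request req;

	req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
	req.bRequest = UR_BBB_RESET;	/* Bulk-Only Mass Storage Reset */
	USETW(req.wValue, 0);
	req.wIndex[0] = sc->sc_iface_no;
	req.wIndex[1] = 0;
	USETW(req.wLength, 0);

	return (usbd_do_request(sc->sc_udev, NULL, &req, NULL));
}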
static void
umass_t_bbb_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_RESET3,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_COMMAND,
UMASS_T_BBB_DATA_WRITE, error);
}
static void
umass_t_bbb_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
umass_transfer_start(sc, next_xfer);
return;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred;
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t tag;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start
(sc, ((sc->sc_transfer.dir == DIR_IN) ? UMASS_T_BBB_DATA_READ :
(sc->sc_transfer.dir == DIR_OUT) ? UMASS_T_BBB_DATA_WRITE :
UMASS_T_BBB_STATUS));
return;
case USB_ST_SETUP:
sc->sc_status_try = 0;
if (ccb) {
/*
* the initial value is not important,
* as long as the values are unique:
*/
tag = UGETDW(sc->cbw.dCBWTag) + 1;
USETDW(sc->cbw.dCBWSignature, CBWSIGNATURE);
USETDW(sc->cbw.dCBWTag, tag);
/*
* dCBWDataTransferLength:
* This field indicates the number of bytes of data that the host
* intends to transfer on the IN or OUT Bulk endpoint(as indicated by
* the Direction bit) during the execution of this command. If this
* field is set to 0, the device will expect that no data will be
* transferred IN or OUT during this command, regardless of the value
* of the Direction bit defined in dCBWFlags.
*/
USETDW(sc->cbw.dCBWDataTransferLength, sc->sc_transfer.data_len);
/*
* dCBWFlags:
* The bits of the Flags field are defined as follows:
* Bits 0-6 reserved
* Bit 7 Direction - this bit shall be ignored if the
* dCBWDataTransferLength field is zero.
* 0 = data Out from host to device
* 1 = data In from device to host
*/
sc->cbw.bCBWFlags = ((sc->sc_transfer.dir == DIR_IN) ?
CBWFLAGS_IN : CBWFLAGS_OUT);
sc->cbw.bCBWLUN = sc->sc_transfer.lun;
if (sc->sc_transfer.cmd_len > sizeof(sc->cbw.CBWCDB)) {
sc->sc_transfer.cmd_len = sizeof(sc->cbw.CBWCDB);
DPRINTF(sc, UDMASS_BBB, "Truncating long command!\n");
}
sc->cbw.bCDBLength = sc->sc_transfer.cmd_len;
/* copy SCSI command data */
memcpy(sc->cbw.CBWCDB, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
/* clear remaining command area */
memset(sc->cbw.CBWCDB +
sc->sc_transfer.cmd_len, 0,
sizeof(sc->cbw.CBWCDB) -
sc->sc_transfer.cmd_len);
DIF(UDMASS_BBB, umass_bbb_dump_cbw(sc, &sc->cbw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &sc->cbw, sizeof(sc->cbw));
usbd_xfer_set_frame_len(xfer, 0, sizeof(sc->cbw));
usbd_transfer_submit(xfer);
}
return;
default: /* Error */
umass_tr_error(xfer, error);
return;
}
}
static void
umass_t_bbb_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
static void
umass_t_bbb_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_READ, error);
}
static void
umass_t_bbb_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_BBB, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_transfer_start(sc, UMASS_T_BBB_STATUS);
return;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
return;
default: /* Error */
if (error == USB_ERR_CANCELLED) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_BBB_DATA_WR_CS);
}
return;
}
}
static void
umass_t_bbb_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_bbb_data_clear_stall_callback(xfer, UMASS_T_BBB_STATUS,
UMASS_T_BBB_DATA_WRITE, error);
}
static void
umass_t_bbb_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
/*
* Do a full reset if there is something wrong with the CSW:
*/
sc->sc_status_try = 1;
/* Zero missing parts of the CSW: */
if (actlen < (int)sizeof(sc->csw))
memset(&sc->csw, 0, sizeof(sc->csw));
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->csw, actlen);
DIF(UDMASS_BBB, umass_bbb_dump_csw(sc, &sc->csw));
residue = UGETDW(sc->csw.dCSWDataResidue);
if ((!residue) || (sc->sc_quirks & IGNORE_RESIDUE)) {
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
}
if (residue > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "truncating residue from %d "
"to %d bytes\n", residue, sc->sc_transfer.data_len);
residue = sc->sc_transfer.data_len;
}
/* translate weird command-status signatures: */
if (sc->sc_quirks & WRONG_CSWSIG) {
uint32_t temp = UGETDW(sc->csw.dCSWSignature);
if ((temp == CSWSIGNATURE_OLYMPUS_C1) ||
(temp == CSWSIGNATURE_IMAGINATION_DBX1)) {
USETDW(sc->csw.dCSWSignature, CSWSIGNATURE);
}
}
/* check CSW and handle eventual error */
if (UGETDW(sc->csw.dCSWSignature) != CSWSIGNATURE) {
DPRINTF(sc, UDMASS_BBB, "bad CSW signature 0x%08x != 0x%08x\n",
UGETDW(sc->csw.dCSWSignature), CSWSIGNATURE);
/*
* Invalid CSW: Wrong signature or wrong tag might
* indicate that we lost synchronization. Reset the
* device.
*/
goto tr_error;
} else if (UGETDW(sc->csw.dCSWTag) != UGETDW(sc->cbw.dCBWTag)) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: tag 0x%08x should be "
"0x%08x\n", UGETDW(sc->csw.dCSWTag),
UGETDW(sc->cbw.dCBWTag));
goto tr_error;
} else if (sc->csw.bCSWStatus > CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Invalid CSW: status %d > %d\n",
sc->csw.bCSWStatus, CSWSTATUS_PHASE);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_PHASE) {
DPRINTF(sc, UDMASS_BBB, "Phase error, residue = "
"%d\n", residue);
goto tr_error;
} else if (sc->sc_transfer.actlen > sc->sc_transfer.data_len) {
DPRINTF(sc, UDMASS_BBB, "Buffer overrun %d > %d\n",
sc->sc_transfer.actlen, sc->sc_transfer.data_len);
goto tr_error;
} else if (sc->csw.bCSWStatus == CSWSTATUS_FAILED) {
DPRINTF(sc, UDMASS_BBB, "Command failed, residue = "
"%d\n", residue);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_FAILED);
} else {
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_BBB_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, STATUS_CMD_OK);
}
return;
case USB_ST_SETUP:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
return;
default:
tr_error:
DPRINTF(sc, UDMASS_BBB, "Failed to read CSW: %s, try %d\n",
usbd_errstr(error), sc->sc_status_try);
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_status_try)) {
umass_tr_error(xfer, error);
} else {
sc->sc_status_try = 1;
umass_transfer_start(sc, UMASS_T_BBB_DATA_RD_CS);
}
return;
}
}
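/*
 * Illustrative sketch (not part of this driver): the sanity checks the
 * status callback above applies to a received 13-byte Command Status
 * Wrapper, collected into one hypothetical helper.  Constants follow
 * the Bulk-Only Transport spec; multi-byte fields are little-endian on
 * the wire (assumed equal to host order here for brevity).
 */
#include <stdint.h>

#define EX_CSWSIGNATURE		0x53425355u	/* "USBS" */
#define EX_CSWSTATUS_GOOD	0
#define EX_CSWSTATUS_FAILED	1
#define EX_CSWSTATUS_PHASE	2

struct bbb_csw_example {
	uint32_t dCSWSignature;
	uint32_t dCSWTag;
	uint32_t dCSWDataResidue;
	uint8_t  bCSWStatus;
} __attribute__((packed));

/* Returns 0 when the CSW is usable, -1 when reset recovery is needed. */
static int
bbb_csw_check_example(const struct bbb_csw_example *csw, uint32_t sent_tag)
{
	if (csw->dCSWSignature != EX_CSWSIGNATURE)
		return (-1);	/* lost synchronization with the device */
	if (csw->dCSWTag != sent_tag)
		return (-1);	/* reply does not belong to our CBW */
	if (csw->bCSWStatus > EX_CSWSTATUS_PHASE)
		return (-1);	/* reserved status value */
	if (csw->bCSWStatus == EX_CSWSTATUS_PHASE)
		return (-1);	/* phase error: device asks for a reset */
	return (0);		/* GOOD or FAILED: residue and status are valid */
}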
static void
umass_command_start(struct umass_softc *sc, uint8_t dir,
void *data_ptr, uint32_t data_len,
uint32_t data_timeout, umass_callback_t *callback,
union ccb *ccb)
{
sc->sc_transfer.lun = ccb->ccb_h.target_lun;
/*
* NOTE: assumes that "sc->sc_transfer.cmd_data" and
* "sc->sc_transfer.cmd_len" has been properly
* initialized.
*/
sc->sc_transfer.dir = data_len ? dir : DIR_NONE;
sc->sc_transfer.data_ptr = data_ptr;
sc->sc_transfer.data_len = data_len;
sc->sc_transfer.data_rem = data_len;
sc->sc_transfer.data_timeout = (data_timeout + UMASS_TIMEOUT);
sc->sc_transfer.actlen = 0;
sc->sc_transfer.callback = callback;
sc->sc_transfer.ccb = ccb;
if (sc->sc_xfer[sc->sc_last_xfer_index]) {
usbd_transfer_start(sc->sc_xfer[sc->sc_last_xfer_index]);
} else {
umass_cancel_ccb(sc);
}
}
static uint8_t
umass_bbb_get_max_lun(struct umass_softc *sc)
{
struct usb_device_request req;
usb_error_t err;
uint8_t buf = 0;
/* The Get Max Lun command is a class-specific request. */
req.bmRequestType = UT_READ_CLASS_INTERFACE;
req.bRequest = UR_BBB_GET_MAX_LUN;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, 1);
err = usbd_do_request(sc->sc_udev, NULL, &req, &buf);
if (err) {
buf = 0;
/* Device doesn't support Get Max Lun request. */
printf("%s: Get Max Lun not supported (%s)\n",
sc->sc_name, usbd_errstr(err));
}
return (buf);
}
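/*
 * Illustrative sketch (not part of this driver): the raw 8-byte SETUP
 * packet that the Get Max LUN request above puts on the wire.  The
 * struct and function names are hypothetical; the constants come from
 * the Bulk-Only Transport spec (bmRequestType 0xa1, bRequest 0xfe).
 */
#include <stdint.h>

struct usb_setup_pkt_example {
	uint8_t  bmRequestType;
	uint8_t  bRequest;
	uint16_t wValue;	/* little-endian on the wire */
	uint16_t wIndex;
	uint16_t wLength;
} __attribute__((packed));

static struct usb_setup_pkt_example
bbb_get_max_lun_setup_example(uint8_t iface_no)
{
	struct usb_setup_pkt_example req = {
		.bmRequestType = 0xa1,	/* device-to-host | class | interface */
		.bRequest      = 0xfe,	/* Get Max LUN */
		.wValue        = 0,
		.wIndex        = iface_no,
		.wLength       = 1,	/* device answers with one byte: highest LUN */
	};
	/* A STALL in response means only LUN 0, which the code above maps to 0. */
	return (req);
}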
/*
* Command/Bulk/Interrupt (CBI) specific functions
*/
static void
umass_cbi_start_status(struct umass_softc *sc)
{
if (sc->sc_xfer[UMASS_T_CBI_STATUS]) {
umass_transfer_start(sc, UMASS_T_CBI_STATUS);
} else {
union ccb *ccb = sc->sc_transfer.ccb;
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, (sc->sc_transfer.data_len -
sc->sc_transfer.actlen), STATUS_CMD_UNKNOWN);
}
}
static void
umass_t_cbi_reset1_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
struct usb_device_request req;
struct usb_page_cache *pc;
uint8_t buf[UMASS_CBI_DIAGNOSTIC_CMDLEN];
uint8_t i;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
case USB_ST_SETUP:
/*
* Command Block Reset Protocol
*
* First send a reset request to the device. Then clear
* any possibly stalled bulk endpoints.
*
* This is done in up to four steps, using one transfer each:
* UMASS_T_CBI_RESET1
* UMASS_T_CBI_RESET2
* UMASS_T_CBI_RESET3
* UMASS_T_CBI_RESET4 (only if there is an interrupt endpoint)
*/
DPRINTF(sc, UDMASS_CBI, "CBI reset!\n");
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
USETW(req.wLength, UMASS_CBI_DIAGNOSTIC_CMDLEN);
/*
* The 0x1d opcode is shared with the SEND DIAGNOSTIC command. To
* distinguish this Command Block Reset from a regular SEND
* DIAGNOSTIC, the last 10 bytes of the command block are filled
* with 0xff (section 2.2 of the CBI specification).
*/
buf[0] = 0x1d; /* Command Block Reset */
buf[1] = 0x04;
for (i = 2; i < UMASS_CBI_DIAGNOSTIC_CMDLEN; i++) {
buf[i] = 0xff;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, buf, sizeof(buf));
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sizeof(buf));
usbd_xfer_set_frames(xfer, 2);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if (error == USB_ERR_CANCELLED)
umass_tr_error(xfer, error);
else
umass_transfer_start(sc, UMASS_T_CBI_RESET2);
break;
}
}
static void
umass_t_cbi_reset2_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_RESET3,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_reset3_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
umass_t_cbi_data_clear_stall_callback
(xfer, (sc->sc_xfer[UMASS_T_CBI_RESET4] &&
sc->sc_xfer[UMASS_T_CBI_STATUS]) ?
UMASS_T_CBI_RESET4 : UMASS_T_CBI_COMMAND,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_reset4_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_COMMAND,
UMASS_T_CBI_STATUS, error);
}
static void
umass_t_cbi_data_clear_stall_callback(struct usb_xfer *xfer,
uint8_t next_xfer, uint8_t stall_xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
tr_transferred:
if (next_xfer == UMASS_T_CBI_STATUS) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start(sc, next_xfer);
}
break;
case USB_ST_SETUP:
if (usbd_clear_stall_callback(xfer, sc->sc_xfer[stall_xfer])) {
goto tr_transferred; /* should not happen */
}
break;
default: /* Error */
umass_tr_error(xfer, error);
break;
}
}
static void
umass_t_cbi_command_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_device_request req;
struct usb_page_cache *pc;
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (sc->sc_transfer.dir == DIR_NONE) {
umass_cbi_start_status(sc);
} else {
umass_transfer_start
(sc, (sc->sc_transfer.dir == DIR_IN) ?
UMASS_T_CBI_DATA_READ : UMASS_T_CBI_DATA_WRITE);
}
break;
case USB_ST_SETUP:
if (ccb) {
/*
* do a CBI transfer with cmd_len bytes from
* cmd_data, possibly a data phase of data_len
* bytes from/to the device and finally a status
* read phase.
*/
req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
req.bRequest = UR_CBI_ADSC;
USETW(req.wValue, 0);
req.wIndex[0] = sc->sc_iface_no;
req.wIndex[1] = 0;
req.wLength[0] = sc->sc_transfer.cmd_len;
req.wLength[1] = 0;
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_in(pc, 0, &req, sizeof(req));
pc = usbd_xfer_get_frame(xfer, 1);
usbd_copy_in(pc, 0, sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len);
usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
usbd_xfer_set_frame_len(xfer, 1, sc->sc_transfer.cmd_len);
usbd_xfer_set_frames(xfer,
sc->sc_transfer.cmd_len ? 2 : 1);
DIF(UDMASS_CBI,
umass_cbi_dump_cmd(sc,
sc->sc_transfer.cmd_data,
sc->sc_transfer.cmd_len));
usbd_transfer_submit(xfer);
}
break;
default: /* Error */
/*
* A STALL on the control pipe can be the result of a command error.
* Attempting to clear that STALL, as is done for the bulk pipes, also
* produces a command completion interrupt, but the ASC/ASCQ values
* reported there are not always valid, so don't bother with it.
*/
if ((error == USB_ERR_STALLED) ||
(sc->sc_transfer.callback == &umass_cam_cb)) {
sc->sc_transfer.ccb = NULL;
(sc->sc_transfer.callback)
(sc, ccb, sc->sc_transfer.data_len,
STATUS_CMD_UNKNOWN);
} else {
umass_tr_error(xfer, error);
/* skip reset */
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
}
break;
}
}
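/*
 * Illustrative sketch (not part of this driver): the control request
 * that carries a CBI command block, as built in the SETUP case above.
 * ADSC ("Accept Device-Specific Command") is class request 0x00 sent
 * to the interface; the command block itself travels in the data
 * stage, so wLength equals the command length.  Names are
 * hypothetical.
 */
#include <stdint.h>

struct cbi_setup_pkt_example {
	uint8_t  bmRequestType;
	uint8_t  bRequest;
	uint16_t wValue;
	uint16_t wIndex;
	uint16_t wLength;
} __attribute__((packed));

static struct cbi_setup_pkt_example
cbi_adsc_setup_example(uint8_t iface_no, uint8_t cmd_len)
{
	struct cbi_setup_pkt_example req = {
		.bmRequestType = 0x21,	/* host-to-device | class | interface */
		.bRequest      = 0x00,	/* ADSC */
		.wValue        = 0,
		.wIndex        = iface_no,
		.wLength       = cmd_len,	/* command block follows in the data stage */
	};
	return (req);
}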
static void
umass_t_cbi_data_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_RD_CS);
}
break;
}
}
static void
umass_t_cbi_data_rd_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_READ, error);
}
static void
umass_t_cbi_data_write_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
uint32_t max_bulk = usbd_xfer_max_len(xfer);
int actlen, sumlen;
usbd_xfer_status(xfer, &actlen, &sumlen, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
sc->sc_transfer.data_rem -= actlen;
sc->sc_transfer.data_ptr += actlen;
sc->sc_transfer.actlen += actlen;
if (actlen < sumlen) {
/* short transfer */
sc->sc_transfer.data_rem = 0;
}
case USB_ST_SETUP:
DPRINTF(sc, UDMASS_CBI, "max_bulk=%d, data_rem=%d\n",
max_bulk, sc->sc_transfer.data_rem);
if (sc->sc_transfer.data_rem == 0) {
umass_cbi_start_status(sc);
break;
}
if (max_bulk > sc->sc_transfer.data_rem) {
max_bulk = sc->sc_transfer.data_rem;
}
usbd_xfer_set_timeout(xfer, sc->sc_transfer.data_timeout);
usbd_xfer_set_frame_data(xfer, 0, sc->sc_transfer.data_ptr,
max_bulk);
usbd_transfer_submit(xfer);
break;
default: /* Error */
if ((error == USB_ERR_CANCELLED) ||
(sc->sc_transfer.callback != &umass_cam_cb)) {
umass_tr_error(xfer, error);
} else {
umass_transfer_start(sc, UMASS_T_CBI_DATA_WR_CS);
}
break;
}
}
static void
umass_t_cbi_data_wr_cs_callback(struct usb_xfer *xfer, usb_error_t error)
{
umass_t_cbi_data_clear_stall_callback(xfer, UMASS_T_CBI_STATUS,
UMASS_T_CBI_DATA_WRITE, error);
}
static void
umass_t_cbi_status_callback(struct usb_xfer *xfer, usb_error_t error)
{
struct umass_softc *sc = usbd_xfer_softc(xfer);
union ccb *ccb = sc->sc_transfer.ccb;
struct usb_page_cache *pc;
uint32_t residue;
uint8_t status;
int actlen;
usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
switch (USB_GET_STATE(xfer)) {
case USB_ST_TRANSFERRED:
if (actlen < (int)sizeof(sc->sbl)) {
goto tr_setup;
}
pc = usbd_xfer_get_frame(xfer, 0);
usbd_copy_out(pc, 0, &sc->sbl, sizeof(sc->sbl));
residue = (sc->sc_transfer.data_len -
sc->sc_transfer.actlen);
/* dissect the information in the buffer */
if (sc->sc_proto & UMASS_PROTO_UFI) {
/*
* Section 3.4.3.1.3 specifies that the UFI command
* protocol returns an ASC and ASCQ in the interrupt
* data block.
*/
DPRINTF(sc, UDMASS_CBI, "UFI CCI, ASC = 0x%02x, "
"ASCQ = 0x%02x\n", sc->sbl.ufi.asc,
sc->sbl.ufi.ascq);
status = (((sc->sbl.ufi.asc == 0) &&
(sc->sbl.ufi.ascq == 0)) ?
STATUS_CMD_OK : STATUS_CMD_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
} else {
/* Command Interrupt Data Block */
DPRINTF(sc, UDMASS_CBI, "type=0x%02x, value=0x%02x\n",
sc->sbl.common.type, sc->sbl.common.value);
if (sc->sbl.common.type == IDB_TYPE_CCI) {
status = (sc->sbl.common.value & IDB_VALUE_STATUS_MASK);
status = ((status == IDB_VALUE_PASS) ? STATUS_CMD_OK :
(status == IDB_VALUE_FAIL) ? STATUS_CMD_FAILED :
(status == IDB_VALUE_PERSISTENT) ? STATUS_CMD_FAILED :
STATUS_WIRE_FAILED);
sc->sc_transfer.ccb = NULL;
sc->sc_last_xfer_index = UMASS_T_CBI_COMMAND;
(sc->sc_transfer.callback)
(sc, ccb, residue, status);
break;
}
}
/* fallthrough */
case USB_ST_SETUP:
tr_setup:
usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
usbd_transfer_submit(xfer);
break;
default: /* Error */
DPRINTF(sc, UDMASS_CBI, "Failed to read CSW: %s\n",
usbd_errstr(error));
umass_tr_error(xfer, error);
break;
}
}
/*
* CAM specific functions (used by SCSI, UFI, 8070i (ATAPI))
*/
static int
umass_cam_attach_sim(struct umass_softc *sc)
{
struct cam_devq *devq; /* Per device Queue */
/*
* An HBA is attached to the CAM layer.
*
* The CAM layer will then, after a while, start probing for devices
* on the bus. The number of SIMs is limited to one.
*/
devq = cam_simq_alloc(1 /* maximum openings */ );
if (devq == NULL) {
return (ENOMEM);
}
sc->sc_sim = cam_sim_alloc
(&umass_cam_action, &umass_cam_poll,
DEVNAME_SIM,
sc /* priv */ ,
sc->sc_unit /* unit number */ ,
&sc->sc_mtx /* mutex */ ,
1 /* maximum device openings */ ,
0 /* maximum tagged device openings */ ,
devq);
if (sc->sc_sim == NULL) {
cam_simq_free(devq);
return (ENOMEM);
}
mtx_lock(&sc->sc_mtx);
if (xpt_bus_register(sc->sc_sim, sc->sc_dev,
sc->sc_unit) != CAM_SUCCESS) {
mtx_unlock(&sc->sc_mtx);
return (ENOMEM);
}
mtx_unlock(&sc->sc_mtx);
return (0);
}
static void
umass_cam_attach(struct umass_softc *sc)
{
#ifndef USB_DEBUG
if (bootverbose)
#endif
printf("%s:%d:%d: Attached to scbus%d\n",
sc->sc_name, cam_sim_path(sc->sc_sim),
sc->sc_unit, cam_sim_path(sc->sc_sim));
}
/* umass_cam_detach_sim
* detach from the CAM layer
*/
static void
umass_cam_detach_sim(struct umass_softc *sc)
{
if (sc->sc_sim != NULL) {
if (xpt_bus_deregister(cam_sim_path(sc->sc_sim))) {
/* accessing the softc is not possible after this */
sc->sc_sim->softc = NULL;
cam_sim_free(sc->sc_sim, /* free_devq */ TRUE);
} else {
panic("%s: CAM layer is busy\n",
sc->sc_name);
}
sc->sc_sim = NULL;
}
}
/* umass_cam_action
* CAM requests for action come through here
*/
static void
umass_cam_action(struct cam_sim *sim, union ccb *ccb)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL) {
ccb->ccb_h.status = CAM_SEL_TIMEOUT;
xpt_done(ccb);
return;
}
/* Perform the requested action */
switch (ccb->ccb_h.func_code) {
case XPT_SCSI_IO:
{
uint8_t *cmd;
uint8_t dir;
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"cmd: 0x%02x, flags: 0x%02x, "
"%db cmd/%db data/%db sense\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, cmd[0],
ccb->ccb_h.flags & CAM_DIR_MASK, ccb->csio.cdb_len,
ccb->csio.dxfer_len, ccb->csio.sense_len);
if (sc->sc_transfer.ccb) {
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SCSI_IO: "
"I/O in progress, deferring\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_SCSI_BUSY;
xpt_done(ccb);
goto done;
}
switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
case CAM_DIR_IN:
dir = DIR_IN;
break;
case CAM_DIR_OUT:
dir = DIR_OUT;
DIF(UDMASS_SCSI,
umass_dump_buffer(sc, ccb->csio.data_ptr,
ccb->csio.dxfer_len, 48));
break;
default:
dir = DIR_NONE;
}
ccb->ccb_h.status = CAM_REQ_INPROG | CAM_SIM_QUEUED;
/*
* sc->sc_transform will convert the command to the
* command format needed by the specific command set
* and return the converted command in
* "sc->sc_transfer.cmd_data"
*/
if (umass_std_transform(sc, ccb, cmd, ccb->csio.cdb_len)) {
if (sc->sc_transfer.cmd_data[0] == INQUIRY) {
const char *pserial;
pserial = usb_get_serial(sc->sc_udev);
/*
* Umass devices don't generally report their serial numbers
* in the usual SCSI way. Emulate it here.
*/
if ((sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
(sc->sc_transfer.cmd_data[2] == SVPD_UNIT_SERIAL_NUMBER) &&
(pserial[0] != '\0')) {
struct scsi_vpd_unit_serial_number *vpd_serial;
vpd_serial = (struct scsi_vpd_unit_serial_number *)ccb->csio.data_ptr;
vpd_serial->length = strlen(pserial);
if (vpd_serial->length > sizeof(vpd_serial->serial_num))
vpd_serial->length = sizeof(vpd_serial->serial_num);
memcpy(vpd_serial->serial_num, pserial, vpd_serial->length);
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
/*
* Handle EVPD inquiry for broken devices first
* NO_INQUIRY also implies NO_INQUIRY_EVPD
*/
if ((sc->sc_quirks & (NO_INQUIRY_EVPD | NO_INQUIRY)) &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD)) {
scsi_set_sense_data(&ccb->csio.sense_data,
/*sense_format*/ SSD_TYPE_NONE,
/*current_error*/ 1,
/*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
/*asc*/ 0x24,
/*ascq*/ 0x00,
/*extra args*/ SSD_ELEM_NONE);
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
ccb->ccb_h.status =
CAM_SCSI_STATUS_ERROR |
CAM_AUTOSNS_VALID |
CAM_DEV_QFRZN;
xpt_freeze_devq(ccb->ccb_h.path, 1);
xpt_done(ccb);
goto done;
}
/*
* Return fake inquiry data for
* broken devices
*/
if (sc->sc_quirks & NO_INQUIRY) {
memcpy(ccb->csio.data_ptr, &fake_inq_data,
sizeof(fake_inq_data));
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
ccb->csio.dxfer_len = SHORT_INQUIRY_LENGTH;
}
} else if (sc->sc_transfer.cmd_data[0] == PREVENT_ALLOW) {
if (sc->sc_quirks & NO_PREVENT_ALLOW) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
} else if (sc->sc_transfer.cmd_data[0] == SYNCHRONIZE_CACHE) {
if (sc->sc_quirks & NO_SYNCHRONIZE_CACHE) {
ccb->csio.scsi_status = SCSI_STATUS_OK;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
goto done;
}
}
umass_command_start(sc, dir, ccb->csio.data_ptr,
ccb->csio.dxfer_len,
ccb->ccb_h.timeout,
&umass_cam_cb, ccb);
}
break;
}
case XPT_PATH_INQ:
{
struct ccb_pathinq *cpi = &ccb->cpi;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_PATH_INQ:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
/* host specific information */
cpi->version_num = 1;
cpi->hba_inquiry = 0;
cpi->target_sprt = 0;
cpi->hba_misc = PIM_NO_6_BYTE;
cpi->hba_eng_cnt = 0;
cpi->max_target = UMASS_SCSIID_MAX; /* one target */
cpi->initiator_id = UMASS_SCSIID_HOST;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "USB SCSI", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = sc->sc_unit;
cpi->protocol = PROTO_SCSI;
cpi->protocol_version = SCSI_REV_2;
cpi->transport = XPORT_USB;
cpi->transport_version = 0;
if (sc == NULL) {
cpi->base_transfer_speed = 0;
cpi->max_lun = 0;
} else {
if (sc->sc_quirks & FLOPPY_SPEED) {
cpi->base_transfer_speed =
UMASS_FLOPPY_TRANSFER_SPEED;
} else {
switch (usbd_get_speed(sc->sc_udev)) {
case USB_SPEED_SUPER:
cpi->base_transfer_speed =
UMASS_SUPER_TRANSFER_SPEED;
cpi->maxio = MAXPHYS;
break;
case USB_SPEED_HIGH:
cpi->base_transfer_speed =
UMASS_HIGH_TRANSFER_SPEED;
break;
default:
cpi->base_transfer_speed =
UMASS_FULL_TRANSFER_SPEED;
break;
}
}
cpi->max_lun = sc->sc_maxlun;
}
cpi->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_RESET_DEV:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_RESET_DEV:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
umass_reset(sc);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_GET_TRAN_SETTINGS:
{
struct ccb_trans_settings *cts = &ccb->cts;
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_GET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
cts->protocol = PROTO_SCSI;
cts->protocol_version = SCSI_REV_2;
cts->transport = XPORT_USB;
cts->transport_version = 0;
cts->xport_specific.valid = 0;
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
case XPT_SET_TRAN_SETTINGS:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_SET_TRAN_SETTINGS:.\n",
cam_sim_path(sc->sc_sim), ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
case XPT_CALC_GEOMETRY:
{
cam_calc_geometry(&ccb->ccg, /* extended */ 1);
xpt_done(ccb);
break;
}
case XPT_NOOP:
{
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:XPT_NOOP:.\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun);
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "%d:%d:%jx:func_code 0x%04x: "
"Not implemented\n",
sc ? cam_sim_path(sc->sc_sim) : -1, ccb->ccb_h.target_id,
(uintmax_t)ccb->ccb_h.target_lun, ccb->ccb_h.func_code);
ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
xpt_done(ccb);
break;
}
done:
return;
}
static void
umass_cam_poll(struct cam_sim *sim)
{
struct umass_softc *sc = (struct umass_softc *)sim->softc;
if (sc == NULL)
return;
DPRINTF(sc, UDMASS_SCSI, "CAM poll\n");
usbd_transfer_poll(sc->sc_xfer, UMASS_T_MAX);
}
/* umass_cam_cb
* finalise a completed CAM command
*/
static void
umass_cam_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
ccb->csio.resid = residue;
switch (status) {
case STATUS_CMD_OK:
ccb->ccb_h.status = CAM_REQ_CMP;
if ((sc->sc_quirks & READ_CAPACITY_OFFBY1) &&
(ccb->ccb_h.func_code == XPT_SCSI_IO) &&
(ccb->csio.cdb_io.cdb_bytes[0] == READ_CAPACITY)) {
struct scsi_read_capacity_data *rcap;
uint32_t maxsector;
rcap = (void *)(ccb->csio.data_ptr);
maxsector = scsi_4btoul(rcap->addr) - 1;
scsi_ulto4b(maxsector, rcap->addr);
}
/*
* We have to add SVPD_UNIT_SERIAL_NUMBER to the list
* of pages supported by the device - otherwise, CAM
* will never ask us for the serial number if the
* device cannot handle that by itself.
*/
if (ccb->ccb_h.func_code == XPT_SCSI_IO &&
sc->sc_transfer.cmd_data[0] == INQUIRY &&
(sc->sc_transfer.cmd_data[1] & SI_EVPD) &&
sc->sc_transfer.cmd_data[2] == SVPD_SUPPORTED_PAGE_LIST &&
(usb_get_serial(sc->sc_udev)[0] != '\0')) {
struct ccb_scsiio *csio;
struct scsi_vpd_supported_page_list *page_list;
csio = &ccb->csio;
page_list = (struct scsi_vpd_supported_page_list *)csio->data_ptr;
if (page_list->length + 1 < SVPD_SUPPORTED_PAGES_SIZE) {
page_list->list[page_list->length] = SVPD_UNIT_SERIAL_NUMBER;
page_list->length++;
}
}
xpt_done(ccb);
break;
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED:
/* fetch sense data */
/* the rest of the command was filled in at attach */
sc->cam_scsi_sense.length = ccb->csio.sense_len;
DPRINTF(sc, UDMASS_SCSI, "Fetching %d bytes of "
"sense data\n", ccb->csio.sense_len);
if (umass_std_transform(sc, ccb, &sc->cam_scsi_sense.opcode,
sizeof(sc->cam_scsi_sense))) {
if ((sc->sc_quirks & FORCE_SHORT_INQUIRY) &&
(sc->sc_transfer.cmd_data[0] == INQUIRY)) {
ccb->csio.sense_len = SHORT_INQUIRY_LENGTH;
}
umass_command_start(sc, DIR_IN, &ccb->csio.sense_data.error_code,
ccb->csio.sense_len, ccb->ccb_h.timeout,
&umass_cam_sense_cb, ccb);
}
break;
default:
/*
* The wire protocol failed and will hopefully have
* recovered. We return an error to CAM and let CAM
* retry the command if necessary.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN;
xpt_done(ccb);
break;
}
}
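/*
 * Illustrative sketch (not part of this driver): the READ_CAPACITY_OFFBY1
 * quirk handled above.  READ CAPACITY(10) is defined to return the
 * address of the last logical block (big-endian), but some devices
 * report the block count instead, so the driver subtracts one.
 * Helper names are hypothetical.
 */
#include <stdint.h>

static uint32_t
be32_get_example(const uint8_t b[4])
{
	return ((uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
	    (uint32_t)b[2] << 8 | (uint32_t)b[3]);
}

static void
be32_put_example(uint8_t b[4], uint32_t v)
{
	b[0] = v >> 24; b[1] = v >> 16; b[2] = v >> 8; b[3] = v;
}

static void
fix_offby1_capacity_example(uint8_t rcap_data[8])
{
	/* bytes 0..3: last LBA, bytes 4..7: block length (left untouched) */
	be32_put_example(rcap_data, be32_get_example(rcap_data) - 1);
}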
/*
* Finalise a completed autosense operation
*/
static void
umass_cam_sense_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
uint8_t *cmd;
switch (status) {
case STATUS_CMD_OK:
case STATUS_CMD_UNKNOWN:
case STATUS_CMD_FAILED: {
int key, sense_len;
ccb->csio.sense_resid = residue;
sense_len = ccb->csio.sense_len - ccb->csio.sense_resid;
key = scsi_get_sense_key(&ccb->csio.sense_data, sense_len,
/*show_errors*/ 1);
if (ccb->csio.ccb_h.flags & CAM_CDB_POINTER) {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_ptr);
} else {
cmd = (uint8_t *)(ccb->csio.cdb_io.cdb_bytes);
}
/*
* Getting sense data always succeeds (apart from wire
* failures):
*/
if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == INQUIRY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Ignore unit attention errors in the case where
* the Unit Attention state is not cleared on
* REQUEST SENSE. They will appear again at the next
* command.
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if (key == SSD_KEY_NO_SENSE) {
/*
* No problem after all (in the case of CBI without
* CCI)
*/
ccb->ccb_h.status = CAM_REQ_CMP;
} else if ((sc->sc_quirks & RS_NO_CLEAR_UA) &&
(cmd[0] == READ_CAPACITY) &&
(key == SSD_KEY_UNIT_ATTENTION)) {
/*
* Some devices do not clear the unit attention error
* on request sense. We insert a test unit ready
* command to make sure we clear the unit attention
* condition, then allow the retry to proceed as
* usual.
*/
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
#if 0
DELAY(300000);
#endif
DPRINTF(sc, UDMASS_SCSI, "Doing a sneaky "
"TEST_UNIT_READY\n");
/* the rest of the command was filled in at attach */
if ((sc->sc_transform)(sc,
&sc->cam_scsi_test_unit_ready.opcode,
sizeof(sc->cam_scsi_test_unit_ready)) == 1) {
umass_command_start(sc, DIR_NONE, NULL, 0,
ccb->ccb_h.timeout,
&umass_cam_quirk_cb, ccb);
break;
}
} else {
xpt_freeze_devq(ccb->ccb_h.path, 1);
if (key >= 0) {
ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR
| CAM_AUTOSNS_VALID | CAM_DEV_QFRZN;
ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
} else
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL
| CAM_DEV_QFRZN;
}
xpt_done(ccb);
break;
}
default:
DPRINTF(sc, UDMASS_SCSI, "Autosense failed, "
"status %d\n", status);
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_AUTOSENSE_FAIL | CAM_DEV_QFRZN;
xpt_done(ccb);
}
}
/*
* This completion code just handles the fact that we sent a test-unit-ready
* after having previously failed a READ CAPACITY with CHECK_COND. The CCB
* status for CAM is already set earlier.
*/
static void
umass_cam_quirk_cb(struct umass_softc *sc, union ccb *ccb, uint32_t residue,
uint8_t status)
{
DPRINTF(sc, UDMASS_SCSI, "Test unit ready "
"returned status %d\n", status);
xpt_done(ccb);
}
/*
* SCSI specific functions
*/
static uint8_t
umass_scsi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
sc->sc_transfer.cmd_len = cmd_len;
switch (cmd_ptr[0]) {
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
memset(sc->sc_transfer.cmd_data, 0, cmd_len);
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1);
}
static uint8_t
umass_rbc_transform(struct umass_softc *sc, uint8_t *cmd_ptr, uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
switch (cmd_ptr[0]) {
/* these commands are defined in RBC: */
case READ_10:
case READ_CAPACITY:
case START_STOP_UNIT:
case SYNCHRONIZE_CACHE:
case WRITE_10:
case VERIFY_10:
case INQUIRY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case TEST_UNIT_READY:
case WRITE_BUFFER:
/*
* The following commands are not listed in my copy of the
* RBC specs. CAM, however, seems to want them, and at least
* the Sony DSC device appears to support them as well.
*/
case REQUEST_SENSE:
case PREVENT_ALLOW:
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
if ((sc->sc_quirks & RBC_PAD_TO_12) && (cmd_len < 12)) {
memset(sc->sc_transfer.cmd_data + cmd_len,
0, 12 - cmd_len);
cmd_len = 12;
}
sc->sc_transfer.cmd_len = cmd_len;
return (1); /* success */
/* All other commands are not legal in RBC */
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported RBC "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
}
static uint8_t
umass_ufi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* A UFI command is always 12 bytes in length */
sc->sc_transfer.cmd_len = UFI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, UFI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands whose format has been verified. They should work.
* Copy the command into the (zeroed-out) destination buffer.
*/
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
/*
* Some devices do not support this command. Start
* Stop Unit should give the same results.
*/
DPRINTF(sc, UDMASS_UFI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case FORMAT_UNIT:
case INQUIRY:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case WRITE_AND_VERIFY:
case VERIFY:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_12:
case WRITE_12:
case READ_FORMAT_CAPACITIES:
break;
/*
* SYNCHRONIZE_CACHE isn't supported by UFI, nor should it be
* required for UFI devices, so it is appropriate to fake
* success.
*/
case SYNCHRONIZE_CACHE:
return (2);
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported UFI "
"command 0x%02x\n", cmd_ptr[0]);
return (0); /* failure */
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
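/*
 * Illustrative sketch (not part of this driver): how a short SCSI CDB
 * is widened to the fixed 12-byte UFI command block, which the
 * transform above does with memset() and memcpy().  Names are
 * hypothetical.
 */
#include <stdint.h>
#include <string.h>

#define EX_UFI_CMDLEN	12

static int
ufi_pad_cdb_example(uint8_t dst[EX_UFI_CMDLEN], const uint8_t *cdb,
    uint8_t cdb_len)
{
	if (cdb_len == 0 || cdb_len > EX_UFI_CMDLEN)
		return (-1);			/* rejected, like the driver does */
	memset(dst, 0, EX_UFI_CMDLEN);		/* trailing bytes must be zero */
	memcpy(dst, cdb, cdb_len);		/* e.g. a 6-byte TEST UNIT READY */
	return (0);
}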
/*
* 8070i (ATAPI) specific functions
*/
static uint8_t
umass_atapi_transform(struct umass_softc *sc, uint8_t *cmd_ptr,
uint8_t cmd_len)
{
if ((cmd_len == 0) ||
(cmd_len > sizeof(sc->sc_transfer.cmd_data))) {
DPRINTF(sc, UDMASS_SCSI, "Invalid command "
"length: %d bytes\n", cmd_len);
return (0); /* failure */
}
/* An ATAPI command is always 12 bytes in length. */
sc->sc_transfer.cmd_len = ATAPI_COMMAND_LENGTH;
/* Zero the command data */
memset(sc->sc_transfer.cmd_data, 0, ATAPI_COMMAND_LENGTH);
switch (cmd_ptr[0]) {
/*
* Commands whose format has been verified. They should work.
* Copy the command into the destination buffer.
*/
case INQUIRY:
/*
* some drives wedge when asked for full inquiry
* information.
*/
if (sc->sc_quirks & FORCE_SHORT_INQUIRY) {
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
sc->sc_transfer.cmd_data[4] = SHORT_INQUIRY_LENGTH;
return (1);
}
break;
case TEST_UNIT_READY:
if (sc->sc_quirks & NO_TEST_UNIT_READY) {
DPRINTF(sc, UDMASS_SCSI, "Converted TEST_UNIT_READY "
"to START_UNIT\n");
sc->sc_transfer.cmd_data[0] = START_STOP_UNIT;
sc->sc_transfer.cmd_data[4] = SSS_START;
return (1);
}
break;
case REZERO_UNIT:
case REQUEST_SENSE:
case START_STOP_UNIT:
case SEND_DIAGNOSTIC:
case PREVENT_ALLOW:
case READ_CAPACITY:
case READ_10:
case WRITE_10:
case POSITION_TO_ELEMENT: /* SEEK_10 */
case SYNCHRONIZE_CACHE:
case MODE_SELECT_10:
case MODE_SENSE_10:
case READ_BUFFER:
case 0x42: /* READ_SUBCHANNEL */
case 0x43: /* READ_TOC */
case 0x44: /* READ_HEADER */
case 0x47: /* PLAY_MSF (Play Minute/Second/Frame) */
case 0x48: /* PLAY_TRACK */
case 0x49: /* PLAY_TRACK_REL */
case 0x4b: /* PAUSE */
case 0x51: /* READ_DISK_INFO */
case 0x52: /* READ_TRACK_INFO */
case 0x54: /* SEND_OPC */
case 0x59: /* READ_MASTER_CUE */
case 0x5b: /* CLOSE_TR_SESSION */
case 0x5c: /* READ_BUFFER_CAP */
case 0x5d: /* SEND_CUE_SHEET */
case 0xa1: /* BLANK */
case 0xa5: /* PLAY_12 */
case 0xa6: /* EXCHANGE_MEDIUM */
case 0xad: /* READ_DVD_STRUCTURE */
case 0xbb: /* SET_CD_SPEED */
case 0xe5: /* READ_TRACK_INFO_PHILIPS */
break;
case READ_12:
case WRITE_12:
default:
DPRINTF(sc, UDMASS_SCSI, "Unsupported ATAPI "
"command 0x%02x - trying anyway\n",
cmd_ptr[0]);
break;
}
memcpy(sc->sc_transfer.cmd_data, cmd_ptr, cmd_len);
return (1); /* success */
}
static uint8_t
umass_no_transform(struct umass_softc *sc, uint8_t *cmd,
uint8_t cmdlen)
{
return (0); /* failure */
}
static uint8_t
umass_std_transform(struct umass_softc *sc, union ccb *ccb,
uint8_t *cmd, uint8_t cmdlen)
{
uint8_t retval;
retval = (sc->sc_transform) (sc, cmd, cmdlen);
if (retval == 2) {
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done(ccb);
return (0);
} else if (retval == 0) {
xpt_freeze_devq(ccb->ccb_h.path, 1);
ccb->ccb_h.status = CAM_REQ_INVALID | CAM_DEV_QFRZN;
xpt_done(ccb);
return (0);
}
/* Command should be executed */
return (1);
}
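/*
 * Illustrative sketch (not part of this driver): the three-way return
 * convention used by the transform functions above, written out as a
 * hypothetical enum together with the dispatch that
 * umass_std_transform() performs on it.
 */
enum xform_result_example {
	XFORM_FAIL = 0,		/* reject the CCB (CAM_REQ_INVALID) */
	XFORM_EXECUTE = 1,	/* command was converted; start the transfer */
	XFORM_FAKE_OK = 2	/* complete the CCB without touching the device */
};

/* Returns nonzero when the caller should actually start the transfer. */
static int
xform_dispatch_example(enum xform_result_example r)
{
	switch (r) {
	case XFORM_FAKE_OK:	/* e.g. SYNCHRONIZE_CACHE on UFI */
	case XFORM_FAIL:	/* CCB is completed with an error instead */
		return (0);
	case XFORM_EXECUTE:
	default:
		return (1);
	}
}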
#ifdef USB_DEBUG
static void
umass_bbb_dump_cbw(struct umass_softc *sc, umass_bbb_cbw_t *cbw)
{
uint8_t *c = cbw->CBWCDB;
uint32_t dlen = UGETDW(cbw->dCBWDataTransferLength);
uint32_t tag = UGETDW(cbw->dCBWTag);
uint8_t clen = cbw->bCDBLength;
uint8_t flags = cbw->bCBWFlags;
uint8_t lun = cbw->bCBWLUN;
DPRINTF(sc, UDMASS_BBB, "CBW %d: cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, lun = %d, dir = %s\n",
tag, clen,
c[0], c[1], c[2], c[3], c[4], c[5], (clen > 6 ? "..." : ""),
dlen, lun, (flags == CBWFLAGS_IN ? "in" :
(flags == CBWFLAGS_OUT ? "out" : "<invalid>")));
}
static void
umass_bbb_dump_csw(struct umass_softc *sc, umass_bbb_csw_t *csw)
{
uint32_t sig = UGETDW(csw->dCSWSignature);
uint32_t tag = UGETDW(csw->dCSWTag);
uint32_t res = UGETDW(csw->dCSWDataResidue);
uint8_t status = csw->bCSWStatus;
DPRINTF(sc, UDMASS_BBB, "CSW %d: sig = 0x%08x (%s), tag = 0x%08x, "
"res = %d, status = 0x%02x (%s)\n",
tag, sig, (sig == CSWSIGNATURE ? "valid" : "invalid"),
tag, res,
status, (status == CSWSTATUS_GOOD ? "good" :
(status == CSWSTATUS_FAILED ? "failed" :
(status == CSWSTATUS_PHASE ? "phase" : "<invalid>"))));
}
static void
umass_cbi_dump_cmd(struct umass_softc *sc, void *cmd, uint8_t cmdlen)
{
uint8_t *c = cmd;
uint8_t dir = sc->sc_transfer.dir;
DPRINTF(sc, UDMASS_BBB, "cmd = %db "
"(0x%02x%02x%02x%02x%02x%02x%s), "
"data = %db, dir = %s\n",
cmdlen,
c[0], c[1], c[2], c[3], c[4], c[5], (cmdlen > 6 ? "..." : ""),
sc->sc_transfer.data_len,
(dir == DIR_IN ? "in" :
(dir == DIR_OUT ? "out" :
(dir == DIR_NONE ? "no data phase" : "<invalid>"))));
}
static void
umass_dump_buffer(struct umass_softc *sc, uint8_t *buffer, uint32_t buflen,
uint32_t printlen)
{
uint32_t i, j;
char s1[40];
char s2[40];
char s3[5];
s1[0] = '\0';
s3[0] = '\0';
sprintf(s2, " buffer=%p, buflen=%d", buffer, buflen);
for (i = 0; (i < buflen) && (i < printlen); i++) {
j = i % 16;
if (j == 0 && i != 0) {
DPRINTF(sc, UDMASS_GEN, "0x %s%s\n",
s1, s2);
s2[0] = '\0';
}
sprintf(&s1[j * 2], "%02x", buffer[i] & 0xff);
}
if (buflen > printlen)
sprintf(s3, " ...");
DPRINTF(sc, UDMASS_GEN, "0x %s%s%s\n",
s1, s2, s3);
}
#endif
Index: head/sys/dev/usb/usb_dev.c
===================================================================
--- head/sys/dev/usb/usb_dev.c (revision 327172)
+++ head/sys/dev/usb/usb_dev.c (revision 327173)
@@ -1,2482 +1,2470 @@
/* $FreeBSD$ */
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006-2008 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* usb_dev.c - An abstraction layer for creating devices under /dev/...
*/
#ifdef USB_GLOBAL_INCLUDE_FILE
#include USB_GLOBAL_INCLUDE_FILE
#else
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_ioctl.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#define USB_DEBUG_VAR usb_fifo_debug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_dev.h>
#include <dev/usb/usb_mbuf.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_generic.h>
#include <dev/usb/usb_dynamic.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/syscallsubr.h>
#include <machine/stdarg.h>
#endif /* USB_GLOBAL_INCLUDE_FILE */
#if USB_HAVE_UGEN
#ifdef USB_DEBUG
static int usb_fifo_debug = 0;
static SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW, 0, "USB device");
SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RWTUN,
&usb_fifo_debug, 0, "Debug Level");
#endif
#if ((__FreeBSD_version >= 700001) || (__FreeBSD_version == 0) || \
((__FreeBSD_version >= 600034) && (__FreeBSD_version < 700000)))
#define USB_UCRED struct ucred *ucred,
#else
#define USB_UCRED
#endif
/* prototypes */
static int usb_fifo_open(struct usb_cdev_privdata *,
struct usb_fifo *, int);
static void usb_fifo_close(struct usb_fifo *, int);
static void usb_dev_init(void *);
static void usb_dev_init_post(void *);
static void usb_dev_uninit(void *);
static int usb_fifo_uiomove(struct usb_fifo *, void *, int,
struct uio *);
static void usb_fifo_check_methods(struct usb_fifo_methods *);
static struct usb_fifo *usb_fifo_alloc(struct mtx *);
static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t,
uint8_t);
static void usb_loc_fill(struct usb_fs_privdata *,
struct usb_cdev_privdata *);
static void usb_close(void *);
static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int);
static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static d_open_t usb_open;
static d_ioctl_t usb_ioctl;
static d_read_t usb_read;
static d_write_t usb_write;
static d_poll_t usb_poll;
static d_kqfilter_t usb_kqfilter;
static d_ioctl_t usb_static_ioctl;
static usb_fifo_open_t usb_fifo_dummy_open;
static usb_fifo_close_t usb_fifo_dummy_close;
static usb_fifo_ioctl_t usb_fifo_dummy_ioctl;
static usb_fifo_cmd_t usb_fifo_dummy_cmd;
/* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */
struct cdevsw usb_devsw = {
.d_version = D_VERSION,
.d_open = usb_open,
.d_ioctl = usb_ioctl,
.d_name = "usbdev",
.d_flags = D_TRACKCLOSE,
.d_read = usb_read,
.d_write = usb_write,
.d_poll = usb_poll,
.d_kqfilter = usb_kqfilter,
};
static struct cdev* usb_dev = NULL;
/* character device structure used for /dev/usb */
static struct cdevsw usb_static_devsw = {
.d_version = D_VERSION,
.d_ioctl = usb_static_ioctl,
.d_name = "usb"
};
static TAILQ_HEAD(, usb_symlink) usb_sym_head;
static struct sx usb_sym_lock;
struct mtx usb_ref_lock;
/*------------------------------------------------------------------------*
* usb_loc_fill
*
* This is used to fill out a usb_cdev_privdata structure based on the
* device's address as contained in usb_fs_privdata.
*------------------------------------------------------------------------*/
static void
usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd)
{
cpd->bus_index = pd->bus_index;
cpd->dev_index = pd->dev_index;
cpd->ep_addr = pd->ep_addr;
cpd->fifo_index = pd->fifo_index;
}
/*------------------------------------------------------------------------*
* usb_ref_device
*
* This function is used to atomically take a reference on a USB device
* by its device location. If this function returns success, the USB
* device will not disappear until it is unreferenced.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd, int need_uref)
{
struct usb_fifo **ppf;
struct usb_fifo *f;
DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref);
/* clear all refs */
memset(crd, 0, sizeof(*crd));
mtx_lock(&usb_ref_lock);
cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index);
if (cpd->bus == NULL) {
DPRINTFN(2, "no bus at %u\n", cpd->bus_index);
goto error;
}
cpd->udev = cpd->bus->devices[cpd->dev_index];
if (cpd->udev == NULL) {
DPRINTFN(2, "no device at %u\n", cpd->dev_index);
goto error;
}
if (cpd->udev->state == USB_STATE_DETACHED &&
(need_uref != 2)) {
DPRINTFN(2, "device is detached\n");
goto error;
}
if (need_uref) {
DPRINTFN(2, "ref udev - needed\n");
if (cpd->udev->refcount == USB_DEV_REF_MAX) {
DPRINTFN(2, "no dev ref\n");
goto error;
}
cpd->udev->refcount++;
mtx_unlock(&usb_ref_lock);
/*
* We need to grab the enumeration SX-lock before
* grabbing the FIFO refs to avoid deadlock at detach!
*/
crd->do_unlock = usbd_enum_lock_sig(cpd->udev);
mtx_lock(&usb_ref_lock);
/*
* Set "is_uref" after grabbing the default SX lock
*/
crd->is_uref = 1;
/* check for signal */
if (crd->do_unlock > 1) {
crd->do_unlock = 0;
goto error;
}
}
/* check if we are doing an open */
if (cpd->fflags == 0) {
/* use zero defaults */
} else {
/* check for write */
if (cpd->fflags & FWRITE) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_TX];
crd->txfifo = f;
crd->is_write = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
/* check for read */
if (cpd->fflags & FREAD) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_RX];
crd->rxfifo = f;
crd->is_read = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
}
/* when everything is OK we increment the refcounts */
if (crd->is_write) {
DPRINTFN(2, "ref write\n");
crd->txfifo->refcount++;
}
if (crd->is_read) {
DPRINTFN(2, "ref read\n");
crd->rxfifo->refcount++;
}
mtx_unlock(&usb_ref_lock);
return (0);
error:
if (crd->do_unlock)
usbd_enum_unlock(cpd->udev);
if (crd->is_uref) {
if (--(cpd->udev->refcount) == 0)
cv_broadcast(&cpd->udev->ref_cv);
}
mtx_unlock(&usb_ref_lock);
DPRINTFN(2, "fail\n");
/* clear all refs */
memset(crd, 0, sizeof(*crd));
return (USB_ERR_INVAL);
}
/*------------------------------------------------------------------------*
* usb_usb_ref_device
*
* This function is used to upgrade an existing reference on a USB
* location so that it also includes the USB device reference.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
/*
* Check if we already have a USB reference on this location:
*/
if (crd->is_uref)
return (0); /* success */
/*
* To avoid deadlock at detach we need to drop the FIFO ref
* and re-acquire a new ref!
*/
usb_unref_device(cpd, crd);
return (usb_ref_device(cpd, crd, 1 /* need uref */));
}
/*------------------------------------------------------------------------*
* usb_unref_device
*
* This function drops the reference count by one for the given
* USB device.
*------------------------------------------------------------------------*/
static void
usb_unref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref);
if (crd->do_unlock)
usbd_enum_unlock(cpd->udev);
mtx_lock(&usb_ref_lock);
if (crd->is_read) {
if (--(crd->rxfifo->refcount) == 0) {
cv_signal(&crd->rxfifo->cv_drain);
}
crd->is_read = 0;
}
if (crd->is_write) {
if (--(crd->txfifo->refcount) == 0) {
cv_signal(&crd->txfifo->cv_drain);
}
crd->is_write = 0;
}
if (crd->is_uref) {
crd->is_uref = 0;
if (--(cpd->udev->refcount) == 0)
cv_broadcast(&cpd->udev->ref_cv);
}
mtx_unlock(&usb_ref_lock);
}
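/*
 * Illustrative sketch (not part of this driver): the bounded-refcount
 * pattern used by usb_ref_device()/usb_unref_device() above, reduced
 * to its core.  A counter is incremented under a lock, refused at a
 * maximum, and a condition variable is broadcast when the last
 * reference goes away so a detaching thread can proceed.  The pthread
 * primitives stand in for the kernel mtx/cv ones and are an
 * assumption of this sketch; initialization is omitted.
 */
#include <pthread.h>

struct ref_example {
	pthread_mutex_t	lock;
	pthread_cond_t	zero_cv;
	unsigned	count;
	unsigned	max;	/* plays the role of USB_DEV_REF_MAX */
};

static int
ref_acquire_example(struct ref_example *r)
{
	int ok;

	pthread_mutex_lock(&r->lock);
	ok = (r->count < r->max);
	if (ok)
		r->count++;
	pthread_mutex_unlock(&r->lock);
	return (ok ? 0 : -1);	/* -1: too many references */
}

static void
ref_release_example(struct ref_example *r)
{
	pthread_mutex_lock(&r->lock);
	if (--r->count == 0)
		pthread_cond_broadcast(&r->zero_cv);	/* wake a waiting detach */
	pthread_mutex_unlock(&r->lock);
}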
static struct usb_fifo *
usb_fifo_alloc(struct mtx *mtx)
{
struct usb_fifo *f;
f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO);
if (f != NULL) {
cv_init(&f->cv_io, "FIFO-IO");
cv_init(&f->cv_drain, "FIFO-DRAIN");
f->priv_mtx = mtx;
f->refcount = 1;
knlist_init_mtx(&f->selinfo.si_note, mtx);
}
return (f);
}
/*------------------------------------------------------------------------*
* usb_fifo_create
*------------------------------------------------------------------------*/
static int
usb_fifo_create(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
struct usb_device *udev = cpd->udev;
struct usb_fifo *f;
struct usb_endpoint *ep;
uint8_t n;
uint8_t is_tx;
uint8_t is_rx;
uint8_t no_null;
uint8_t is_busy;
int e = cpd->ep_addr;
is_tx = (cpd->fflags & FWRITE) ? 1 : 0;
is_rx = (cpd->fflags & FREAD) ? 1 : 0;
no_null = 1;
is_busy = 0;
/* Preallocated FIFO */
if (e < 0) {
DPRINTFN(5, "Preallocated FIFO\n");
if (is_tx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_TX];
if (f == NULL)
return (EINVAL);
crd->txfifo = f;
}
if (is_rx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_RX];
if (f == NULL)
return (EINVAL);
crd->rxfifo = f;
}
return (0);
}
KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e));
/* search for a free FIFO slot */
DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e);
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
if (no_null) {
no_null = 0;
n = 0;
} else {
/* end of FIFOs reached */
DPRINTFN(5, "out of FIFOs\n");
return (ENOMEM);
}
}
/* Check for TX FIFO */
if (is_tx) {
f = udev->fifo[n + USB_FIFO_TX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
/* Check for RX FIFO */
if (is_rx) {
f = udev->fifo[n + USB_FIFO_RX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
break;
}
if (no_null == 0) {
if (e >= (USB_EP_MAX / 2)) {
/* we don't create any endpoints in this range */
DPRINTFN(5, "ep out of range\n");
return (is_busy ? EBUSY : EINVAL);
}
}
if ((e != 0) && is_busy) {
/*
* Only the default control endpoint is allowed to be
* opened multiple times!
*/
DPRINTFN(5, "busy\n");
return (EBUSY);
}
/* Check TX FIFO */
if (is_tx &&
(udev->fifo[n + USB_FIFO_TX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_TX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc(&udev->device_mtx);
if (f == NULL) {
DPRINTFN(5, "could not alloc tx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_TX;
f->dev_ep_index = e;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_TX] = f;
mtx_unlock(&usb_ref_lock);
}
/* Check RX FIFO */
if (is_rx &&
(udev->fifo[n + USB_FIFO_RX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_RX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc(&udev->device_mtx);
if (f == NULL) {
DPRINTFN(5, "could not alloc rx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_RX;
f->dev_ep_index = e;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_RX] = f;
mtx_unlock(&usb_ref_lock);
}
if (is_tx) {
crd->txfifo = udev->fifo[n + USB_FIFO_TX];
}
if (is_rx) {
crd->rxfifo = udev->fifo[n + USB_FIFO_RX];
}
/* fill out fifo index */
DPRINTFN(5, "fifo index = %d\n", n);
cpd->fifo_index = n;
/* complete */
return (0);
}
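/*
 * Illustrative sketch (not part of this driver): the FIFO table layout
 * searched above.  FIFOs live in TX/RX pairs, so the search advances
 * two slots at a time and a logical pair base n owns slots
 * n + USB_FIFO_TX and n + USB_FIFO_RX.  The concrete constant values
 * here are an assumption based on the "(fifo_index & 1)" tests
 * elsewhere in this file; the names are hypothetical.
 */
#define EX_FIFO_TX	0	/* even slot of a pair */
#define EX_FIFO_RX	1	/* odd slot of a pair */

static int
fifo_slot_example(int pair_base, int is_rx)
{
	/* pair_base is the even index the search loop calls "n" */
	return (pair_base + (is_rx ? EX_FIFO_RX : EX_FIFO_TX));
}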
void
usb_fifo_free(struct usb_fifo *f)
{
uint8_t n;
if (f == NULL) {
/* be NULL safe */
return;
}
/* destroy symlink devices, if any */
for (n = 0; n != 2; n++) {
if (f->symlink[n]) {
usb_free_symlink(f->symlink[n]);
f->symlink[n] = NULL;
}
}
mtx_lock(&usb_ref_lock);
/* delink ourselves to stop calls from userland */
if ((f->fifo_index < USB_FIFO_MAX) &&
(f->udev != NULL) &&
(f->udev->fifo[f->fifo_index] == f)) {
f->udev->fifo[f->fifo_index] = NULL;
} else {
DPRINTFN(0, "USB FIFO %p has not been linked\n", f);
}
/* decrease refcount */
f->refcount--;
/* need to wait until all callers have exited */
while (f->refcount != 0) {
mtx_unlock(&usb_ref_lock); /* avoid LOR */
mtx_lock(f->priv_mtx);
/* prevent write flush, if any */
f->flag_iserror = 1;
/* get I/O thread out of any sleep state */
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
mtx_unlock(f->priv_mtx);
mtx_lock(&usb_ref_lock);
/*
* Check if the "f->refcount" variable reached zero
* during the unlocked time before entering wait:
*/
if (f->refcount == 0)
break;
/* wait for sync */
cv_wait(&f->cv_drain, &usb_ref_lock);
}
mtx_unlock(&usb_ref_lock);
/* take care of closing the device here, if any */
usb_fifo_close(f, 0);
cv_destroy(&f->cv_io);
cv_destroy(&f->cv_drain);
knlist_clear(&f->selinfo.si_note, 0);
seldrain(&f->selinfo);
knlist_destroy(&f->selinfo.si_note);
free(f, M_USBDEV);
}
static struct usb_endpoint *
usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir)
{
struct usb_endpoint *ep;
uint8_t ep_dir;
if (ep_index == 0) {
ep = &udev->ctrl_ep;
} else {
if (dir == USB_FIFO_RX) {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_IN;
} else {
ep_dir = UE_DIR_OUT;
}
} else {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_OUT;
} else {
ep_dir = UE_DIR_IN;
}
}
ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir);
}
if (ep == NULL) {
/* if the endpoint does not exist then return */
return (NULL);
}
if (ep->edesc == NULL) {
/* invalid endpoint */
return (NULL);
}
return (ep); /* success */
}
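/*
 * Illustrative sketch (not part of this driver): how the endpoint
 * address looked up by usb_dev_get_ep() above is composed.  Per the
 * USB spec the direction is bit 7 of the endpoint address (0x80 = IN).
 * Names are hypothetical.
 */
#include <stdint.h>

#define EX_DIR_IN	0x80
#define EX_DIR_OUT	0x00

static uint8_t
ep_addr_example(uint8_t ep_index, int host_mode, int is_rx_fifo)
{
	/*
	 * On the host side an RX FIFO reads from an IN endpoint; on the
	 * device side the same FIFO is fed by an OUT endpoint, and the
	 * TX case is mirrored, exactly as in the function above.
	 */
	uint8_t dir = ((!!is_rx_fifo) == (!!host_mode)) ? EX_DIR_IN : EX_DIR_OUT;

	return (ep_index | dir);
}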
/*------------------------------------------------------------------------*
* usb_fifo_open
*
* Returns:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
static int
usb_fifo_open(struct usb_cdev_privdata *cpd,
struct usb_fifo *f, int fflags)
{
int err;
if (f == NULL) {
/* no FIFO there */
DPRINTFN(2, "no FIFO\n");
return (ENXIO);
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* set correct file flags */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
fflags |= FWRITE;
} else {
fflags |= FREAD;
}
/* check if we are already opened */
/* we don't need any locks when checking this variable */
if (f->curr_cpd != NULL) {
err = EBUSY;
goto done;
}
/* reset short flag before open */
f->flag_short = 0;
/* call open method */
err = (f->methods->f_open) (f, fflags);
if (err) {
goto done;
}
mtx_lock(f->priv_mtx);
/* reset sleep flag */
f->flag_sleeping = 0;
/* reset error flag */
f->flag_iserror = 0;
/* reset complete flag */
f->flag_iscomplete = 0;
/* reset select flag */
f->flag_isselect = 0;
/* reset flushing flag */
f->flag_flushing = 0;
/* reset ASYNC proc flag */
f->async_p = NULL;
mtx_lock(&usb_ref_lock);
/* flag the fifo as opened to prevent others */
f->curr_cpd = cpd;
mtx_unlock(&usb_ref_lock);
/* reset queue */
usb_fifo_reset(f);
mtx_unlock(f->priv_mtx);
done:
return (err);
}
/*------------------------------------------------------------------------*
* usb_fifo_reset
*------------------------------------------------------------------------*/
void
usb_fifo_reset(struct usb_fifo *f)
{
struct usb_mbuf *m;
if (f == NULL) {
return;
}
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
USB_IF_ENQUEUE(&f->free_q, m);
} else {
break;
}
}
/* reset have fragment flag */
f->flag_have_fragment = 0;
}
/*------------------------------------------------------------------------*
* usb_fifo_close
*------------------------------------------------------------------------*/
static void
usb_fifo_close(struct usb_fifo *f, int fflags)
{
int err;
/* check if we are not opened */
if (f->curr_cpd == NULL) {
/* nothing to do - already closed */
return;
}
mtx_lock(f->priv_mtx);
/* clear current cdev private data pointer */
mtx_lock(&usb_ref_lock);
f->curr_cpd = NULL;
mtx_unlock(&usb_ref_lock);
/* check if we are watched by kevent */
KNOTE_LOCKED(&f->selinfo.si_note, 0);
/* check if we are selected */
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
/* check if a thread wants SIGIO */
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
f->async_p = NULL;
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* flush written data, if any */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
if (!f->flag_iserror) {
/* set flushing flag */
f->flag_flushing = 1;
/* get the last packet in */
if (f->flag_have_fragment) {
struct usb_mbuf *m;
f->flag_have_fragment = 0;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_IF_ENQUEUE(&f->used_q, m);
}
}
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
/* check if flushed already */
while (f->flag_flushing &&
(!f->flag_iserror)) {
/* wait until all data has been written */
f->flag_sleeping = 1;
err = cv_timedwait_sig(&f->cv_io, f->priv_mtx,
USB_MS_TO_TICKS(USB_DEFAULT_TIMEOUT));
if (err) {
DPRINTF("signal received\n");
break;
}
}
}
fflags |= FWRITE;
/* stop write transfer, if not already stopped */
(f->methods->f_stop_write) (f);
} else {
fflags |= FREAD;
/* stop read transfer, if not already stopped */
(f->methods->f_stop_read) (f);
}
/* check if we are sleeping */
if (f->flag_sleeping) {
DPRINTFN(2, "Sleeping at close!\n");
}
mtx_unlock(f->priv_mtx);
/* call close method */
(f->methods->f_close) (f, fflags);
DPRINTF("closed\n");
}
/*------------------------------------------------------------------------*
* usb_open - cdev callback
*------------------------------------------------------------------------*/
static int
usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
{
struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1;
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd;
- int err, ep;
+ int err;
DPRINTFN(2, "%s fflags=0x%08x\n", devtoname(dev), fflags);
KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags"));
if (((fflags & FREAD) && !(pd->mode & FREAD)) ||
((fflags & FWRITE) && !(pd->mode & FWRITE))) {
DPRINTFN(2, "access mode not supported\n");
return (EPERM);
}
cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO);
- ep = cpd->ep_addr = pd->ep_addr;
usb_loc_fill(pd, cpd);
err = usb_ref_device(cpd, &refs, 1);
if (err) {
DPRINTFN(2, "cannot ref device\n");
free(cpd, M_USBDEV);
return (ENXIO);
}
cpd->fflags = fflags; /* access mode for open lifetime */
/* create FIFOs, if any */
err = usb_fifo_create(cpd, &refs);
/* check for error */
if (err) {
DPRINTFN(2, "cannot create fifo\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
if (fflags & FREAD) {
err = usb_fifo_open(cpd, refs.rxfifo, fflags);
if (err) {
DPRINTFN(2, "read open failed\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
if (fflags & FWRITE) {
err = usb_fifo_open(cpd, refs.txfifo, fflags);
if (err) {
DPRINTFN(2, "write open failed\n");
if (fflags & FREAD) {
usb_fifo_close(refs.rxfifo, fflags);
}
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
usb_unref_device(cpd, &refs);
devfs_set_cdevpriv(cpd, usb_close);
return (0);
}
/*------------------------------------------------------------------------*
* usb_close - cdev callback
*------------------------------------------------------------------------*/
static void
usb_close(void *arg)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd = arg;
int err;
DPRINTFN(2, "cpd=%p\n", cpd);
err = usb_ref_device(cpd, &refs,
2 /* uref and allow detached state */);
if (err) {
DPRINTFN(2, "Cannot grab USB reference when "
"closing USB file handle\n");
goto done;
}
if (cpd->fflags & FREAD) {
usb_fifo_close(refs.rxfifo, cpd->fflags);
}
if (cpd->fflags & FWRITE) {
usb_fifo_close(refs.txfifo, cpd->fflags);
}
usb_unref_device(cpd, &refs);
done:
free(cpd, M_USBDEV);
}
static void
usb_dev_init(void *arg)
{
mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF);
sx_init(&usb_sym_lock, "USB sym mutex");
TAILQ_INIT(&usb_sym_head);
/* check the UGEN methods */
usb_fifo_check_methods(&usb_ugen_methods);
}
SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL);
static void
usb_dev_init_post(void *arg)
{
/*
* Create /dev/usb - this is needed for usbconfig(8), which
* needs a well-known device name to access.
*/
usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR,
0644, USB_DEVICE_NAME);
if (usb_dev == NULL) {
DPRINTFN(0, "Could not create usb bus device\n");
}
}
SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL);
static void
usb_dev_uninit(void *arg)
{
if (usb_dev != NULL) {
destroy_dev(usb_dev);
usb_dev = NULL;
}
mtx_destroy(&usb_ref_lock);
sx_destroy(&usb_sym_lock);
}
SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL);
static int
usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr,
struct thread *td)
{
int error = 0;
switch (cmd) {
case FIODTYPE:
*(int *)addr = 0; /* character device */
break;
case FIONBIO:
/* handled by upper FS layer */
break;
case FIOASYNC:
if (*(int *)addr) {
if (f->async_p != NULL) {
error = EBUSY;
break;
}
f->async_p = USB_TD_GET_PROC(td);
} else {
f->async_p = NULL;
}
break;
/* XXX this is not the most general solution */
case TIOCSPGRP:
if (f->async_p == NULL) {
error = EINVAL;
break;
}
if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) {
error = EPERM;
break;
}
break;
default:
return (ENOIOCTL);
}
DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error);
return (error);
}
/*------------------------------------------------------------------------*
* usb_ioctl - cdev callback
*------------------------------------------------------------------------*/
static int
usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err;
DPRINTFN(2, "cmd=0x%lx\n", cmd);
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
/*
* Performance optimisation: We try to check for IOCTLs that
* don't need the USB reference first. Then we grab the USB
* reference if we need it!
*/
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
fflags = cpd->fflags;
f = NULL; /* set default value */
err = ENOIOCTL; /* set default value */
if (fflags & FWRITE) {
f = refs.txfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
if (fflags & FREAD) {
f = refs.rxfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
KASSERT(f != NULL, ("fifo not found"));
if (err != ENOIOCTL)
goto done;
err = (f->methods->f_ioctl) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err);
if (err != ENOIOCTL)
goto done;
if (usb_usb_ref_device(cpd, &refs)) {
/* we lost the reference */
return (ENXIO);
}
err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err);
if (err == ENOIOCTL)
err = ENOTTY;
if (err)
goto done;
/* Wait for re-enumeration, if any */
while (f->udev->re_enumerate_wait != USB_RE_ENUM_DONE) {
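/*
 * Drop our reference while waiting so that the re-enumeration
 * can proceed, then try to get it back. If the device has gone
 * away in the meantime, return ENXIO:
 */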
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
while (usb_ref_device(cpd, &refs, 1 /* need uref */)) {
if (usb_ref_device(cpd, &refs, 0)) {
/* device no longer exists */
return (ENXIO);
}
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
}
}
done:
usb_unref_device(cpd, &refs);
return (err);
}
static void
usb_filter_detach(struct knote *kn)
{
struct usb_fifo *f = kn->kn_hook;
knlist_remove(&f->selinfo.si_note, kn, 0);
}
static int
usb_filter_write(struct knote *kn, long hint)
{
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
DPRINTFN(2, "\n");
f = kn->kn_hook;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
cpd = f->curr_cpd;
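/*
 * When "fs_ep_max" is non-zero the FIFO is driven by explicit
 * transfer completion ("flag_iscomplete") rather than by the
 * free/used queues:
 */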
if (cpd == NULL) {
m = (void *)1;
} else if (f->fs_ep_max == 0) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
return (m ? 1 : 0);
}
static int
usb_filter_read(struct knote *kn, long hint)
{
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
DPRINTFN(2, "\n");
f = kn->kn_hook;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
cpd = f->curr_cpd;
if (cpd == NULL) {
m = (void *)1;
} else if (f->fs_ep_max == 0) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
/* start reading data, if any */
if (m == NULL)
(f->methods->f_start_read) (f);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
return (m ? 1 : 0);
}
static struct filterops usb_filtops_write = {
.f_isfd = 1,
.f_detach = usb_filter_detach,
.f_event = usb_filter_write,
};
static struct filterops usb_filtops_read = {
.f_isfd = 1,
.f_detach = usb_filter_detach,
.f_event = usb_filter_read,
};
/* ARGSUSED */
static int
usb_kqfilter(struct cdev* dev, struct knote *kn)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err = EINVAL;
DPRINTFN(2, "\n");
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (ENXIO);
fflags = cpd->fflags;
/* Figure out who needs service */
switch (kn->kn_filter) {
case EVFILT_WRITE:
if (fflags & FWRITE) {
f = refs.txfifo;
kn->kn_fop = &usb_filtops_write;
err = 0;
}
break;
case EVFILT_READ:
if (fflags & FREAD) {
f = refs.rxfifo;
kn->kn_fop = &usb_filtops_read;
err = 0;
}
break;
default:
err = EOPNOTSUPP;
break;
}
if (err == 0) {
kn->kn_hook = f;
mtx_lock(f->priv_mtx);
knlist_add(&f->selinfo.si_note, kn, 1);
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (err);
}
/* ARGSUSED */
static int
usb_poll(struct cdev* dev, int events, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags, revents;
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (events &
(POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
fflags = cpd->fflags;
/* Figure out who needs service */
revents = 0;
if ((events & (POLLOUT | POLLWRNORM)) &&
(fflags & FWRITE)) {
f = refs.txfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLOUT | POLLWRNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
}
mtx_unlock(f->priv_mtx);
}
if ((events & (POLLIN | POLLRDNORM)) &&
(fflags & FREAD)) {
f = refs.rxfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
if (!refs.is_usbfs) {
/* start reading data */
(f->methods->f_start_read) (f);
}
}
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (revents);
}
static int
usb_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
- int fflags;
- int resid;
int io_len;
int err;
uint8_t tr_data = 0;
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
- fflags = cpd->fflags;
-
f = refs.rxfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
- resid = uio->uio_resid;
-
mtx_lock(f->priv_mtx);
/* check for permanent read error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
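/* move data from the used queue to userland, one USB mbuf at a time */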
while (uio->uio_resid > 0) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m == NULL) {
/* start read transfer, if not already started */
(f->methods->f_start_read) (f);
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
if (f->methods->f_filter_read) {
/*
* Sometimes it is convenient to process data at the
* expense of a userland process instead of a kernel
* process.
*/
(f->methods->f_filter_read) (f, m);
}
tr_data = 1;
io_len = MIN(m->cur_data_len, uio->uio_resid);
DPRINTFN(2, "transfer %d bytes from %p\n",
io_len, m->cur_data_ptr);
err = usb_fifo_uiomove(f,
m->cur_data_ptr, io_len, uio);
m->cur_data_len -= io_len;
m->cur_data_ptr += io_len;
if (m->cur_data_len == 0) {
uint8_t last_packet;
last_packet = m->last_packet;
USB_IF_ENQUEUE(&f->free_q, m);
if (last_packet) {
/* keep framing */
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
if (err) {
break;
}
}
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
static int
usb_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
uint8_t *pdata;
- int fflags;
- int resid;
int io_len;
int err;
uint8_t tr_data = 0;
DPRINTFN(2, "\n");
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
- fflags = cpd->fflags;
-
f = refs.txfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
- resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent write error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
if (f->queue_data == NULL) {
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
}
/* we allow writing zero length data */
do {
USB_IF_DEQUEUE(&f->free_q, m);
if (m == NULL) {
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
tr_data = 1;
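/*
 * Either start filling a fresh USB mbuf, or append to the
 * partially filled one left over from a previous write when
 * write defragging is enabled:
 */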
if (f->flag_have_fragment == 0) {
USB_MBUF_RESET(m);
io_len = m->cur_data_len;
pdata = m->cur_data_ptr;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len = io_len;
} else {
io_len = m->max_data_len - m->cur_data_len;
pdata = m->cur_data_ptr + m->cur_data_len;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len += io_len;
}
DPRINTFN(2, "transfer %d bytes to %p\n",
io_len, pdata);
err = usb_fifo_uiomove(f, pdata, io_len, uio);
if (err) {
f->flag_have_fragment = 0;
USB_IF_ENQUEUE(&f->free_q, m);
break;
}
/* check if the buffer is ready to be transmitted */
if ((f->flag_write_defrag == 0) ||
(m->cur_data_len == m->max_data_len)) {
f->flag_have_fragment = 0;
/*
* Check for write filter:
*
* Sometimes it is convenient to process data
* at the expense of a userland process
* instead of a kernel process.
*/
if (f->methods->f_filter_write) {
(f->methods->f_filter_write) (f, m);
}
/* Put USB mbuf in the used queue */
USB_IF_ENQUEUE(&f->used_q, m);
/* Start writing data, if not already started */
(f->methods->f_start_write) (f);
} else {
/* Wait for more data or close */
f->flag_have_fragment = 1;
USB_IF_PREPEND(&f->free_q, m);
}
} while (uio->uio_resid > 0);
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
int
usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
union {
struct usb_read_dir *urd;
void* data;
} u;
int err;
u.data = data;
switch (cmd) {
case USB_READ_DIR:
err = usb_read_symlink(u.urd->urd_data,
u.urd->urd_startentry, u.urd->urd_maxlen);
break;
case USB_DEV_QUIRK_GET:
case USB_QUIRK_NAME_GET:
case USB_DEV_QUIRK_ADD:
case USB_DEV_QUIRK_REMOVE:
err = usb_quirk_ioctl_p(cmd, data, fflag, td);
break;
case USB_GET_TEMPLATE:
*(int *)data = usb_template;
err = 0;
break;
case USB_SET_TEMPLATE:
err = priv_check(curthread, PRIV_DRIVER);
if (err)
break;
usb_template = *(int *)data;
break;
default:
err = ENOTTY;
break;
}
return (err);
}
static int
usb_fifo_uiomove(struct usb_fifo *f, void *cp,
int n, struct uio *uio)
{
int error;
mtx_unlock(f->priv_mtx);
/*
* "uiomove()" can sleep so one needs to make a wrapper,
* exiting the mutex and checking things:
*/
error = uiomove(cp, n, uio);
mtx_lock(f->priv_mtx);
return (error);
}
int
usb_fifo_wait(struct usb_fifo *f)
{
int err;
USB_MTX_ASSERT(f->priv_mtx, MA_OWNED);
if (f->flag_iserror) {
/* we are gone */
return (EIO);
}
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (f->flag_iserror) {
/* we are gone */
err = EIO;
}
return (err);
}
void
usb_fifo_signal(struct usb_fifo *f)
{
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
}
void
usb_fifo_wakeup(struct usb_fifo *f)
{
usb_fifo_signal(f);
KNOTE_LOCKED(&f->selinfo.si_note, 0);
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
}
}
static int
usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags)
{
return (0);
}
static void
usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags)
{
return;
}
static int
usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags)
{
return (ENOIOCTL);
}
static void
usb_fifo_dummy_cmd(struct usb_fifo *fifo)
{
fifo->flag_flushing = 0; /* not flushing */
}
static void
usb_fifo_check_methods(struct usb_fifo_methods *pm)
{
/* check that all callback functions are OK */
if (pm->f_open == NULL)
pm->f_open = &usb_fifo_dummy_open;
if (pm->f_close == NULL)
pm->f_close = &usb_fifo_dummy_close;
if (pm->f_ioctl == NULL)
pm->f_ioctl = &usb_fifo_dummy_ioctl;
if (pm->f_ioctl_post == NULL)
pm->f_ioctl_post = &usb_fifo_dummy_ioctl;
if (pm->f_start_read == NULL)
pm->f_start_read = &usb_fifo_dummy_cmd;
if (pm->f_stop_read == NULL)
pm->f_stop_read = &usb_fifo_dummy_cmd;
if (pm->f_start_write == NULL)
pm->f_start_write = &usb_fifo_dummy_cmd;
if (pm->f_stop_write == NULL)
pm->f_stop_write = &usb_fifo_dummy_cmd;
}
/*------------------------------------------------------------------------*
* usb_fifo_attach
*
* The following function will create a duplex FIFO.
*
* Return values:
* 0: Success.
* Else: Failure.
*------------------------------------------------------------------------*/
int
usb_fifo_attach(struct usb_device *udev, void *priv_sc,
struct mtx *priv_mtx, struct usb_fifo_methods *pm,
struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit,
uint8_t iface_index, uid_t uid, gid_t gid, int mode)
{
struct usb_fifo *f_tx;
struct usb_fifo *f_rx;
char devname[32];
uint8_t n;
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
if (pm == NULL)
return (EINVAL);
/* check the methods */
usb_fifo_check_methods(pm);
if (priv_mtx == NULL)
priv_mtx = &Giant;
/* search for a free pair of FIFO slots; TX/RX FIFOs are allocated in pairs at even base indices */
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
/* end of FIFOs reached */
return (ENOMEM);
}
/* Check for TX FIFO */
if (udev->fifo[n + USB_FIFO_TX] != NULL) {
continue;
}
/* Check for RX FIFO */
if (udev->fifo[n + USB_FIFO_RX] != NULL) {
continue;
}
break;
}
f_tx = usb_fifo_alloc(priv_mtx);
f_rx = usb_fifo_alloc(priv_mtx);
if ((f_tx == NULL) || (f_rx == NULL)) {
usb_fifo_free(f_tx);
usb_fifo_free(f_rx);
return (ENOMEM);
}
/* initialise FIFO structures */
f_tx->fifo_index = n + USB_FIFO_TX;
f_tx->dev_ep_index = -1;
f_tx->priv_sc0 = priv_sc;
f_tx->methods = pm;
f_tx->iface_index = iface_index;
f_tx->udev = udev;
f_rx->fifo_index = n + USB_FIFO_RX;
f_rx->dev_ep_index = -1;
f_rx->priv_sc0 = priv_sc;
f_rx->methods = pm;
f_rx->iface_index = iface_index;
f_rx->udev = udev;
f_sc->fp[USB_FIFO_TX] = f_tx;
f_sc->fp[USB_FIFO_RX] = f_rx;
mtx_lock(&usb_ref_lock);
udev->fifo[f_tx->fifo_index] = f_tx;
udev->fifo[f_rx->fifo_index] = f_rx;
mtx_unlock(&usb_ref_lock);
for (n = 0; n != 4; n++) {
if (pm->basename[n] == NULL) {
continue;
}
if (subunit < 0) {
if (snprintf(devname, sizeof(devname),
"%s%u%s", pm->basename[n],
unit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
} else {
if (snprintf(devname, sizeof(devname),
"%s%u.%d%s", pm->basename[n],
unit, subunit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
}
/*
* Distribute the symbolic links into two FIFO structures:
*/
if (n & 1) {
f_rx->symlink[n / 2] =
usb_alloc_symlink(devname);
} else {
f_tx->symlink[n / 2] =
usb_alloc_symlink(devname);
}
/* Create the device */
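/*
 * The TX and RX FIFO indices differ only in the least
 * significant bit, so ANDing them yields the common, even
 * base index:
 */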
f_sc->dev = usb_make_dev(udev, devname, -1,
f_tx->fifo_index & f_rx->fifo_index,
FREAD|FWRITE, uid, gid, mode);
}
DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx);
return (0);
}
/*------------------------------------------------------------------------*
* usb_fifo_alloc_buffer
*
* Return values:
* 0: Success
* Else failure
*------------------------------------------------------------------------*/
int
usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize,
uint16_t nbuf)
{
usb_fifo_free_buffer(f);
/* allocate the queue buffers for this FIFO */
f->free_q.ifq_maxlen = nbuf;
f->used_q.ifq_maxlen = nbuf;
f->queue_data = usb_alloc_mbufs(
M_USBDEV, &f->free_q, bufsize, nbuf);
if ((f->queue_data == NULL) && bufsize && nbuf) {
return (ENOMEM);
}
return (0); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_free_buffer
*
* This function will free the buffers associated with a FIFO. This
* function can be called multiple times in a row.
*------------------------------------------------------------------------*/
void
usb_fifo_free_buffer(struct usb_fifo *f)
{
if (f->queue_data) {
/* free old buffer */
free(f->queue_data, M_USBDEV);
f->queue_data = NULL;
}
/* reset queues */
memset(&f->free_q, 0, sizeof(f->free_q));
memset(&f->used_q, 0, sizeof(f->used_q));
}
void
usb_fifo_detach(struct usb_fifo_sc *f_sc)
{
if (f_sc == NULL) {
return;
}
usb_fifo_free(f_sc->fp[USB_FIFO_TX]);
usb_fifo_free(f_sc->fp[USB_FIFO_RX]);
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
usb_destroy_dev(f_sc->dev);
f_sc->dev = NULL;
DPRINTFN(2, "detached %p\n", f_sc);
}
usb_size_t
usb_fifo_put_bytes_max(struct usb_fifo *f)
{
struct usb_mbuf *m;
usb_size_t len;
USB_IF_POLL(&f->free_q, m);
if (m) {
len = m->max_data_len;
} else {
len = 0;
}
return (len);
}
/*------------------------------------------------------------------------*
* usb_fifo_put_data
*
* what:
* 0 - normal operation
* 1 - set last packet flag to enforce framing
*------------------------------------------------------------------------*/
void
usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
usbd_copy_out(pc, offset, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
offset += io_len;
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
void
usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
memcpy(m->cur_data_ptr, ptr, io_len);
m->cur_data_len = io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
uint8_t
usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len)
{
struct usb_mbuf *m;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
m->cur_data_len = len;
m->cur_data_ptr = ptr;
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
return (1);
}
return (0);
}
void
usb_fifo_put_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_fifo_get_data
*
* what:
* 0 - normal operation
* 1 - only get one "usb_mbuf"
*
* returns:
* 0 - no more data
* 1 - data in buffer
*------------------------------------------------------------------------*/
uint8_t
usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen,
uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
usbd_copy_in(pc, offset, m->cur_data_ptr, io_len);
len -= io_len;
offset += io_len;
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, usb_size_t *actlen, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
memcpy(ptr, m->cur_data_ptr, io_len);
len -= io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen)
{
struct usb_mbuf *m;
USB_IF_POLL(&f->used_q, m);
if (m) {
*plen = m->cur_data_len;
*pptr = m->cur_data_ptr;
return (1);
}
return (0);
}
void
usb_fifo_get_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_alloc_symlink
*
* Return values:
* NULL: Failure
* Else: Pointer to symlink entry
*------------------------------------------------------------------------*/
struct usb_symlink *
usb_alloc_symlink(const char *target)
{
struct usb_symlink *ps;
ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK);
if (ps == NULL) {
return (ps);
}
/* XXX no longer needed */
strlcpy(ps->src_path, target, sizeof(ps->src_path));
ps->src_len = strlen(ps->src_path);
strlcpy(ps->dst_path, target, sizeof(ps->dst_path));
ps->dst_len = strlen(ps->dst_path);
sx_xlock(&usb_sym_lock);
TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
return (ps);
}
/*------------------------------------------------------------------------*
* usb_free_symlink
*------------------------------------------------------------------------*/
void
usb_free_symlink(struct usb_symlink *ps)
{
if (ps == NULL) {
return;
}
sx_xlock(&usb_sym_lock);
TAILQ_REMOVE(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
free(ps, M_USBDEV);
}
/*------------------------------------------------------------------------*
* usb_read_symlink
*
* Return value:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
int
usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len)
{
struct usb_symlink *ps;
uint32_t temp;
uint32_t delta = 0;
uint8_t len;
int error = 0;
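/*
 * Each entry is copied out as a single total-length byte
 * followed by the NUL-terminated source and destination
 * strings. A zero length byte terminates the list.
 */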
sx_xlock(&usb_sym_lock);
TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) {
/*
* Compute total length of source and destination symlink
* strings plus one length byte and two NUL bytes:
*/
temp = ps->src_len + ps->dst_len + 3;
if (temp > 255) {
/*
* Skip entry because this length cannot fit
* into one byte:
*/
continue;
}
if (startentry != 0) {
/* decrement read offset */
startentry--;
continue;
}
if (temp > user_len) {
/* out of buffer space */
break;
}
len = temp;
/* copy out total length */
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out source string */
error = copyout(ps->src_path,
USB_ADD_BYTES(user_ptr, delta), ps->src_len);
if (error) {
break;
}
len = 0;
delta += ps->src_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out destination string */
error = copyout(ps->dst_path,
USB_ADD_BYTES(user_ptr, delta), ps->dst_len);
if (error) {
break;
}
len = 0;
delta += ps->dst_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
user_len -= temp;
}
/* a zero length entry indicates the end */
if ((user_len != 0) && (error == 0)) {
len = 0;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
}
sx_unlock(&usb_sym_lock);
return (error);
}
void
usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* send a Zero Length Packet, ZLP, before close */
f->flag_short = onoff;
}
void
usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* defrag written data */
f->flag_write_defrag = onoff;
/* reset defrag state */
f->flag_have_fragment = 0;
}
void *
usb_fifo_softc(struct usb_fifo *f)
{
return (f->priv_sc0);
}
#endif /* USB_HAVE_UGEN */
Index: head/sys/dev/vnic/nic_main.c
===================================================================
--- head/sys/dev/vnic/nic_main.c (revision 327172)
+++ head/sys/dev/vnic/nic_main.c (revision 327173)
@@ -1,1232 +1,1229 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_media.h>
#include <machine/bus.h>
#include <machine/_inttypes.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <sys/dnv.h>
#include <sys/nv.h>
#ifdef PCI_IOV
#include <sys/iov_schema.h>
#include <dev/pci/pci_iov.h>
#endif
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "q_struct.h"
#define VNIC_PF_DEVSTR "Cavium Thunder NIC Physical Function Driver"
#define VNIC_PF_REG_RID PCIR_BAR(PCI_CFG_REG_BAR_NUM)
#define NIC_SET_VF_LMAC_MAP(bgx, lmac) ((((bgx) & 0xF) << 4) | ((lmac) & 0xF))
#define NIC_GET_BGX_FROM_VF_LMAC_MAP(map) (((map) >> 4) & 0xF)
#define NIC_GET_LMAC_FROM_VF_LMAC_MAP(map) ((map) & 0xF)
/* Structure to be used by the SR-IOV for VF configuration schemas */
struct nicvf_info {
boolean_t vf_enabled;
int vf_flags;
};
struct nicpf {
device_t dev;
uint8_t node;
u_int flags;
uint8_t num_vf_en; /* Number of VFs enabled */
struct nicvf_info vf_info[MAX_NUM_VFS_SUPPORTED];
struct resource * reg_base; /* Register start address */
struct pkind_cfg pkind;
uint8_t vf_lmac_map[MAX_LMAC];
boolean_t mbx_lock[MAX_NUM_VFS_SUPPORTED];
struct callout check_link;
struct mtx check_link_mtx;
uint8_t link[MAX_LMAC];
uint8_t duplex[MAX_LMAC];
uint32_t speed[MAX_LMAC];
uint16_t cpi_base[MAX_NUM_VFS_SUPPORTED];
uint16_t rssi_base[MAX_NUM_VFS_SUPPORTED];
uint16_t rss_ind_tbl_size;
/* MSI-X */
boolean_t msix_enabled;
uint8_t num_vec;
struct msix_entry msix_entries[NIC_PF_MSIX_VECTORS];
struct resource * msix_table_res;
};
static int nicpf_probe(device_t);
static int nicpf_attach(device_t);
static int nicpf_detach(device_t);
#ifdef PCI_IOV
static int nicpf_iov_init(device_t, uint16_t, const nvlist_t *);
static void nicpf_iov_uninit(device_t);
static int nicpf_iov_add_vf(device_t, uint16_t, const nvlist_t *);
#endif
static device_method_t nicpf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nicpf_probe),
DEVMETHOD(device_attach, nicpf_attach),
DEVMETHOD(device_detach, nicpf_detach),
/* PCI SR-IOV interface */
#ifdef PCI_IOV
DEVMETHOD(pci_iov_init, nicpf_iov_init),
DEVMETHOD(pci_iov_uninit, nicpf_iov_uninit),
DEVMETHOD(pci_iov_add_vf, nicpf_iov_add_vf),
#endif
DEVMETHOD_END,
};
static driver_t vnicpf_driver = {
"vnicpf",
nicpf_methods,
sizeof(struct nicpf),
};
static devclass_t vnicpf_devclass;
DRIVER_MODULE(vnicpf, pci, vnicpf_driver, vnicpf_devclass, 0, 0);
MODULE_VERSION(vnicpf, 1);
MODULE_DEPEND(vnicpf, pci, 1, 1, 1);
MODULE_DEPEND(vnicpf, ether, 1, 1, 1);
MODULE_DEPEND(vnicpf, thunder_bgx, 1, 1, 1);
static int nicpf_alloc_res(struct nicpf *);
static void nicpf_free_res(struct nicpf *);
static void nic_set_lmac_vf_mapping(struct nicpf *);
static void nic_init_hw(struct nicpf *);
static int nic_sriov_init(device_t, struct nicpf *);
static void nic_poll_for_link(void *);
static int nic_register_interrupts(struct nicpf *);
static void nic_unregister_interrupts(struct nicpf *);
/*
* Device interface
*/
static int
nicpf_probe(device_t dev)
{
uint16_t vendor_id;
uint16_t device_id;
vendor_id = pci_get_vendor(dev);
device_id = pci_get_device(dev);
if (vendor_id == PCI_VENDOR_ID_CAVIUM &&
device_id == PCI_DEVICE_ID_THUNDER_NIC_PF) {
device_set_desc(dev, VNIC_PF_DEVSTR);
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
nicpf_attach(device_t dev)
{
struct nicpf *nic;
int err;
nic = device_get_softc(dev);
nic->dev = dev;
/* Enable bus mastering */
pci_enable_busmaster(dev);
/* Allocate PCI resources */
err = nicpf_alloc_res(nic);
if (err != 0) {
device_printf(dev, "Could not allocate PCI resources\n");
return (err);
}
nic->node = nic_get_node_id(nic->reg_base);
/* Enable Traffic Network Switch (TNS) bypass mode by default */
nic->flags &= ~NIC_TNS_ENABLED;
nic_set_lmac_vf_mapping(nic);
/* Initialize hardware */
nic_init_hw(nic);
/* Set RSS TBL size for each VF */
nic->rss_ind_tbl_size = NIC_MAX_RSS_IDR_TBL_SIZE;
/* Setup interrupts */
err = nic_register_interrupts(nic);
if (err != 0)
goto err_free_res;
/* Configure SRIOV */
err = nic_sriov_init(dev, nic);
if (err != 0)
goto err_free_intr;
if (nic->flags & NIC_TNS_ENABLED)
return (0);
mtx_init(&nic->check_link_mtx, "VNIC PF link poll", NULL, MTX_DEF);
/* Register physical link status poll callout */
callout_init_mtx(&nic->check_link, &nic->check_link_mtx, 0);
mtx_lock(&nic->check_link_mtx);
nic_poll_for_link(nic);
mtx_unlock(&nic->check_link_mtx);
return (0);
err_free_intr:
nic_unregister_interrupts(nic);
err_free_res:
nicpf_free_res(nic);
pci_disable_busmaster(dev);
return (err);
}
static int
nicpf_detach(device_t dev)
{
struct nicpf *nic;
int err;
err = 0;
nic = device_get_softc(dev);
callout_drain(&nic->check_link);
mtx_destroy(&nic->check_link_mtx);
nic_unregister_interrupts(nic);
nicpf_free_res(nic);
pci_disable_busmaster(dev);
#ifdef PCI_IOV
err = pci_iov_detach(dev);
if (err != 0)
device_printf(dev, "SR-IOV in use. Detach first.\n");
#endif
return (err);
}
/*
* SR-IOV interface
*/
#ifdef PCI_IOV
static int
nicpf_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
{
struct nicpf *nic;
nic = device_get_softc(dev);
if (num_vfs == 0)
return (ENXIO);
nic->flags |= NIC_SRIOV_ENABLED;
return (0);
}
static void
nicpf_iov_uninit(device_t dev)
{
/* ARM64TODO: Implement this function */
}
static int
nicpf_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
{
const void *mac;
struct nicpf *nic;
size_t size;
int bgx, lmac;
nic = device_get_softc(dev);
if ((nic->flags & NIC_SRIOV_ENABLED) == 0)
return (ENXIO);
if (vfnum > (nic->num_vf_en - 1))
return (EINVAL);
if (nvlist_exists_binary(params, "mac-addr") != 0) {
mac = nvlist_get_binary(params, "mac-addr", &size);
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vfnum]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vfnum]);
bgx_set_lmac_mac(nic->node, bgx, lmac, mac);
}
return (0);
}
#endif
/*
* Helper routines
*/
static int
nicpf_alloc_res(struct nicpf *nic)
{
device_t dev;
int rid;
dev = nic->dev;
rid = VNIC_PF_REG_RID;
nic->reg_base = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (nic->reg_base == NULL) {
/* For verbose output print some more details */
if (bootverbose) {
device_printf(dev,
"Could not allocate registers memory\n");
}
return (ENXIO);
}
return (0);
}
static void
nicpf_free_res(struct nicpf *nic)
{
device_t dev;
dev = nic->dev;
if (nic->reg_base != NULL) {
bus_release_resource(dev, SYS_RES_MEMORY,
rman_get_rid(nic->reg_base), nic->reg_base);
}
}
/* Register read/write APIs */
static __inline void
nic_reg_write(struct nicpf *nic, bus_space_handle_t offset,
uint64_t val)
{
bus_write_8(nic->reg_base, offset, val);
}
static __inline uint64_t
nic_reg_read(struct nicpf *nic, uint64_t offset)
{
uint64_t val;
val = bus_read_8(nic->reg_base, offset);
return (val);
}
/* PF -> VF mailbox communication APIs */
static void
nic_enable_mbx_intr(struct nicpf *nic)
{
/* Enable mailbox interrupt for all 128 VFs */
nic_reg_write(nic, NIC_PF_MAILBOX_ENA_W1S, ~0UL);
nic_reg_write(nic, NIC_PF_MAILBOX_ENA_W1S + sizeof(uint64_t), ~0UL);
}
static void
nic_clear_mbx_intr(struct nicpf *nic, int vf, int mbx_reg)
{
nic_reg_write(nic, NIC_PF_MAILBOX_INT + (mbx_reg << 3), (1UL << vf));
}
static uint64_t
nic_get_mbx_addr(int vf)
{
return (NIC_PF_VF_0_127_MAILBOX_0_1 + (vf << NIC_VF_NUM_SHIFT));
}
/*
* Send a mailbox message to VF
* @vf: VF to which this message is to be sent
* @mbx: Message to be sent
*/
static void
nic_send_msg_to_vf(struct nicpf *nic, int vf, union nic_mbx *mbx)
{
bus_space_handle_t mbx_addr = nic_get_mbx_addr(vf);
uint64_t *msg = (uint64_t *)mbx;
/*
* In first revision HW, the mbox interrupt is triggered
* when PF writes to MBOX(1), in next revisions when
* PF writes to MBOX(0)
*/
if (pass1_silicon(nic->dev)) {
nic_reg_write(nic, mbx_addr + 0, msg[0]);
nic_reg_write(nic, mbx_addr + 8, msg[1]);
} else {
nic_reg_write(nic, mbx_addr + 8, msg[1]);
nic_reg_write(nic, mbx_addr + 0, msg[0]);
}
}
/*
* Responds to VF's READY message with VF's
* ID, node, MAC address, etc.
* @vf: VF which sent READY message
*/
static void
nic_mbx_send_ready(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
int bgx_idx, lmac;
const char *mac;
mbx.nic_cfg.msg = NIC_MBOX_MSG_READY;
mbx.nic_cfg.vf_id = vf;
if (nic->flags & NIC_TNS_ENABLED)
mbx.nic_cfg.tns_mode = NIC_TNS_MODE;
else
mbx.nic_cfg.tns_mode = NIC_TNS_BYPASS_MODE;
if (vf < MAX_LMAC) {
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
mac = bgx_get_lmac_mac(nic->node, bgx_idx, lmac);
if (mac) {
memcpy((uint8_t *)&mbx.nic_cfg.mac_addr, mac,
ETHER_ADDR_LEN);
}
}
mbx.nic_cfg.node_id = nic->node;
mbx.nic_cfg.loopback_supported = vf < MAX_LMAC;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* ACKs VF's mailbox message
* @vf: VF to which the ACK is to be sent
*/
static void
nic_mbx_send_ack(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_ACK;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* NACKs VF's mailbox message to indicate that the PF is not able to
* complete the requested action
* @vf: VF to which the NACK is to be sent
*/
static void
nic_mbx_send_nack(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_NACK;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* Flush all in-flight receive packets to memory and
* bring down an active RQ
*/
static int
nic_rcv_queue_sw_sync(struct nicpf *nic)
{
uint16_t timeout = ~0x00;
nic_reg_write(nic, NIC_PF_SW_SYNC_RX, 0x01);
/* Wait till sync cycle is finished */
while (timeout) {
if (nic_reg_read(nic, NIC_PF_SW_SYNC_RX_DONE) & 0x1)
break;
timeout--;
}
nic_reg_write(nic, NIC_PF_SW_SYNC_RX, 0x00);
if (!timeout) {
device_printf(nic->dev, "Receive queue software sync failed\n");
return (ETIMEDOUT);
}
return (0);
}
/* Get BGX Rx/Tx stats and respond to VF's request */
static void
nic_get_bgx_stats(struct nicpf *nic, struct bgx_stats_msg *bgx)
{
int bgx_idx, lmac;
union nic_mbx mbx = {};
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
mbx.bgx_stats.msg = NIC_MBOX_MSG_BGX_STATS;
mbx.bgx_stats.vf_id = bgx->vf_id;
mbx.bgx_stats.rx = bgx->rx;
mbx.bgx_stats.idx = bgx->idx;
if (bgx->rx != 0) {
mbx.bgx_stats.stats =
bgx_get_rx_stats(nic->node, bgx_idx, lmac, bgx->idx);
} else {
mbx.bgx_stats.stats =
bgx_get_tx_stats(nic->node, bgx_idx, lmac, bgx->idx);
}
nic_send_msg_to_vf(nic, bgx->vf_id, &mbx);
}
/* Update hardware min/max frame size */
static int
nic_update_hw_frs(struct nicpf *nic, int new_frs, int vf)
{
if ((new_frs > NIC_HW_MAX_FRS) || (new_frs < NIC_HW_MIN_FRS)) {
device_printf(nic->dev,
"Invalid MTU setting from VF%d rejected, "
"should be between %d and %d\n",
vf, NIC_HW_MIN_FRS, NIC_HW_MAX_FRS);
return (EINVAL);
}
new_frs += ETHER_HDR_LEN;
if (new_frs <= nic->pkind.maxlen)
return (0);
nic->pkind.maxlen = new_frs;
nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG, *(uint64_t *)&nic->pkind);
return (0);
}
/* Set minimum transmit packet size */
static void
nic_set_tx_pkt_pad(struct nicpf *nic, int size)
{
int lmac;
uint64_t lmac_cfg;
/* Max value that can be set is 60 */
if (size > 60)
size = 60;
for (lmac = 0; lmac < (MAX_BGX_PER_CN88XX * MAX_LMAC_PER_BGX); lmac++) {
lmac_cfg = nic_reg_read(nic, NIC_PF_LMAC_0_7_CFG | (lmac << 3));
lmac_cfg &= ~(0xF << 2);
lmac_cfg |= ((size / 4) << 2);
nic_reg_write(nic, NIC_PF_LMAC_0_7_CFG | (lmac << 3), lmac_cfg);
}
}
/*
* Function to check number of LMACs present and set VF::LMAC mapping.
* Mapping will be used while initializing channels.
*/
static void
nic_set_lmac_vf_mapping(struct nicpf *nic)
{
unsigned bgx_map = bgx_get_map(nic->node);
int bgx, next_bgx_lmac = 0;
int lmac, lmac_cnt = 0;
uint64_t lmac_credit;
nic->num_vf_en = 0;
if (nic->flags & NIC_TNS_ENABLED) {
nic->num_vf_en = DEFAULT_NUM_VF_ENABLED;
return;
}
for (bgx = 0; bgx < NIC_MAX_BGX; bgx++) {
if ((bgx_map & (1 << bgx)) == 0)
continue;
lmac_cnt = bgx_get_lmac_count(nic->node, bgx);
for (lmac = 0; lmac < lmac_cnt; lmac++)
nic->vf_lmac_map[next_bgx_lmac++] =
NIC_SET_VF_LMAC_MAP(bgx, lmac);
nic->num_vf_en += lmac_cnt;
/* Program LMAC credits */
lmac_credit = (1UL << 1); /* channel credit enable */
lmac_credit |= (0x1ff << 2); /* Max outstanding pkt count */
/* 48KB BGX Tx buffer size, each unit is of size 16 bytes */
lmac_credit |= (((((48 * 1024) / lmac_cnt) -
NIC_HW_MAX_FRS) / 16) << 12);
lmac = bgx * MAX_LMAC_PER_BGX;
for (; lmac < lmac_cnt + (bgx * MAX_LMAC_PER_BGX); lmac++) {
nic_reg_write(nic, NIC_PF_LMAC_0_7_CREDIT + (lmac * 8),
lmac_credit);
}
}
}
#define TNS_PORT0_BLOCK 6
#define TNS_PORT1_BLOCK 7
#define BGX0_BLOCK 8
#define BGX1_BLOCK 9
static void
nic_init_hw(struct nicpf *nic)
{
int i;
/* Enable NIC HW block */
nic_reg_write(nic, NIC_PF_CFG, 0x3);
/* Enable backpressure */
nic_reg_write(nic, NIC_PF_BP_CFG, (1UL << 6) | 0x03);
if (nic->flags & NIC_TNS_ENABLED) {
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
(NIC_TNS_MODE << 7) | TNS_PORT0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
(NIC_TNS_MODE << 7) | TNS_PORT1_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
(1UL << 63) | TNS_PORT0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
(1UL << 63) | TNS_PORT1_BLOCK);
} else {
/* Disable TNS mode on both interfaces */
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
(NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
(NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
(1UL << 63) | BGX0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
(1UL << 63) | BGX1_BLOCK);
}
/* PKIND configuration */
nic->pkind.minlen = 0;
nic->pkind.maxlen = NIC_HW_MAX_FRS + ETHER_HDR_LEN;
nic->pkind.lenerr_en = 1;
nic->pkind.rx_hdr = 0;
nic->pkind.hdr_sl = 0;
for (i = 0; i < NIC_MAX_PKIND; i++) {
nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG | (i << 3),
*(uint64_t *)&nic->pkind);
}
nic_set_tx_pkt_pad(nic, NIC_HW_MIN_FRS);
/* Timer config */
nic_reg_write(nic, NIC_PF_INTR_TIMER_CFG, NICPF_CLK_PER_INT_TICK);
/* Enable VLAN ethertype matching and stripping */
nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7,
(2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETHERTYPE_VLAN);
}
/* Channel parse index configuration */
static void
nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
{
uint32_t vnic, bgx, lmac, chan;
uint32_t padd, cpi_count = 0;
uint64_t cpi_base, cpi, rssi_base, rssi;
uint8_t qset, rq_idx = 0;
vnic = cfg->vf_id;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
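/*
 * Each LMAC owns a contiguous block of channel, CPI and RSSI
 * entries; derive the base indices for this VF from its
 * BGX/LMAC mapping:
 */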
chan = (lmac * MAX_BGX_CHANS_PER_LMAC) + (bgx * NIC_CHANS_PER_INF);
cpi_base = (lmac * NIC_MAX_CPI_PER_LMAC) + (bgx * NIC_CPI_PER_BGX);
rssi_base = (lmac * nic->rss_ind_tbl_size) + (bgx * NIC_RSSI_PER_BGX);
/* Rx channel configuration */
nic_reg_write(nic, NIC_PF_CHAN_0_255_RX_BP_CFG | (chan << 3),
(1UL << 63) | (vnic << 0));
nic_reg_write(nic, NIC_PF_CHAN_0_255_RX_CFG | (chan << 3),
((uint64_t)cfg->cpi_alg << 62) | (cpi_base << 48));
if (cfg->cpi_alg == CPI_ALG_NONE)
cpi_count = 1;
else if (cfg->cpi_alg == CPI_ALG_VLAN) /* 3 bits of PCP */
cpi_count = 8;
else if (cfg->cpi_alg == CPI_ALG_VLAN16) /* 3 bits PCP + DEI */
cpi_count = 16;
else if (cfg->cpi_alg == CPI_ALG_DIFF) /* 6 bits of DSCP */
cpi_count = NIC_MAX_CPI_PER_LMAC;
/* RSS Qset, Qidx mapping */
qset = cfg->vf_id;
rssi = rssi_base;
for (; rssi < (rssi_base + cfg->rq_cnt); rssi++) {
nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
(qset << 3) | rq_idx);
rq_idx++;
}
rssi = 0;
cpi = cpi_base;
for (; cpi < (cpi_base + cpi_count); cpi++) {
/* Determine port to channel adder */
if (cfg->cpi_alg != CPI_ALG_DIFF)
padd = cpi % cpi_count;
else
padd = cpi % 8; /* 3 bits of CS out of 6 bits of DSCP */
/* Leave RSS_SIZE as '0' to disable RSS */
if (pass1_silicon(nic->dev)) {
nic_reg_write(nic, NIC_PF_CPI_0_2047_CFG | (cpi << 3),
(vnic << 24) | (padd << 16) | (rssi_base + rssi));
} else {
/* Set MPI_ALG to '0' to disable MCAM parsing */
nic_reg_write(nic, NIC_PF_CPI_0_2047_CFG | (cpi << 3),
(padd << 16));
/* MPI index is same as CPI if MPI_ALG is not enabled */
nic_reg_write(nic, NIC_PF_MPI_0_2047_CFG | (cpi << 3),
(vnic << 24) | (rssi_base + rssi));
}
if ((rssi + 1) >= cfg->rq_cnt)
continue;
if (cfg->cpi_alg == CPI_ALG_VLAN)
rssi++;
else if (cfg->cpi_alg == CPI_ALG_VLAN16)
rssi = ((cpi - cpi_base) & 0xe) >> 1;
else if (cfg->cpi_alg == CPI_ALG_DIFF)
rssi = ((cpi - cpi_base) & 0x38) >> 3;
}
nic->cpi_base[cfg->vf_id] = cpi_base;
nic->rssi_base[cfg->vf_id] = rssi_base;
}
/* Responds to VF with its RSS indirection table size */
static void
nic_send_rss_size(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
- uint64_t *msg;
-
- msg = (uint64_t *)&mbx;
mbx.rss_size.msg = NIC_MBOX_MSG_RSS_SIZE;
mbx.rss_size.ind_tbl_size = nic->rss_ind_tbl_size;
nic_send_msg_to_vf(nic, vf, &mbx);
}
/*
* Receive side scaling configuration
* configure:
* - RSS index
* - indirection table, i.e. hash::RQ mapping
* - number of hash bits to consider
*/
static void
nic_config_rss(struct nicpf *nic, struct rss_cfg_msg *cfg)
{
uint8_t qset, idx;
uint64_t cpi_cfg, cpi_base, rssi_base, rssi;
uint64_t idx_addr;
idx = 0;
rssi_base = nic->rssi_base[cfg->vf_id] + cfg->tbl_offset;
rssi = rssi_base;
qset = cfg->vf_id;
for (; rssi < (rssi_base + cfg->tbl_len); rssi++) {
nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
(qset << 3) | (cfg->ind_tbl[idx] & 0x7));
idx++;
}
cpi_base = nic->cpi_base[cfg->vf_id];
if (pass1_silicon(nic->dev))
idx_addr = NIC_PF_CPI_0_2047_CFG;
else
idx_addr = NIC_PF_MPI_0_2047_CFG;
cpi_cfg = nic_reg_read(nic, idx_addr | (cpi_base << 3));
cpi_cfg &= ~(0xFUL << 20);
cpi_cfg |= (cfg->hash_bits << 20);
nic_reg_write(nic, idx_addr | (cpi_base << 3), cpi_cfg);
}
/*
* 4 level transmit side scheduler configuration
* for TNS bypass mode
*
* Sample configuration for SQ0
* VNIC0-SQ0 -> TL4(0) -> TL3[0] -> TL2[0] -> TL1[0] -> BGX0
* VNIC1-SQ0 -> TL4(8) -> TL3[2] -> TL2[0] -> TL1[0] -> BGX0
* VNIC2-SQ0 -> TL4(16) -> TL3[4] -> TL2[1] -> TL1[0] -> BGX0
* VNIC3-SQ0 -> TL4(24) -> TL3[6] -> TL2[1] -> TL1[0] -> BGX0
* VNIC4-SQ0 -> TL4(512) -> TL3[128] -> TL2[32] -> TL1[1] -> BGX1
* VNIC5-SQ0 -> TL4(520) -> TL3[130] -> TL2[32] -> TL1[1] -> BGX1
* VNIC6-SQ0 -> TL4(528) -> TL3[132] -> TL2[33] -> TL1[1] -> BGX1
* VNIC7-SQ0 -> TL4(536) -> TL3[134] -> TL2[33] -> TL1[1] -> BGX1
*/
static void
nic_tx_channel_cfg(struct nicpf *nic, uint8_t vnic, struct sq_cfg_msg *sq)
{
uint32_t bgx, lmac, chan;
uint32_t tl2, tl3, tl4;
uint32_t rr_quantum;
uint8_t sq_idx = sq->sq_num;
uint8_t pqs_vnic;
pqs_vnic = vnic;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[pqs_vnic]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[pqs_vnic]);
/* 24 bytes for FCS, IPG and preamble */
rr_quantum = ((NIC_HW_MAX_FRS + 24) / 4);
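/*
 * Walk down the scheduler hierarchy: every SQ gets its own TL4,
 * TL4s are grouped into TL3s and each TL2 serves four TL3s,
 * matching the sample mapping above:
 */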
tl4 = (lmac * NIC_TL4_PER_LMAC) + (bgx * NIC_TL4_PER_BGX);
tl4 += sq_idx;
tl3 = tl4 / (NIC_MAX_TL4 / NIC_MAX_TL3);
nic_reg_write(nic, NIC_PF_QSET_0_127_SQ_0_7_CFG2 |
((uint64_t)vnic << NIC_QS_ID_SHIFT) |
((uint32_t)sq_idx << NIC_Q_NUM_SHIFT), tl4);
nic_reg_write(nic, NIC_PF_TL4_0_1023_CFG | (tl4 << 3),
((uint64_t)vnic << 27) | ((uint32_t)sq_idx << 24) | rr_quantum);
nic_reg_write(nic, NIC_PF_TL3_0_255_CFG | (tl3 << 3), rr_quantum);
chan = (lmac * MAX_BGX_CHANS_PER_LMAC) + (bgx * NIC_CHANS_PER_INF);
nic_reg_write(nic, NIC_PF_TL3_0_255_CHAN | (tl3 << 3), chan);
/* Enable backpressure on the channel */
nic_reg_write(nic, NIC_PF_CHAN_0_255_TX_CFG | (chan << 3), 1);
tl2 = tl3 >> 2;
nic_reg_write(nic, NIC_PF_TL3A_0_63_CFG | (tl2 << 3), tl2);
nic_reg_write(nic, NIC_PF_TL2_0_63_CFG | (tl2 << 3), rr_quantum);
/* No priorities as of now */
nic_reg_write(nic, NIC_PF_TL2_0_63_PRI | (tl2 << 3), 0x00);
}
static int
nic_config_loopback(struct nicpf *nic, struct set_loopback *lbk)
{
int bgx_idx, lmac_idx;
if (lbk->vf_id > MAX_LMAC)
return (ENXIO);
bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
lmac_idx = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
bgx_lmac_internal_loopback(nic->node, bgx_idx, lmac_idx, lbk->enable);
return (0);
}
/* Interrupt handler to handle mailbox messages from VFs */
static void
nic_handle_mbx_intr(struct nicpf *nic, int vf)
{
union nic_mbx mbx = {};
uint64_t *mbx_data;
uint64_t mbx_addr;
uint64_t reg_addr;
uint64_t cfg;
int bgx, lmac;
int i;
int ret = 0;
nic->mbx_lock[vf] = TRUE;
mbx_addr = nic_get_mbx_addr(vf);
mbx_data = (uint64_t *)&mbx;
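/* copy the complete mailbox message from the hardware registers */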
for (i = 0; i < NIC_PF_VF_MAILBOX_SIZE; i++) {
*mbx_data = nic_reg_read(nic, mbx_addr);
mbx_data++;
mbx_addr += sizeof(uint64_t);
}
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
nic_mbx_send_ready(nic, vf);
if (vf < MAX_LMAC) {
nic->link[vf] = 0;
nic->duplex[vf] = 0;
nic->speed[vf] = 0;
}
ret = 1;
break;
case NIC_MBOX_MSG_QS_CFG:
reg_addr = NIC_PF_QSET_0_127_CFG |
(mbx.qs.num << NIC_QS_ID_SHIFT);
cfg = mbx.qs.cfg;
nic_reg_write(nic, reg_addr, cfg);
break;
case NIC_MBOX_MSG_RQ_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_RQ_BP_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_BP_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_RQ_SW_SYNC:
ret = nic_rcv_queue_sw_sync(nic);
break;
case NIC_MBOX_MSG_RQ_DROP_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_DROP_CFG |
(mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
break;
case NIC_MBOX_MSG_SQ_CFG:
reg_addr = NIC_PF_QSET_0_127_SQ_0_7_CFG |
(mbx.sq.qs_num << NIC_QS_ID_SHIFT) |
(mbx.sq.sq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.sq.cfg);
nic_tx_channel_cfg(nic, mbx.qs.num, &mbx.sq);
break;
case NIC_MBOX_MSG_SET_MAC:
lmac = mbx.mac.vf_id;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
bgx_set_lmac_mac(nic->node, bgx, lmac, mbx.mac.mac_addr);
break;
case NIC_MBOX_MSG_SET_MAX_FRS:
ret = nic_update_hw_frs(nic, mbx.frs.max_frs, mbx.frs.vf_id);
break;
case NIC_MBOX_MSG_CPI_CFG:
nic_config_cpi(nic, &mbx.cpi_cfg);
break;
case NIC_MBOX_MSG_RSS_SIZE:
nic_send_rss_size(nic, vf);
goto unlock;
case NIC_MBOX_MSG_RSS_CFG:
case NIC_MBOX_MSG_RSS_CFG_CONT: /* fall through */
nic_config_rss(nic, &mbx.rss_cfg);
break;
case NIC_MBOX_MSG_CFG_DONE:
/* Last message of VF config msg sequence */
nic->vf_info[vf].vf_enabled = TRUE;
goto unlock;
case NIC_MBOX_MSG_SHUTDOWN:
/* First msg in VF teardown sequence */
nic->vf_info[vf].vf_enabled = FALSE;
break;
case NIC_MBOX_MSG_BGX_STATS:
nic_get_bgx_stats(nic, &mbx.bgx_stats);
goto unlock;
case NIC_MBOX_MSG_LOOPBACK:
ret = nic_config_loopback(nic, &mbx.lbk);
break;
default:
device_printf(nic->dev,
"Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg);
break;
}
if (ret == 0)
nic_mbx_send_ack(nic, vf);
else if (mbx.msg.msg != NIC_MBOX_MSG_READY)
nic_mbx_send_nack(nic, vf);
unlock:
nic->mbx_lock[vf] = FALSE;
}
static void
nic_mbx_intr_handler(struct nicpf *nic, int mbx)
{
uint64_t intr;
uint8_t vf, vf_per_mbx_reg = 64;
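/*
 * Each mailbox interrupt register covers 64 VFs; scan the
 * pending bits and service every VF that has sent a message:
 */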
intr = nic_reg_read(nic, NIC_PF_MAILBOX_INT + (mbx << 3));
for (vf = 0; vf < vf_per_mbx_reg; vf++) {
if (intr & (1UL << vf)) {
nic_handle_mbx_intr(nic, vf + (mbx * vf_per_mbx_reg));
nic_clear_mbx_intr(nic, vf, mbx);
}
}
}
static void
nic_mbx0_intr_handler(void *arg)
{
struct nicpf *nic = (struct nicpf *)arg;
nic_mbx_intr_handler(nic, 0);
}
static void
nic_mbx1_intr_handler(void *arg)
{
struct nicpf *nic = (struct nicpf *)arg;
nic_mbx_intr_handler(nic, 1);
}
static int
nic_enable_msix(struct nicpf *nic)
{
struct pci_devinfo *dinfo;
int rid, count;
int ret;
dinfo = device_get_ivars(nic->dev);
rid = dinfo->cfg.msix.msix_table_bar;
nic->msix_table_res =
bus_alloc_resource_any(nic->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (nic->msix_table_res == NULL) {
device_printf(nic->dev,
"Could not allocate memory for MSI-X table\n");
return (ENXIO);
}
count = nic->num_vec = NIC_PF_MSIX_VECTORS;
ret = pci_alloc_msix(nic->dev, &count);
if ((ret != 0) || (count != nic->num_vec)) {
device_printf(nic->dev,
"Request for #%d msix vectors failed, error: %d\n",
nic->num_vec, ret);
return (ret != 0 ? ret : ENXIO);
}
nic->msix_enabled = 1;
return (0);
}
static void
nic_disable_msix(struct nicpf *nic)
{
if (nic->msix_enabled) {
pci_release_msi(nic->dev);
nic->msix_enabled = 0;
nic->num_vec = 0;
}
bus_release_resource(nic->dev, SYS_RES_MEMORY,
rman_get_rid(nic->msix_table_res), nic->msix_table_res);
}
static void
nic_free_all_interrupts(struct nicpf *nic)
{
int irq;
for (irq = 0; irq < nic->num_vec; irq++) {
if (nic->msix_entries[irq].irq_res == NULL)
continue;
if (nic->msix_entries[irq].handle != NULL) {
bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
}
bus_release_resource(nic->dev, SYS_RES_IRQ, irq + 1,
nic->msix_entries[irq].irq_res);
}
}
static int
nic_register_interrupts(struct nicpf *nic)
{
int irq, rid;
int ret;
/* Enable MSI-X */
ret = nic_enable_msix(nic);
if (ret != 0)
return (ret);
/* Register mailbox interrupt handlers */
irq = NIC_PF_INTR_ID_MBOX0;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
ret = ENXIO;
goto fail;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nic_mbx0_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0)
goto fail;
irq = NIC_PF_INTR_ID_MBOX1;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
ret = ENXIO;
goto fail;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nic_mbx1_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0)
goto fail;
/* Enable mailbox interrupt */
nic_enable_mbx_intr(nic);
return (0);
fail:
nic_free_all_interrupts(nic);
return (ret);
}
static void
nic_unregister_interrupts(struct nicpf *nic)
{
nic_free_all_interrupts(nic);
nic_disable_msix(nic);
}
static int
nic_sriov_init(device_t dev, struct nicpf *nic)
{
#ifdef PCI_IOV
nvlist_t *pf_schema, *vf_schema;
int iov_pos;
int err;
uint16_t total_vf_cnt;
err = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);
if (err != 0) {
device_printf(dev,
"SR-IOV capability is not found in PCIe config space\n");
return (err);
}
/* Fix-up the number of enabled VFs */
total_vf_cnt = pci_read_config(dev, iov_pos + PCIR_SRIOV_TOTAL_VFS, 2);
if (total_vf_cnt == 0)
return (ENXIO);
/* Attach SR-IOV */
pf_schema = pci_iov_schema_alloc_node();
vf_schema = pci_iov_schema_alloc_node();
pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
/*
* All VFs can change their MACs.
* This flag will be ignored but we set it just for the record.
*/
pci_iov_schema_add_bool(vf_schema, "allow-set-mac",
IOV_SCHEMA_HASDEFAULT, TRUE);
err = pci_iov_attach(dev, pf_schema, vf_schema);
if (err != 0) {
device_printf(dev,
"Failed to initialize SR-IOV (error=%d)\n",
err);
return (err);
}
#endif
return (0);
}
/*
* Poll for BGX LMAC link status and update the corresponding VF when it
* changes.  This is valid only if the internal L2 switch is not present;
* otherwise the VF link is always treated as up.
*/
static void
nic_poll_for_link(void *arg)
{
union nic_mbx mbx = {};
struct nicpf *nic;
struct bgx_link_status link;
uint8_t vf, bgx, lmac;
nic = (struct nicpf *)arg;
mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
for (vf = 0; vf < nic->num_vf_en; vf++) {
/* Poll only if VF is UP */
if (!nic->vf_info[vf].vf_enabled)
continue;
/* Get BGX, LMAC indices for the VF */
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
/* Get interface link status */
bgx_get_lmac_link_state(nic->node, bgx, lmac, &link);
/* Inform VF only if link status changed */
if (nic->link[vf] == link.link_up)
continue;
if (!nic->mbx_lock[vf]) {
nic->link[vf] = link.link_up;
nic->duplex[vf] = link.duplex;
nic->speed[vf] = link.speed;
/* Send a mbox message to VF with current link status */
mbx.link_status.link_up = link.link_up;
mbx.link_status.duplex = link.duplex;
mbx.link_status.speed = link.speed;
nic_send_msg_to_vf(nic, vf, &mbx);
}
}
callout_reset(&nic->check_link, hz * 2, nic_poll_for_link, nic);
}
Index: head/sys/dev/vnic/nicvf_main.c
===================================================================
--- head/sys/dev/vnic/nicvf_main.c (revision 327172)
+++ head/sys/dev/vnic/nicvf_main.c (revision 327173)
@@ -1,1627 +1,1625 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/stdatomic.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp_lro.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <sys/dnv.h>
#include <sys/nv.h>
#include <sys/iov_schema.h>
#include <machine/bus.h>
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "nicvf_queues.h"
#define VNIC_VF_DEVSTR "Cavium Thunder NIC Virtual Function Driver"
#define VNIC_VF_REG_RID PCIR_BAR(PCI_CFG_REG_BAR_NUM)
/* Lock for core interface settings */
#define NICVF_CORE_LOCK_INIT(nic) \
sx_init(&(nic)->core_sx, device_get_nameunit((nic)->dev))
#define NICVF_CORE_LOCK_DESTROY(nic) \
sx_destroy(&(nic)->core_sx)
#define NICVF_CORE_LOCK(nic) sx_xlock(&(nic)->core_sx)
#define NICVF_CORE_UNLOCK(nic) sx_xunlock(&(nic)->core_sx)
#define NICVF_CORE_LOCK_ASSERT(nic) sx_assert(&(nic)->core_sx, SA_XLOCKED)
#define SPEED_10 10
#define SPEED_100 100
#define SPEED_1000 1000
#define SPEED_10000 10000
#define SPEED_40000 40000
MALLOC_DEFINE(M_NICVF, "nicvf", "ThunderX VNIC VF dynamic memory");
static int nicvf_probe(device_t);
static int nicvf_attach(device_t);
static int nicvf_detach(device_t);
static device_method_t nicvf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nicvf_probe),
DEVMETHOD(device_attach, nicvf_attach),
DEVMETHOD(device_detach, nicvf_detach),
DEVMETHOD_END,
};
static driver_t nicvf_driver = {
"vnic",
nicvf_methods,
sizeof(struct nicvf),
};
static devclass_t nicvf_devclass;
DRIVER_MODULE(vnicvf, pci, nicvf_driver, nicvf_devclass, 0, 0);
MODULE_VERSION(vnicvf, 1);
MODULE_DEPEND(vnicvf, pci, 1, 1, 1);
MODULE_DEPEND(vnicvf, ether, 1, 1, 1);
MODULE_DEPEND(vnicvf, vnicpf, 1, 1, 1);
static int nicvf_allocate_misc_interrupt(struct nicvf *);
static int nicvf_enable_misc_interrupt(struct nicvf *);
static int nicvf_allocate_net_interrupts(struct nicvf *);
static void nicvf_release_all_interrupts(struct nicvf *);
static int nicvf_update_hw_max_frs(struct nicvf *, int);
static int nicvf_hw_set_mac_addr(struct nicvf *, uint8_t *);
static void nicvf_config_cpi(struct nicvf *);
static int nicvf_rss_init(struct nicvf *);
static int nicvf_init_resources(struct nicvf *);
static int nicvf_setup_ifnet(struct nicvf *);
static int nicvf_setup_ifmedia(struct nicvf *);
static void nicvf_hw_addr_random(uint8_t *);
static int nicvf_if_ioctl(struct ifnet *, u_long, caddr_t);
static void nicvf_if_init(void *);
static void nicvf_if_init_locked(struct nicvf *);
static int nicvf_if_transmit(struct ifnet *, struct mbuf *);
static void nicvf_if_qflush(struct ifnet *);
static uint64_t nicvf_if_getcounter(struct ifnet *, ift_counter);
static int nicvf_stop_locked(struct nicvf *);
static void nicvf_media_status(struct ifnet *, struct ifmediareq *);
static int nicvf_media_change(struct ifnet *);
static void nicvf_tick_stats(void *);
static int
nicvf_probe(device_t dev)
{
uint16_t vendor_id;
uint16_t device_id;
vendor_id = pci_get_vendor(dev);
device_id = pci_get_device(dev);
if (vendor_id != PCI_VENDOR_ID_CAVIUM)
return (ENXIO);
if (device_id == PCI_DEVICE_ID_THUNDER_NIC_VF ||
device_id == PCI_DEVICE_ID_THUNDER_PASS1_NIC_VF) {
device_set_desc(dev, VNIC_VF_DEVSTR);
return (BUS_PROBE_DEFAULT);
}
return (ENXIO);
}
static int
nicvf_attach(device_t dev)
{
struct nicvf *nic;
int rid, qcount;
int err = 0;
uint8_t hwaddr[ETHER_ADDR_LEN];
uint8_t zeromac[] = {[0 ... (ETHER_ADDR_LEN - 1)] = 0};
nic = device_get_softc(dev);
nic->dev = dev;
nic->pnicvf = nic;
NICVF_CORE_LOCK_INIT(nic);
/* Enable HW TSO on Pass2 */
if (!pass1_silicon(dev))
nic->hw_tso = TRUE;
rid = VNIC_VF_REG_RID;
nic->reg_base = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
RF_ACTIVE);
if (nic->reg_base == NULL) {
device_printf(dev, "Could not allocate registers memory\n");
return (ENXIO);
}
qcount = MAX_CMP_QUEUES_PER_QS;
nic->max_queues = qcount;
err = nicvf_set_qset_resources(nic);
if (err != 0)
goto err_free_res;
/* Check if PF is alive and get MAC address for this VF */
err = nicvf_allocate_misc_interrupt(nic);
if (err != 0)
goto err_free_res;
NICVF_CORE_LOCK(nic);
err = nicvf_enable_misc_interrupt(nic);
NICVF_CORE_UNLOCK(nic);
if (err != 0)
goto err_release_intr;
err = nicvf_allocate_net_interrupts(nic);
if (err != 0) {
device_printf(dev,
"Could not allocate network interface interrupts\n");
goto err_release_intr;
}
/* If no MAC address was obtained, generate a random one */
if (memcmp(nic->hwaddr, zeromac, ETHER_ADDR_LEN) == 0) {
nicvf_hw_addr_random(hwaddr);
memcpy(nic->hwaddr, hwaddr, ETHER_ADDR_LEN);
NICVF_CORE_LOCK(nic);
nicvf_hw_set_mac_addr(nic, hwaddr);
NICVF_CORE_UNLOCK(nic);
}
/* Configure CPI algorithm */
nic->cpi_alg = CPI_ALG_NONE;
NICVF_CORE_LOCK(nic);
nicvf_config_cpi(nic);
/* Configure receive side scaling */
if (nic->qs->rq_cnt > 1)
nicvf_rss_init(nic);
NICVF_CORE_UNLOCK(nic);
err = nicvf_setup_ifnet(nic);
if (err != 0) {
device_printf(dev, "Could not set-up ifnet\n");
goto err_release_intr;
}
err = nicvf_setup_ifmedia(nic);
if (err != 0) {
device_printf(dev, "Could not set-up ifmedia\n");
goto err_free_ifnet;
}
mtx_init(&nic->stats_mtx, "VNIC stats", NULL, MTX_DEF);
callout_init_mtx(&nic->stats_callout, &nic->stats_mtx, 0);
ether_ifattach(nic->ifp, nic->hwaddr);
return (0);
err_free_ifnet:
if_free(nic->ifp);
err_release_intr:
nicvf_release_all_interrupts(nic);
err_free_res:
bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(nic->reg_base),
nic->reg_base);
return (err);
}
static int
nicvf_detach(device_t dev)
{
struct nicvf *nic;
nic = device_get_softc(dev);
NICVF_CORE_LOCK(nic);
/* Shut down the port and release ring resources */
nicvf_stop_locked(nic);
/* Release stats lock */
mtx_destroy(&nic->stats_mtx);
/* Release interrupts */
nicvf_release_all_interrupts(nic);
/* Release memory resource */
if (nic->reg_base != NULL) {
bus_release_resource(dev, SYS_RES_MEMORY,
rman_get_rid(nic->reg_base), nic->reg_base);
}
/* Remove all ifmedia configurations */
ifmedia_removeall(&nic->if_media);
/* Free this ifnet */
if_free(nic->ifp);
NICVF_CORE_UNLOCK(nic);
/* Finally destroy the lock */
NICVF_CORE_LOCK_DESTROY(nic);
return (0);
}
static void
nicvf_hw_addr_random(uint8_t *hwaddr)
{
uint32_t rnd;
uint8_t addr[ETHER_ADDR_LEN];
/*
* Create randomized MAC address.
* Set 'bsd' + random 24 low-order bits.
*/
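/* Note: 'b' (0x62) has the locally administered bit set, so the result is not a vendor-assigned (OUI) address. */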
rnd = arc4random() & 0x00ffffff;
addr[0] = 'b';
addr[1] = 's';
addr[2] = 'd';
addr[3] = rnd >> 16;
addr[4] = rnd >> 8;
addr[5] = rnd >> 0;
memcpy(hwaddr, addr, ETHER_ADDR_LEN);
}
static int
nicvf_setup_ifnet(struct nicvf *nic)
{
struct ifnet *ifp;
ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
device_printf(nic->dev, "Could not allocate ifnet structure\n");
return (ENOMEM);
}
nic->ifp = ifp;
if_setsoftc(ifp, nic);
if_initname(ifp, device_get_name(nic->dev), device_get_unit(nic->dev));
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_settransmitfn(ifp, nicvf_if_transmit);
if_setqflushfn(ifp, nicvf_if_qflush);
if_setioctlfn(ifp, nicvf_if_ioctl);
if_setinitfn(ifp, nicvf_if_init);
if_setgetcounterfn(ifp, nicvf_if_getcounter);
if_setmtu(ifp, ETHERMTU);
/* Reset caps */
if_setcapabilities(ifp, 0);
/* Set the default values */
if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU, 0);
if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
if (nic->hw_tso) {
/* TSO */
if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
/* TSO parameters */
if_sethwtsomax(ifp, NICVF_TSO_MAXSIZE);
if_sethwtsomaxsegcount(ifp, NICVF_TSO_NSEGS);
if_sethwtsomaxsegsize(ifp, MCLBYTES);
}
/* IP/TCP/UDP HW checksums */
if_setcapabilitiesbit(ifp, IFCAP_HWCSUM, 0);
if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0);
/*
* HW offload enable
*/
if_clearhwassist(ifp);
if_sethwassistbits(ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP), 0);
if (nic->hw_tso)
if_sethwassistbits(ifp, (CSUM_TSO), 0);
if_setcapenable(ifp, if_getcapabilities(ifp));
return (0);
}
static int
nicvf_setup_ifmedia(struct nicvf *nic)
{
ifmedia_init(&nic->if_media, IFM_IMASK, nicvf_media_change,
nicvf_media_status);
/*
* Advertise availability of all possible connection types,
* even though not all are possible at the same time.
*/
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_10_T | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_100_TX | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_1000_T | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_10G_SR | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_40G_CR4 | IFM_FDX),
0, NULL);
ifmedia_add(&nic->if_media, (IFM_ETHER | IFM_AUTO | IFM_FDX),
0, NULL);
ifmedia_set(&nic->if_media, (IFM_ETHER | IFM_AUTO | IFM_FDX));
return (0);
}
static int
nicvf_if_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct nicvf *nic;
struct rcv_queue *rq;
struct ifreq *ifr;
- uint32_t flags;
int mask, err;
int rq_idx;
#if defined(INET) || defined(INET6)
struct ifaddr *ifa;
boolean_t avoid_reset = FALSE;
#endif
nic = if_getsoftc(ifp);
ifr = (struct ifreq *)data;
#if defined(INET) || defined(INET6)
ifa = (struct ifaddr *)data;
#endif
err = 0;
switch (cmd) {
case SIOCSIFADDR:
#ifdef INET
if (ifa->ifa_addr->sa_family == AF_INET)
avoid_reset = TRUE;
#endif
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6)
avoid_reset = TRUE;
#endif
#if defined(INET) || defined(INET6)
/* Avoid reinitialization unless it's necessary */
if (avoid_reset) {
if_setflagbits(ifp, IFF_UP, 0);
if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
nicvf_if_init(nic);
#ifdef INET
if (!(if_getflags(ifp) & IFF_NOARP))
arp_ifinit(ifp, ifa);
#endif
return (0);
}
#endif
err = ether_ioctl(ifp, cmd, data);
break;
case SIOCSIFMTU:
if (ifr->ifr_mtu < NIC_HW_MIN_FRS ||
ifr->ifr_mtu > NIC_HW_MAX_FRS) {
err = EINVAL;
} else {
NICVF_CORE_LOCK(nic);
err = nicvf_update_hw_max_frs(nic, ifr->ifr_mtu);
if (err == 0)
if_setmtu(ifp, ifr->ifr_mtu);
NICVF_CORE_UNLOCK(nic);
}
break;
case SIOCSIFFLAGS:
NICVF_CORE_LOCK(nic);
if (if_getflags(ifp) & IFF_UP) {
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
- flags = if_getflags(ifp) ^ nic->if_flags;
if ((nic->if_flags & if_getflags(ifp)) &
IFF_PROMISC) {
/* Change promiscuous mode */
#if 0
/* ARM64TODO */
nicvf_set_promiscous(nic);
#endif
}
if ((nic->if_flags ^ if_getflags(ifp)) &
IFF_ALLMULTI) {
/* Change multicasting settings */
#if 0
/* ARM64TODO */
nicvf_set_multicast(nic);
#endif
}
} else {
nicvf_if_init_locked(nic);
}
} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
nicvf_stop_locked(nic);
nic->if_flags = if_getflags(ifp);
NICVF_CORE_UNLOCK(nic);
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
#if 0
NICVF_CORE_LOCK(nic);
/* ARM64TODO */
nicvf_set_multicast(nic);
NICVF_CORE_UNLOCK(nic);
#endif
}
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
err = ifmedia_ioctl(ifp, ifr, &nic->if_media, cmd);
break;
case SIOCSIFCAP:
mask = if_getcapenable(ifp) ^ ifr->ifr_reqcap;
if (mask & IFCAP_VLAN_MTU) {
/* No work to do except acknowledge the change took. */
if_togglecapenable(ifp, IFCAP_VLAN_MTU);
}
if (mask & IFCAP_TXCSUM)
if_togglecapenable(ifp, IFCAP_TXCSUM);
if (mask & IFCAP_RXCSUM)
if_togglecapenable(ifp, IFCAP_RXCSUM);
if ((mask & IFCAP_TSO4) && nic->hw_tso)
if_togglecapenable(ifp, IFCAP_TSO4);
if (mask & IFCAP_LRO) {
/*
* Lock the driver for a moment to avoid
* mismatch in per-queue settings.
*/
NICVF_CORE_LOCK(nic);
if_togglecapenable(ifp, IFCAP_LRO);
if ((if_getdrvflags(nic->ifp) & IFF_DRV_RUNNING) != 0) {
/*
* Now disable LRO for subsequent packets.
* Atomicity of this change is not necessary
* as we don't need precise toggle of this
* feature for all threads processing the
* completion queue.
*/
for (rq_idx = 0;
rq_idx < nic->qs->rq_cnt; rq_idx++) {
rq = &nic->qs->rq[rq_idx];
rq->lro_enabled = !rq->lro_enabled;
}
}
NICVF_CORE_UNLOCK(nic);
}
break;
default:
err = ether_ioctl(ifp, cmd, data);
break;
}
return (err);
}
static void
nicvf_if_init_locked(struct nicvf *nic)
{
struct queue_set *qs = nic->qs;
struct ifnet *ifp;
int qidx;
int err;
caddr_t if_addr;
NICVF_CORE_LOCK_ASSERT(nic);
ifp = nic->ifp;
if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0)
nicvf_stop_locked(nic);
err = nicvf_enable_misc_interrupt(nic);
if (err != 0) {
if_printf(ifp, "Could not reenable Mbox interrupt\n");
return;
}
/* Get the latest MAC address */
if_addr = if_getlladdr(ifp);
/* Update MAC address if changed */
if (memcmp(nic->hwaddr, if_addr, ETHER_ADDR_LEN) != 0) {
memcpy(nic->hwaddr, if_addr, ETHER_ADDR_LEN);
nicvf_hw_set_mac_addr(nic, if_addr);
}
/* Initialize the queues */
err = nicvf_init_resources(nic);
if (err != 0)
goto error;
/* Make sure queue initialization is written */
wmb();
nicvf_reg_write(nic, NIC_VF_INT, ~0UL);
/* Enable Qset err interrupt */
nicvf_enable_intr(nic, NICVF_INTR_QS_ERR, 0);
/* Enable completion queue interrupt */
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_enable_intr(nic, NICVF_INTR_CQ, qidx);
/* Enable RBDR threshold interrupt */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_enable_intr(nic, NICVF_INTR_RBDR, qidx);
nic->drv_stats.txq_stop = 0;
nic->drv_stats.txq_wake = 0;
/* Activate network interface */
if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
/* Schedule callout to update stats */
callout_reset(&nic->stats_callout, hz, nicvf_tick_stats, nic);
return;
error:
/* Something went very wrong. Disable this ifnet for good */
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
}
static void
nicvf_if_init(void *if_softc)
{
struct nicvf *nic = if_softc;
NICVF_CORE_LOCK(nic);
nicvf_if_init_locked(nic);
NICVF_CORE_UNLOCK(nic);
}
static int
nicvf_if_transmit(struct ifnet *ifp, struct mbuf *mbuf)
{
struct nicvf *nic = if_getsoftc(ifp);
struct queue_set *qs = nic->qs;
struct snd_queue *sq;
struct mbuf *mtmp;
int qidx;
int err = 0;
if (__predict_false(qs == NULL)) {
panic("%s: missing queue set for %s", __func__,
device_get_nameunit(nic->dev));
}
/* Select queue */
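/* Hash-tagged mbufs are spread by flow id, the rest by the sending CPU, both modulo the number of send queues. */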
if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
qidx = mbuf->m_pkthdr.flowid % qs->sq_cnt;
else
qidx = curcpu % qs->sq_cnt;
sq = &qs->sq[qidx];
if (mbuf->m_next != NULL &&
(mbuf->m_pkthdr.csum_flags &
(CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)) != 0) {
if (M_WRITABLE(mbuf) == 0) {
mtmp = m_dup(mbuf, M_NOWAIT);
m_freem(mbuf);
if (mtmp == NULL)
return (ENOBUFS);
mbuf = mtmp;
}
}
err = drbr_enqueue(ifp, sq->br, mbuf);
if (((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) || !nic->link_up || (err != 0)) {
/*
* Try to enqueue packet to the ring buffer.
* If the driver is not active, link down or enqueue operation
* failed, return with the appropriate error code.
*/
return (err);
}
if (NICVF_TX_TRYLOCK(sq) != 0) {
err = nicvf_xmit_locked(sq);
NICVF_TX_UNLOCK(sq);
return (err);
} else
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
return (0);
}
static void
nicvf_if_qflush(struct ifnet *ifp)
{
struct nicvf *nic;
struct queue_set *qs;
struct snd_queue *sq;
struct mbuf *mbuf;
size_t idx;
nic = if_getsoftc(ifp);
qs = nic->qs;
for (idx = 0; idx < qs->sq_cnt; idx++) {
sq = &qs->sq[idx];
NICVF_TX_LOCK(sq);
while ((mbuf = buf_ring_dequeue_sc(sq->br)) != NULL)
m_freem(mbuf);
NICVF_TX_UNLOCK(sq);
}
if_qflush(ifp);
}
static uint64_t
nicvf_if_getcounter(struct ifnet *ifp, ift_counter cnt)
{
struct nicvf *nic;
struct nicvf_hw_stats *hw_stats;
struct nicvf_drv_stats *drv_stats;
nic = if_getsoftc(ifp);
hw_stats = &nic->hw_stats;
drv_stats = &nic->drv_stats;
switch (cnt) {
case IFCOUNTER_IPACKETS:
return (drv_stats->rx_frames_ok);
case IFCOUNTER_OPACKETS:
return (drv_stats->tx_frames_ok);
case IFCOUNTER_IBYTES:
return (hw_stats->rx_bytes);
case IFCOUNTER_OBYTES:
return (hw_stats->tx_bytes_ok);
case IFCOUNTER_IMCASTS:
return (hw_stats->rx_mcast_frames);
case IFCOUNTER_COLLISIONS:
return (0);
case IFCOUNTER_IQDROPS:
return (drv_stats->rx_drops);
case IFCOUNTER_OQDROPS:
return (drv_stats->tx_drops);
default:
return (if_get_counter_default(ifp, cnt));
}
}
static void
nicvf_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{
struct nicvf *nic = if_getsoftc(ifp);
NICVF_CORE_LOCK(nic);
ifmr->ifm_status = IFM_AVALID;
ifmr->ifm_active = IFM_ETHER;
if (nic->link_up) {
/* Device attached to working network */
ifmr->ifm_status |= IFM_ACTIVE;
}
switch (nic->speed) {
case SPEED_10:
ifmr->ifm_active |= IFM_10_T;
break;
case SPEED_100:
ifmr->ifm_active |= IFM_100_TX;
break;
case SPEED_1000:
ifmr->ifm_active |= IFM_1000_T;
break;
case SPEED_10000:
ifmr->ifm_active |= IFM_10G_SR;
break;
case SPEED_40000:
ifmr->ifm_active |= IFM_40G_CR4;
break;
default:
ifmr->ifm_active |= IFM_AUTO;
break;
}
if (nic->duplex)
ifmr->ifm_active |= IFM_FDX;
else
ifmr->ifm_active |= IFM_HDX;
NICVF_CORE_UNLOCK(nic);
}
static int
nicvf_media_change(struct ifnet *ifp __unused)
{
return (0);
}
/* Register read/write APIs */
void
nicvf_reg_write(struct nicvf *nic, bus_space_handle_t offset, uint64_t val)
{
bus_write_8(nic->reg_base, offset, val);
}
uint64_t
nicvf_reg_read(struct nicvf *nic, uint64_t offset)
{
return (bus_read_8(nic->reg_base, offset));
}
void
nicvf_queue_reg_write(struct nicvf *nic, bus_space_handle_t offset,
uint64_t qidx, uint64_t val)
{
bus_write_8(nic->reg_base, offset + (qidx << NIC_Q_NUM_SHIFT), val);
}
uint64_t
nicvf_queue_reg_read(struct nicvf *nic, bus_space_handle_t offset,
uint64_t qidx)
{
return (bus_read_8(nic->reg_base, offset + (qidx << NIC_Q_NUM_SHIFT)));
}
/* VF -> PF mailbox communication */
static void
nicvf_write_to_mbx(struct nicvf *nic, union nic_mbx *mbx)
{
uint64_t *msg = (uint64_t *)mbx;
nicvf_reg_write(nic, NIC_VF_PF_MAILBOX_0_1 + 0, msg[0]);
nicvf_reg_write(nic, NIC_VF_PF_MAILBOX_0_1 + 8, msg[1]);
}
int
nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
{
int timeout = NIC_MBOX_MSG_TIMEOUT * 10;
int sleep = 2;
NICVF_CORE_LOCK_ASSERT(nic);
nic->pf_acked = FALSE;
nic->pf_nacked = FALSE;
nicvf_write_to_mbx(nic, mbx);
/* Wait for the message to be acked, timeout 2sec */
while (!nic->pf_acked) {
if (nic->pf_nacked)
return (EINVAL);
DELAY(sleep * 1000);
if (nic->pf_acked)
break;
timeout -= sleep;
if (!timeout) {
device_printf(nic->dev,
"PF didn't ack to mbox msg %d from VF%d\n",
(mbx->msg.msg & 0xFF), nic->vf_id);
return (EBUSY);
}
}
return (0);
}
/*
* Checks if the VF is able to communicate with the PF
* and also gets the VNIC number this VF is associated with.
*/
static int
nicvf_check_pf_ready(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_READY;
if (nicvf_send_msg_to_pf(nic, &mbx)) {
device_printf(nic->dev,
"PF didn't respond to READY msg\n");
return (0);
}
return (1);
}
static void
nicvf_read_bgx_stats(struct nicvf *nic, struct bgx_stats_msg *bgx)
{
if (bgx->rx)
nic->bgx_stats.rx_stats[bgx->idx] = bgx->stats;
else
nic->bgx_stats.tx_stats[bgx->idx] = bgx->stats;
}
static void
nicvf_handle_mbx_intr(struct nicvf *nic)
{
union nic_mbx mbx = {};
uint64_t *mbx_data;
uint64_t mbx_addr;
int i;
mbx_addr = NIC_VF_PF_MAILBOX_0_1;
mbx_data = (uint64_t *)&mbx;
for (i = 0; i < NIC_PF_VF_MAILBOX_SIZE; i++) {
*mbx_data = nicvf_reg_read(nic, mbx_addr);
mbx_data++;
mbx_addr += sizeof(uint64_t);
}
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
nic->pf_acked = TRUE;
nic->vf_id = mbx.nic_cfg.vf_id & 0x7F;
nic->tns_mode = mbx.nic_cfg.tns_mode & 0x7F;
nic->node = mbx.nic_cfg.node_id;
memcpy(nic->hwaddr, mbx.nic_cfg.mac_addr, ETHER_ADDR_LEN);
nic->loopback_supported = mbx.nic_cfg.loopback_supported;
nic->link_up = FALSE;
nic->duplex = 0;
nic->speed = 0;
break;
case NIC_MBOX_MSG_ACK:
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_NACK:
nic->pf_nacked = TRUE;
break;
case NIC_MBOX_MSG_RSS_SIZE:
nic->rss_info.rss_size = mbx.rss_size.ind_tbl_size;
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_BGX_STATS:
nicvf_read_bgx_stats(nic, &mbx.bgx_stats);
nic->pf_acked = TRUE;
break;
case NIC_MBOX_MSG_BGX_LINK_CHANGE:
nic->pf_acked = TRUE;
nic->link_up = mbx.link_status.link_up;
nic->duplex = mbx.link_status.duplex;
nic->speed = mbx.link_status.speed;
if (nic->link_up) {
if_setbaudrate(nic->ifp, nic->speed * 1000000);
if_link_state_change(nic->ifp, LINK_STATE_UP);
} else {
if_setbaudrate(nic->ifp, 0);
if_link_state_change(nic->ifp, LINK_STATE_DOWN);
}
break;
default:
device_printf(nic->dev,
"Invalid message from PF, msg 0x%x\n", mbx.msg.msg);
break;
}
nicvf_clear_intr(nic, NICVF_INTR_MBOX, 0);
}
static int
nicvf_update_hw_max_frs(struct nicvf *nic, int mtu)
{
union nic_mbx mbx = {};
mbx.frs.msg = NIC_MBOX_MSG_SET_MAX_FRS;
mbx.frs.max_frs = mtu;
mbx.frs.vf_id = nic->vf_id;
return (nicvf_send_msg_to_pf(nic, &mbx));
}
static int
nicvf_hw_set_mac_addr(struct nicvf *nic, uint8_t *hwaddr)
{
union nic_mbx mbx = {};
mbx.mac.msg = NIC_MBOX_MSG_SET_MAC;
mbx.mac.vf_id = nic->vf_id;
memcpy(mbx.mac.mac_addr, hwaddr, ETHER_ADDR_LEN);
return (nicvf_send_msg_to_pf(nic, &mbx));
}
static void
nicvf_config_cpi(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.cpi_cfg.msg = NIC_MBOX_MSG_CPI_CFG;
mbx.cpi_cfg.vf_id = nic->vf_id;
mbx.cpi_cfg.cpi_alg = nic->cpi_alg;
mbx.cpi_cfg.rq_cnt = nic->qs->rq_cnt;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_get_rss_size(struct nicvf *nic)
{
union nic_mbx mbx = {};
mbx.rss_size.msg = NIC_MBOX_MSG_RSS_SIZE;
mbx.rss_size.vf_id = nic->vf_id;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_config_rss(struct nicvf *nic)
{
union nic_mbx mbx = {};
struct nicvf_rss_info *rss;
int ind_tbl_len;
int i, nextq;
rss = &nic->rss_info;
ind_tbl_len = rss->rss_size;
nextq = 0;
mbx.rss_cfg.vf_id = nic->vf_id;
mbx.rss_cfg.hash_bits = rss->hash_bits;
while (ind_tbl_len != 0) {
mbx.rss_cfg.tbl_offset = nextq;
mbx.rss_cfg.tbl_len = MIN(ind_tbl_len,
RSS_IND_TBL_LEN_PER_MBX_MSG);
mbx.rss_cfg.msg = mbx.rss_cfg.tbl_offset ?
NIC_MBOX_MSG_RSS_CFG_CONT : NIC_MBOX_MSG_RSS_CFG;
for (i = 0; i < mbx.rss_cfg.tbl_len; i++)
mbx.rss_cfg.ind_tbl[i] = rss->ind_tbl[nextq++];
nicvf_send_msg_to_pf(nic, &mbx);
ind_tbl_len -= mbx.rss_cfg.tbl_len;
}
}
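/*
* Illustrative sketch only (not part of the driver): how the loop above
* walks the RSS indirection table in mailbox-sized chunks.  The chunk
* size of 8 used here is just an assumed stand-in for
* RSS_IND_TBL_LEN_PER_MBX_MSG; the real value comes from the headers.
*/
static __unused void
nicvf_rss_chunks_example(int ind_tbl_len)
{
int chunk = 8;	/* assumed stand-in for RSS_IND_TBL_LEN_PER_MBX_MSG */
int offset;
for (offset = 0; offset < ind_tbl_len; offset += chunk) {
/* First chunk maps to RSS_CFG, later ones to RSS_CFG_CONT */
printf("offset %d, len %d\n", offset,
MIN(chunk, ind_tbl_len - offset));
}
}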
static void
nicvf_set_rss_key(struct nicvf *nic)
{
struct nicvf_rss_info *rss;
uint64_t key_addr;
int idx;
rss = &nic->rss_info;
key_addr = NIC_VNIC_RSS_KEY_0_4;
for (idx = 0; idx < RSS_HASH_KEY_SIZE; idx++) {
nicvf_reg_write(nic, key_addr, rss->key[idx]);
key_addr += sizeof(uint64_t);
}
}
static int
nicvf_rss_init(struct nicvf *nic)
{
struct nicvf_rss_info *rss;
int idx;
nicvf_get_rss_size(nic);
rss = &nic->rss_info;
if (nic->cpi_alg != CPI_ALG_NONE) {
rss->enable = FALSE;
rss->hash_bits = 0;
return (ENXIO);
}
rss->enable = TRUE;
/* Using the HW reset value for now */
rss->key[0] = 0xFEED0BADFEED0BADUL;
rss->key[1] = 0xFEED0BADFEED0BADUL;
rss->key[2] = 0xFEED0BADFEED0BADUL;
rss->key[3] = 0xFEED0BADFEED0BADUL;
rss->key[4] = 0xFEED0BADFEED0BADUL;
nicvf_set_rss_key(nic);
rss->cfg = RSS_IP_HASH_ENA | RSS_TCP_HASH_ENA | RSS_UDP_HASH_ENA;
nicvf_reg_write(nic, NIC_VNIC_RSS_CFG, rss->cfg);
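/* E.g. with rss_size == 128, fls(128) - 1 == 7 hash bits index the whole table. */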
rss->hash_bits = fls(rss->rss_size) - 1;
for (idx = 0; idx < rss->rss_size; idx++)
rss->ind_tbl[idx] = idx % nic->rx_queues;
nicvf_config_rss(nic);
return (0);
}
static int
nicvf_init_resources(struct nicvf *nic)
{
int err;
union nic_mbx mbx = {};
mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE;
/* Enable Qset */
nicvf_qset_config(nic, TRUE);
/* Initialize queues and HW for data transfer */
err = nicvf_config_data_transfer(nic, TRUE);
if (err) {
device_printf(nic->dev,
"Failed to alloc/config VF's QSet resources\n");
return (err);
}
/* Send VF config done msg to PF */
nicvf_write_to_mbx(nic, &mbx);
return (0);
}
static void
nicvf_misc_intr_handler(void *arg)
{
struct nicvf *nic = (struct nicvf *)arg;
uint64_t intr;
intr = nicvf_reg_read(nic, NIC_VF_INT);
/* Check for spurious interrupt */
if (!(intr & NICVF_INTR_MBOX_MASK))
return;
nicvf_handle_mbx_intr(nic);
}
static int
nicvf_intr_handler(void *arg)
{
struct nicvf *nic;
struct cmp_queue *cq;
int qidx;
cq = (struct cmp_queue *)arg;
nic = cq->nic;
qidx = cq->idx;
/* Disable interrupts */
nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
taskqueue_enqueue(cq->cmp_taskq, &cq->cmp_task);
/* Clear interrupt */
nicvf_clear_intr(nic, NICVF_INTR_CQ, qidx);
return (FILTER_HANDLED);
}
static void
nicvf_rbdr_intr_handler(void *arg)
{
struct nicvf *nic;
struct queue_set *qs;
struct rbdr *rbdr;
int qidx;
nic = (struct nicvf *)arg;
/* Disable RBDR interrupt and schedule softirq */
for (qidx = 0; qidx < nic->qs->rbdr_cnt; qidx++) {
if (!nicvf_is_intr_enabled(nic, NICVF_INTR_RBDR, qidx))
continue;
nicvf_disable_intr(nic, NICVF_INTR_RBDR, qidx);
qs = nic->qs;
rbdr = &qs->rbdr[qidx];
taskqueue_enqueue(rbdr->rbdr_taskq, &rbdr->rbdr_task_nowait);
/* Clear interrupt */
nicvf_clear_intr(nic, NICVF_INTR_RBDR, qidx);
}
}
static void
nicvf_qs_err_intr_handler(void *arg)
{
struct nicvf *nic = (struct nicvf *)arg;
struct queue_set *qs = nic->qs;
/* Disable Qset err interrupt and schedule softirq */
nicvf_disable_intr(nic, NICVF_INTR_QS_ERR, 0);
taskqueue_enqueue(qs->qs_err_taskq, &qs->qs_err_task);
nicvf_clear_intr(nic, NICVF_INTR_QS_ERR, 0);
}
static int
nicvf_enable_msix(struct nicvf *nic)
{
struct pci_devinfo *dinfo;
int rid, count;
int ret;
dinfo = device_get_ivars(nic->dev);
rid = dinfo->cfg.msix.msix_table_bar;
nic->msix_table_res =
bus_alloc_resource_any(nic->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (nic->msix_table_res == NULL) {
device_printf(nic->dev,
"Could not allocate memory for MSI-X table\n");
return (ENXIO);
}
count = nic->num_vec = NIC_VF_MSIX_VECTORS;
ret = pci_alloc_msix(nic->dev, &count);
if ((ret != 0) || (count != nic->num_vec)) {
device_printf(nic->dev,
"Request for #%d msix vectors failed, error: %d\n",
nic->num_vec, ret);
return (ret != 0 ? ret : ENXIO);
}
nic->msix_enabled = 1;
return (0);
}
static void
nicvf_disable_msix(struct nicvf *nic)
{
if (nic->msix_enabled) {
pci_release_msi(nic->dev);
nic->msix_enabled = 0;
nic->num_vec = 0;
}
}
static void
nicvf_release_all_interrupts(struct nicvf *nic)
{
struct resource *res;
int irq;
int err;
/* Free registered interrupts */
for (irq = 0; irq < nic->num_vec; irq++) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown interrupt first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown interrupt %d", irq));
nic->msix_entries[irq].handle = NULL;
}
bus_release_resource(nic->dev, SYS_RES_IRQ,
rman_get_rid(res), nic->msix_entries[irq].irq_res);
nic->msix_entries[irq].irq_res = NULL;
}
/* Disable MSI-X */
nicvf_disable_msix(nic);
}
/*
* Initialize MSI-X vectors and register MISC interrupt.
* Send READY message to PF to check if it is alive.
*/
static int
nicvf_allocate_misc_interrupt(struct nicvf *nic)
{
struct resource *res;
int irq, rid;
int ret = 0;
/* Return if mailbox interrupt is already registered */
if (nic->msix_enabled)
return (0);
/* Enable MSI-X */
if (nicvf_enable_msix(nic) != 0)
return (ENXIO);
irq = NICVF_INTR_ID_MISC;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate Mbox interrupt for VF%d\n",
device_get_unit(nic->dev));
return (ENXIO);
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_MISC), NULL, nicvf_misc_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0) {
res = nic->msix_entries[irq].irq_res;
bus_release_resource(nic->dev, SYS_RES_IRQ,
rman_get_rid(res), res);
nic->msix_entries[irq].irq_res = NULL;
return (ret);
}
return (0);
}
static int
nicvf_enable_misc_interrupt(struct nicvf *nic)
{
/* Enable mailbox interrupt */
nicvf_enable_intr(nic, NICVF_INTR_MBOX, 0);
/* Check if VF is able to communicate with PF */
if (!nicvf_check_pf_ready(nic)) {
nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
return (ENXIO);
}
return (0);
}
static void
nicvf_release_net_interrupts(struct nicvf *nic)
{
struct resource *res;
int irq;
int err;
for_each_cq_irq(irq) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown CQ interrupt %d",
(irq - NICVF_INTR_ID_CQ)));
if (err != 0)
continue;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
for_each_rbdr_irq(irq) {
res = nic->msix_entries[irq].irq_res;
if (res == NULL)
continue;
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown RDBR interrupt %d",
(irq - NICVF_INTR_ID_RBDR)));
if (err != 0)
continue;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
irq = NICVF_INTR_ID_QS_ERR;
res = nic->msix_entries[irq].irq_res;
if (res != NULL) {
/* Teardown active interrupts first */
if (nic->msix_entries[irq].handle != NULL) {
err = bus_teardown_intr(nic->dev,
nic->msix_entries[irq].irq_res,
nic->msix_entries[irq].handle);
KASSERT(err == 0,
("ERROR: Unable to teardown QS Error interrupt %d",
irq));
if (err != 0)
return;
}
/* Release resource */
bus_release_resource(nic->dev, SYS_RES_IRQ, rman_get_rid(res),
res);
nic->msix_entries[irq].irq_res = NULL;
}
}
static int
nicvf_allocate_net_interrupts(struct nicvf *nic)
{
u_int cpuid;
int irq, rid;
int qidx;
int ret = 0;
/* MSI-X must be configured by now */
if (!nic->msix_enabled) {
device_printf(nic->dev, "Cannot alloacte queue interrups. "
"MSI-X interrupts disabled.\n");
return (ENXIO);
}
/* Register CQ interrupts */
for_each_cq_irq(irq) {
if (irq >= (NICVF_INTR_ID_CQ + nic->qs->cq_cnt))
break;
qidx = irq - NICVF_INTR_ID_CQ;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate CQ interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_CQ), device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), nicvf_intr_handler,
NULL, &nic->qs->cq[qidx], &nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup CQ interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_CQ), device_get_unit(nic->dev));
goto error;
}
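/* Spread CQ work across CPUs: device unit and queue index pick the target CPU, wrapped modulo mp_ncpus below. */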
cpuid = (device_get_unit(nic->dev) * CMP_QUEUE_CNT) + qidx;
cpuid %= mp_ncpus;
/*
* Save CPU ID for later use when system-wide RSS is enabled.
* It will be used to pin the CQ task to the same CPU that took
* the interrupt.
*/
nic->qs->cq[qidx].cmp_cpuid = cpuid;
if (bootverbose) {
device_printf(nic->dev, "bind CQ%d IRQ to CPU%d\n",
qidx, cpuid);
}
/* Bind interrupts to the given CPU */
bus_bind_intr(nic->dev, nic->msix_entries[irq].irq_res, cpuid);
}
/* Register RBDR interrupt */
for_each_rbdr_irq(irq) {
if (irq >= (NICVF_INTR_ID_RBDR + nic->qs->rbdr_cnt))
break;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate RBDR interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_RBDR),
device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), NULL,
nicvf_rbdr_intr_handler, nic,
&nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup RBDR interrupt %d for VF%d\n",
(irq - NICVF_INTR_ID_RBDR),
device_get_unit(nic->dev));
goto error;
}
}
/* Register QS error interrupt */
irq = NICVF_INTR_ID_QS_ERR;
rid = irq + 1;
nic->msix_entries[irq].irq_res = bus_alloc_resource_any(nic->dev,
SYS_RES_IRQ, &rid, (RF_SHAREABLE | RF_ACTIVE));
if (nic->msix_entries[irq].irq_res == NULL) {
device_printf(nic->dev,
"Could not allocate QS Error interrupt for VF%d\n",
device_get_unit(nic->dev));
ret = ENXIO;
goto error;
}
ret = bus_setup_intr(nic->dev, nic->msix_entries[irq].irq_res,
(INTR_MPSAFE | INTR_TYPE_NET), NULL, nicvf_qs_err_intr_handler,
nic, &nic->msix_entries[irq].handle);
if (ret != 0) {
device_printf(nic->dev,
"Could not setup QS Error interrupt for VF%d\n",
device_get_unit(nic->dev));
goto error;
}
return (0);
error:
nicvf_release_net_interrupts(nic);
return (ret);
}
static int
nicvf_stop_locked(struct nicvf *nic)
{
struct ifnet *ifp;
int qidx;
struct queue_set *qs = nic->qs;
union nic_mbx mbx = {};
NICVF_CORE_LOCK_ASSERT(nic);
/* Stop callout. Can block here since holding SX lock */
callout_drain(&nic->stats_callout);
ifp = nic->ifp;
mbx.msg.msg = NIC_MBOX_MSG_SHUTDOWN;
nicvf_send_msg_to_pf(nic, &mbx);
/* Disable RBDR & QS error interrupts */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++) {
nicvf_disable_intr(nic, NICVF_INTR_RBDR, qidx);
nicvf_clear_intr(nic, NICVF_INTR_RBDR, qidx);
}
nicvf_disable_intr(nic, NICVF_INTR_QS_ERR, 0);
nicvf_clear_intr(nic, NICVF_INTR_QS_ERR, 0);
/* Deactivate network interface */
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
/* Free resources */
nicvf_config_data_transfer(nic, FALSE);
/* Disable HW Qset */
nicvf_qset_config(nic, FALSE);
/* disable mailbox interrupt */
nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
return (0);
}
static void
nicvf_update_stats(struct nicvf *nic)
{
int qidx;
struct nicvf_hw_stats *stats = &nic->hw_stats;
struct nicvf_drv_stats *drv_stats = &nic->drv_stats;
struct queue_set *qs = nic->qs;
#define GET_RX_STATS(reg) \
nicvf_reg_read(nic, NIC_VNIC_RX_STAT_0_13 | ((reg) << 3))
#define GET_TX_STATS(reg) \
nicvf_reg_read(nic, NIC_VNIC_TX_STAT_0_4 | ((reg) << 3))
stats->rx_bytes = GET_RX_STATS(RX_OCTS);
stats->rx_ucast_frames = GET_RX_STATS(RX_UCAST);
stats->rx_bcast_frames = GET_RX_STATS(RX_BCAST);
stats->rx_mcast_frames = GET_RX_STATS(RX_MCAST);
stats->rx_fcs_errors = GET_RX_STATS(RX_FCS);
stats->rx_l2_errors = GET_RX_STATS(RX_L2ERR);
stats->rx_drop_red = GET_RX_STATS(RX_RED);
stats->rx_drop_red_bytes = GET_RX_STATS(RX_RED_OCTS);
stats->rx_drop_overrun = GET_RX_STATS(RX_ORUN);
stats->rx_drop_overrun_bytes = GET_RX_STATS(RX_ORUN_OCTS);
stats->rx_drop_bcast = GET_RX_STATS(RX_DRP_BCAST);
stats->rx_drop_mcast = GET_RX_STATS(RX_DRP_MCAST);
stats->rx_drop_l3_bcast = GET_RX_STATS(RX_DRP_L3BCAST);
stats->rx_drop_l3_mcast = GET_RX_STATS(RX_DRP_L3MCAST);
stats->tx_bytes_ok = GET_TX_STATS(TX_OCTS);
stats->tx_ucast_frames_ok = GET_TX_STATS(TX_UCAST);
stats->tx_bcast_frames_ok = GET_TX_STATS(TX_BCAST);
stats->tx_mcast_frames_ok = GET_TX_STATS(TX_MCAST);
stats->tx_drops = GET_TX_STATS(TX_DROP);
drv_stats->tx_frames_ok = stats->tx_ucast_frames_ok +
stats->tx_bcast_frames_ok + stats->tx_mcast_frames_ok;
drv_stats->rx_drops = stats->rx_drop_red + stats->rx_drop_overrun;
drv_stats->tx_drops = stats->tx_drops;
/* Update RQ and SQ stats */
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_update_rq_stats(nic, qidx);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_update_sq_stats(nic, qidx);
}
static void
nicvf_tick_stats(void *arg)
{
struct nicvf *nic;
nic = (struct nicvf *)arg;
/* Read the statistics */
nicvf_update_stats(nic);
callout_reset(&nic->stats_callout, hz, nicvf_tick_stats, nic);
}
Index: head/sys/dev/vnic/nicvf_queues.c
===================================================================
--- head/sys/dev/vnic/nicvf_queues.c (revision 327172)
+++ head/sys/dev/vnic/nicvf_queues.c (revision 327173)
@@ -1,2367 +1,2366 @@
/*
* Copyright (C) 2015 Cavium Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/bitstring.h>
#include <sys/buf_ring.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/pciio.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/stdatomic.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <machine/vmparam.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/ifq.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/sctp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>
#include <netinet6/ip6_var.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include "thunder_bgx.h"
#include "nic_reg.h"
#include "nic.h"
#include "q_struct.h"
#include "nicvf_queues.h"
#define DEBUG
#undef DEBUG
#ifdef DEBUG
#define dprintf(dev, fmt, ...) device_printf(dev, fmt, ##__VA_ARGS__)
#else
#define dprintf(dev, fmt, ...)
#endif
MALLOC_DECLARE(M_NICVF);
static void nicvf_free_snd_queue(struct nicvf *, struct snd_queue *);
static struct mbuf * nicvf_get_rcv_mbuf(struct nicvf *, struct cqe_rx_t *);
static void nicvf_sq_disable(struct nicvf *, int);
static void nicvf_sq_enable(struct nicvf *, struct snd_queue *, int);
static void nicvf_put_sq_desc(struct snd_queue *, int);
static void nicvf_cmp_queue_config(struct nicvf *, struct queue_set *, int,
boolean_t);
static void nicvf_sq_free_used_descs(struct nicvf *, struct snd_queue *, int);
static int nicvf_tx_mbuf_locked(struct snd_queue *, struct mbuf **);
static void nicvf_rbdr_task(void *, int);
static void nicvf_rbdr_task_nowait(void *, int);
struct rbuf_info {
bus_dma_tag_t dmat;
bus_dmamap_t dmap;
struct mbuf * mbuf;
};
#define GET_RBUF_INFO(x) ((struct rbuf_info *)((x) - NICVF_RCV_BUF_ALIGN_BYTES))
/* Poll a register for a specific value */
static int
nicvf_poll_reg(struct nicvf *nic, int qidx, uint64_t reg, int bit_pos,
int bits, int val)
{
uint64_t bit_mask;
uint64_t reg_val;
int timeout = 10;
bit_mask = (1UL << bits) - 1;
bit_mask = (bit_mask << bit_pos);
while (timeout) {
reg_val = nicvf_queue_reg_read(nic, reg, qidx);
if (((reg_val & bit_mask) >> bit_pos) == val)
return (0);
DELAY(1000);
timeout--;
}
device_printf(nic->dev, "Poll on reg 0x%lx failed\n", reg);
return (ETIMEDOUT);
}
/* Callback for bus_dmamap_load() */
static void
nicvf_dmamap_q_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
bus_addr_t *paddr;
KASSERT(nseg == 1, ("wrong number of segments, should be 1"));
paddr = arg;
*paddr = segs->ds_addr;
}
/* Allocate memory for a queue's descriptors */
static int
nicvf_alloc_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem,
int q_len, int desc_size, int align_bytes)
{
int err, err_dmat;
/* Create DMA tag first */
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
align_bytes, /* alignment */
0, /* boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
(q_len * desc_size), /* maxsize */
1, /* nsegments */
(q_len * desc_size), /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&dmem->dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for descriptors ring\n");
return (err);
}
/* Allocate segment of continuous DMA safe memory */
err = bus_dmamem_alloc(
dmem->dmat, /* DMA tag */
&dmem->base, /* virtual address */
(BUS_DMA_NOWAIT | BUS_DMA_ZERO), /* flags */
&dmem->dmap); /* DMA map */
if (err != 0) {
device_printf(nic->dev, "Failed to allocate DMA safe memory for"
"descriptors ring\n");
goto dmamem_fail;
}
err = bus_dmamap_load(
dmem->dmat,
dmem->dmap,
dmem->base,
(q_len * desc_size), /* allocation size */
nicvf_dmamap_q_cb, /* map to DMA address cb. */
&dmem->phys_base, /* physical address */
BUS_DMA_NOWAIT);
if (err != 0) {
device_printf(nic->dev,
"Cannot load DMA map of descriptors ring\n");
goto dmamap_fail;
}
dmem->q_len = q_len;
dmem->size = (desc_size * q_len);
return (0);
dmamap_fail:
bus_dmamem_free(dmem->dmat, dmem->base, dmem->dmap);
dmem->phys_base = 0;
dmamem_fail:
err_dmat = bus_dma_tag_destroy(dmem->dmat);
dmem->base = NULL;
KASSERT(err_dmat == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
return (err);
}
/* Free queue's descriptor memory */
static void
nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem)
{
int err;
if ((dmem == NULL) || (dmem->base == NULL))
return;
/* Unload a map */
bus_dmamap_sync(dmem->dmat, dmem->dmap, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(dmem->dmat, dmem->dmap);
/* Free DMA memory */
bus_dmamem_free(dmem->dmat, dmem->base, dmem->dmap);
/* Destroy DMA tag */
err = bus_dma_tag_destroy(dmem->dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
dmem->phys_base = 0;
dmem->base = NULL;
}
/*
* Allocate buffer for packet reception
* The HW returns the memory address where the packet was DMA'ed, not a
* pointer into the RBDR ring, so save the buffer address at the start of
* the fragment and align the start address to a cache-line boundary.
*/
static __inline int
nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr,
bus_dmamap_t dmap, int mflags, uint32_t buf_len, bus_addr_t *rbuf)
{
struct mbuf *mbuf;
struct rbuf_info *rinfo;
bus_dma_segment_t segs[1];
int nsegs;
int err;
mbuf = m_getjcl(mflags, MT_DATA, M_PKTHDR, MCLBYTES);
if (mbuf == NULL)
return (ENOMEM);
/*
* The length is equal to the actual length plus one 128-byte line
* used as room for the rbuf_info structure.
*/
mbuf->m_len = mbuf->m_pkthdr.len = buf_len;
err = bus_dmamap_load_mbuf_sg(rbdr->rbdr_buff_dmat, dmap, mbuf, segs,
&nsegs, BUS_DMA_NOWAIT);
if (err != 0) {
device_printf(nic->dev,
"Failed to map mbuf into DMA visible memory, err: %d\n",
err);
m_freem(mbuf);
bus_dmamap_destroy(rbdr->rbdr_buff_dmat, dmap);
return (err);
}
if (nsegs != 1)
panic("Unexpected number of DMA segments for RB: %d", nsegs);
/*
* Now use the room for rbuf_info structure
* and adjust mbuf data and length.
*/
rinfo = (struct rbuf_info *)mbuf->m_data;
m_adj(mbuf, NICVF_RCV_BUF_ALIGN_BYTES);
rinfo->dmat = rbdr->rbdr_buff_dmat;
rinfo->dmap = dmap;
rinfo->mbuf = mbuf;
*rbuf = segs[0].ds_addr + NICVF_RCV_BUF_ALIGN_BYTES;
return (0);
}
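/*
* Resulting layout (for illustration): the first NICVF_RCV_BUF_ALIGN_BYTES
* of the mbuf hold struct rbuf_info; *rbuf, the address handed to the HW,
* points just past it at the packet data.
*/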
/* Retrieve mbuf for received packet */
static struct mbuf *
nicvf_rb_ptr_to_mbuf(struct nicvf *nic, bus_addr_t rb_ptr)
{
struct mbuf *mbuf;
struct rbuf_info *rinfo;
/* Get buffer start address and alignment offset */
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(rb_ptr));
/* Now retrieve mbuf to give to stack */
mbuf = rinfo->mbuf;
if (__predict_false(mbuf == NULL)) {
panic("%s: Received packet fragment with NULL mbuf",
device_get_nameunit(nic->dev));
}
/*
* Clear the mbuf in the descriptor to indicate
* that this slot is processed and free to use.
*/
rinfo->mbuf = NULL;
bus_dmamap_sync(rinfo->dmat, rinfo->dmap, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(rinfo->dmat, rinfo->dmap);
return (mbuf);
}
/* Allocate RBDR ring and populate receive buffers */
static int
nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr, int ring_len,
int buf_size, int qidx)
{
bus_dmamap_t dmap;
bus_addr_t rbuf;
struct rbdr_entry_t *desc;
int idx;
int err;
/* Allocate rbdr descriptors ring */
err = nicvf_alloc_q_desc_mem(nic, &rbdr->dmem, ring_len,
sizeof(struct rbdr_entry_t), NICVF_RCV_BUF_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Failed to create RBDR descriptors ring\n");
return (err);
}
rbdr->desc = rbdr->dmem.base;
/*
* Buffer size has to be in multiples of 128 bytes.
* Make room for metadata the size of one cache line (128 bytes).
*/
rbdr->dma_size = buf_size - NICVF_RCV_BUF_ALIGN_BYTES;
rbdr->enable = TRUE;
rbdr->thresh = RBDR_THRESH;
rbdr->nic = nic;
rbdr->idx = qidx;
/*
* Create DMA tag for Rx buffers.
* Each map created using this tag is intended to store Rx payload for
* one fragment and one header structure containing rbuf_info (thus
* an additional 128-byte line, since an RB must be a multiple of the
* 128-byte cache line).
*/
if (buf_size > MCLBYTES) {
device_printf(nic->dev,
"Buffer size to large for mbuf cluster\n");
return (EINVAL);
}
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
NICVF_RCV_BUF_ALIGN_BYTES, /* alignment */
0, /* boundary */
DMAP_MAX_PHYSADDR, /* lowaddr */
DMAP_MIN_PHYSADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
roundup2(buf_size, MCLBYTES), /* maxsize */
1, /* nsegments */
roundup2(buf_size, MCLBYTES), /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&rbdr->rbdr_buff_dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for RBDR buffers\n");
return (err);
}
rbdr->rbdr_buff_dmaps = malloc(sizeof(*rbdr->rbdr_buff_dmaps) *
ring_len, M_NICVF, (M_WAITOK | M_ZERO));
for (idx = 0; idx < ring_len; idx++) {
err = bus_dmamap_create(rbdr->rbdr_buff_dmat, 0, &dmap);
if (err != 0) {
device_printf(nic->dev,
"Failed to create DMA map for RB\n");
return (err);
}
rbdr->rbdr_buff_dmaps[idx] = dmap;
err = nicvf_alloc_rcv_buffer(nic, rbdr, dmap, M_WAITOK,
DMA_BUFFER_LEN, &rbuf);
if (err != 0)
return (err);
desc = GET_RBDR_DESC(rbdr, idx);
desc->buf_addr = (rbuf >> NICVF_RCV_BUF_ALIGN);
}
/* Allocate taskqueue */
TASK_INIT(&rbdr->rbdr_task, 0, nicvf_rbdr_task, rbdr);
TASK_INIT(&rbdr->rbdr_task_nowait, 0, nicvf_rbdr_task_nowait, rbdr);
rbdr->rbdr_taskq = taskqueue_create_fast("nicvf_rbdr_taskq", M_WAITOK,
taskqueue_thread_enqueue, &rbdr->rbdr_taskq);
taskqueue_start_threads(&rbdr->rbdr_taskq, 1, PI_NET, "%s: rbdr_taskq",
device_get_nameunit(nic->dev));
return (0);
}
/* Free RBDR ring and its receive buffers */
static void
nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr)
{
struct mbuf *mbuf;
struct queue_set *qs;
struct rbdr_entry_t *desc;
struct rbuf_info *rinfo;
bus_addr_t buf_addr;
int head, tail, idx;
int err;
qs = nic->qs;
if ((qs == NULL) || (rbdr == NULL))
return;
rbdr->enable = FALSE;
if (rbdr->rbdr_taskq != NULL) {
/* Remove tasks */
while (taskqueue_cancel(rbdr->rbdr_taskq,
&rbdr->rbdr_task_nowait, NULL) != 0) {
/* Finish the nowait task first */
taskqueue_drain(rbdr->rbdr_taskq,
&rbdr->rbdr_task_nowait);
}
taskqueue_free(rbdr->rbdr_taskq);
rbdr->rbdr_taskq = NULL;
while (taskqueue_cancel(taskqueue_thread,
&rbdr->rbdr_task, NULL) != 0) {
/* Now finish the sleepable task */
taskqueue_drain(taskqueue_thread, &rbdr->rbdr_task);
}
}
/*
* Free all of the memory under the RB descriptors.
* There are assumptions here:
* 1. Corresponding RBDR is disabled
* - it is safe to operate using head and tail indexes
* 2. All buffers that were received are properly freed by
* the receive handler
* - the DMA map and mbuf need to be released only for
* descriptors that are still unused
*/
if (rbdr->rbdr_buff_dmat != NULL) {
head = rbdr->head;
tail = rbdr->tail;
while (head != tail) {
desc = GET_RBDR_DESC(rbdr, head);
buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN;
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(buf_addr));
bus_dmamap_unload(rbdr->rbdr_buff_dmat, rinfo->dmap);
mbuf = rinfo->mbuf;
/* This will destroy everything including rinfo! */
m_freem(mbuf);
head++;
head &= (rbdr->dmem.q_len - 1);
}
/* Free tail descriptor */
desc = GET_RBDR_DESC(rbdr, tail);
buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN;
rinfo = GET_RBUF_INFO(PHYS_TO_DMAP(buf_addr));
bus_dmamap_unload(rbdr->rbdr_buff_dmat, rinfo->dmap);
mbuf = rinfo->mbuf;
/* This will destroy everything including rinfo! */
m_freem(mbuf);
/* Destroy DMA maps */
for (idx = 0; idx < qs->rbdr_len; idx++) {
if (rbdr->rbdr_buff_dmaps[idx] == NULL)
continue;
err = bus_dmamap_destroy(rbdr->rbdr_buff_dmat,
rbdr->rbdr_buff_dmaps[idx]);
KASSERT(err == 0,
("%s: Could not destroy DMA map for RB, desc: %d",
__func__, idx));
rbdr->rbdr_buff_dmaps[idx] = NULL;
}
/* Now destroy the tag */
err = bus_dma_tag_destroy(rbdr->rbdr_buff_dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
rbdr->head = 0;
rbdr->tail = 0;
}
/* Free RBDR ring */
nicvf_free_q_desc_mem(nic, &rbdr->dmem);
}
/*
* Refill receive buffer descriptors with new buffers.
*/
static int
nicvf_refill_rbdr(struct rbdr *rbdr, int mflags)
{
struct nicvf *nic;
struct queue_set *qs;
int rbdr_idx;
int tail, qcount;
int refill_rb_cnt;
struct rbdr_entry_t *desc;
bus_dmamap_t dmap;
bus_addr_t rbuf;
boolean_t rb_alloc_fail;
int new_rb;
rb_alloc_fail = TRUE;
new_rb = 0;
nic = rbdr->nic;
qs = nic->qs;
rbdr_idx = rbdr->idx;
/* Check if it's enabled */
if (!rbdr->enable)
return (0);
/* Get no of desc's to be refilled */
qcount = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_STATUS0, rbdr_idx);
qcount &= 0x7FFFF;
/* The doorbell can be rung with at most ring size minus 1 */
if (qcount >= (qs->rbdr_len - 1)) {
rb_alloc_fail = FALSE;
goto out;
} else
refill_rb_cnt = qs->rbdr_len - qcount - 1;
/* Start filling descs from tail */
tail = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_TAIL, rbdr_idx) >> 3;
while (refill_rb_cnt) {
tail++;
tail &= (rbdr->dmem.q_len - 1);
dmap = rbdr->rbdr_buff_dmaps[tail];
if (nicvf_alloc_rcv_buffer(nic, rbdr, dmap, mflags,
DMA_BUFFER_LEN, &rbuf)) {
/* Buffer allocation failed. Stop refilling. */
break;
}
desc = GET_RBDR_DESC(rbdr, tail);
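/*
 * The descriptor stores the buffer address shifted right by
 * NICVF_RCV_BUF_ALIGN; the free path shifts it back to recover
 * the physical address.
 */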
desc->buf_addr = (rbuf >> NICVF_RCV_BUF_ALIGN);
refill_rb_cnt--;
new_rb++;
}
/* make sure all memory stores are done before ringing doorbell */
wmb();
/* Check if buffer allocation failed */
if (refill_rb_cnt == 0)
rb_alloc_fail = FALSE;
/* Notify HW */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_DOOR,
rbdr_idx, new_rb);
out:
if (!rb_alloc_fail) {
/*
* Re-enable RBDR interrupts only
* if buffer allocation is success.
*/
nicvf_enable_intr(nic, NICVF_INTR_RBDR, rbdr_idx);
return (0);
}
return (ENOMEM);
}
/* Refill RBs even if sleep is needed to reclaim memory */
static void
nicvf_rbdr_task(void *arg, int pending)
{
struct rbdr *rbdr;
int err;
rbdr = (struct rbdr *)arg;
err = nicvf_refill_rbdr(rbdr, M_WAITOK);
if (__predict_false(err != 0)) {
panic("%s: Failed to refill RBs even when sleep enabled",
__func__);
}
}
/* Refill RBs as soon as possible without waiting */
static void
nicvf_rbdr_task_nowait(void *arg, int pending)
{
struct rbdr *rbdr;
int err;
rbdr = (struct rbdr *)arg;
err = nicvf_refill_rbdr(rbdr, M_NOWAIT);
if (err != 0) {
/*
* Schedule another, sleepable kernel thread
* that will for sure refill the buffers.
*/
taskqueue_enqueue(taskqueue_thread, &rbdr->rbdr_task);
}
}
static int
nicvf_rcv_pkt_handler(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_rx_t *cqe_rx, int cqe_type)
{
struct mbuf *mbuf;
struct rcv_queue *rq;
int rq_idx;
int err = 0;
rq_idx = cqe_rx->rq_idx;
rq = &nic->qs->rq[rq_idx];
/* Check for errors */
err = nicvf_check_cqe_rx_errs(nic, cq, cqe_rx);
if (err && !cqe_rx->rb_cnt)
return (0);
mbuf = nicvf_get_rcv_mbuf(nic, cqe_rx);
if (mbuf == NULL) {
dprintf(nic->dev, "Packet not received\n");
return (0);
}
/* If error packet */
if (err != 0) {
m_freem(mbuf);
return (0);
}
if (rq->lro_enabled &&
((cqe_rx->l3_type == L3TYPE_IPV4) && (cqe_rx->l4_type == L4TYPE_TCP)) &&
(mbuf->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
/*
* At this point it is known that there are no errors in the
* packet. Attempt to LRO enqueue. Send to stack if no resources
* or enqueue error.
*/
if ((rq->lro.lro_cnt != 0) &&
(tcp_lro_rx(&rq->lro, mbuf, 0) == 0))
return (0);
}
/*
* Push this packet to the stack later to avoid
* unlocking completion task in the middle of work.
*/
err = buf_ring_enqueue(cq->rx_br, mbuf);
if (err != 0) {
/*
* Failed to enqueue this mbuf.
* We don't drop it, just schedule another task.
*/
return (err);
}
return (0);
}
static void
nicvf_snd_pkt_handler(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_send_t *cqe_tx, int cqe_type)
{
bus_dmamap_t dmap;
struct mbuf *mbuf;
struct snd_queue *sq;
struct sq_hdr_subdesc *hdr;
mbuf = NULL;
sq = &nic->qs->sq[cqe_tx->sq_idx];
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, cqe_tx->sqe_ptr);
if (hdr->subdesc_type != SQ_DESC_TYPE_HEADER)
return;
dprintf(nic->dev,
"%s Qset #%d SQ #%d SQ ptr #%d subdesc count %d\n",
__func__, cqe_tx->sq_qs, cqe_tx->sq_idx,
cqe_tx->sqe_ptr, hdr->subdesc_cnt);
dmap = (bus_dmamap_t)sq->snd_buff[cqe_tx->sqe_ptr].dmap;
bus_dmamap_unload(sq->snd_buff_dmat, dmap);
mbuf = (struct mbuf *)sq->snd_buff[cqe_tx->sqe_ptr].mbuf;
if (mbuf != NULL) {
m_freem(mbuf);
sq->snd_buff[cqe_tx->sqe_ptr].mbuf = NULL;
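/* Release the header subdescriptor plus all subdescriptors that followed it. */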
nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
}
nicvf_check_cqe_tx_errs(nic, cq, cqe_tx);
}
static int
nicvf_cq_intr_handler(struct nicvf *nic, uint8_t cq_idx)
{
struct mbuf *mbuf;
struct ifnet *ifp;
int processed_cqe, work_done = 0, tx_done = 0;
int cqe_count, cqe_head;
struct queue_set *qs = nic->qs;
struct cmp_queue *cq = &qs->cq[cq_idx];
struct snd_queue *sq = &qs->sq[cq_idx];
struct rcv_queue *rq;
struct cqe_rx_t *cq_desc;
struct lro_ctrl *lro;
int rq_idx;
int cmp_err;
NICVF_CMP_LOCK(cq);
cmp_err = 0;
processed_cqe = 0;
/* Get the number of valid CQ entries to process */
cqe_count = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_STATUS, cq_idx);
cqe_count &= CQ_CQE_COUNT;
if (cqe_count == 0)
goto out;
/* Get head of the valid CQ entries */
cqe_head = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_HEAD, cq_idx) >> 9;
cqe_head &= 0xFFFF;
dprintf(nic->dev, "%s CQ%d cqe_count %d cqe_head %d\n",
__func__, cq_idx, cqe_count, cqe_head);
while (processed_cqe < cqe_count) {
/* Get the CQ descriptor */
cq_desc = (struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head);
cqe_head++;
cqe_head &= (cq->dmem.q_len - 1);
/* Prefetch next CQ descriptor */
__builtin_prefetch((struct cqe_rx_t *)GET_CQ_DESC(cq, cqe_head));
dprintf(nic->dev, "CQ%d cq_desc->cqe_type %d\n", cq_idx,
cq_desc->cqe_type);
switch (cq_desc->cqe_type) {
case CQE_TYPE_RX:
cmp_err = nicvf_rcv_pkt_handler(nic, cq, cq_desc,
CQE_TYPE_RX);
if (__predict_false(cmp_err != 0)) {
/*
* Oops. Cannot finish now.
* Let's try again later.
*/
goto done;
}
work_done++;
break;
case CQE_TYPE_SEND:
nicvf_snd_pkt_handler(nic, cq, (void *)cq_desc,
CQE_TYPE_SEND);
tx_done++;
break;
case CQE_TYPE_INVALID:
case CQE_TYPE_RX_SPLIT:
case CQE_TYPE_RX_TCP:
case CQE_TYPE_SEND_PTP:
/* Ignore for now */
break;
}
processed_cqe++;
}
done:
dprintf(nic->dev,
"%s CQ%d processed_cqe %d work_done %d\n",
__func__, cq_idx, processed_cqe, work_done);
/* Ring doorbell to inform H/W to reuse processed CQEs */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_DOOR, cq_idx, processed_cqe);
if ((tx_done > 0) &&
((if_getdrvflags(nic->ifp) & IFF_DRV_RUNNING) != 0)) {
/* Re-enable the TXQ if it was stopped earlier because the SQ was full */
if_setdrvflagbits(nic->ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
}
out:
/*
* Flush any outstanding LRO work
*/
rq_idx = cq_idx;
rq = &nic->qs->rq[rq_idx];
lro = &rq->lro;
tcp_lro_flush_all(lro);
NICVF_CMP_UNLOCK(cq);
ifp = nic->ifp;
/* Push received MBUFs to the stack */
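/*
 * The ring is drained after dropping the CQ lock so that if_input()
 * is never called with the completion mutex held.
 */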
while (!buf_ring_empty(cq->rx_br)) {
mbuf = buf_ring_dequeue_mc(cq->rx_br);
if (__predict_true(mbuf != NULL))
(*ifp->if_input)(ifp, mbuf);
}
return (cmp_err);
}
/*
* Qset error interrupt handler
*
* As of now only CQ errors are handled
*/
static void
nicvf_qs_err_task(void *arg, int pending)
{
struct nicvf *nic;
struct queue_set *qs;
int qidx;
uint64_t status;
boolean_t enable = TRUE;
nic = (struct nicvf *)arg;
qs = nic->qs;
/* Deactivate network interface */
if_setdrvflagbits(nic->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
/* Check if it is CQ err */
for (qidx = 0; qidx < qs->cq_cnt; qidx++) {
status = nicvf_queue_reg_read(nic, NIC_QSET_CQ_0_7_STATUS,
qidx);
if ((status & CQ_ERR_MASK) == 0)
continue;
/* Process already queued CQEs and reconfig CQ */
nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
nicvf_sq_disable(nic, qidx);
(void)nicvf_cq_intr_handler(nic, qidx);
nicvf_cmp_queue_config(nic, qs, qidx, enable);
nicvf_sq_free_used_descs(nic, &qs->sq[qidx], qidx);
nicvf_sq_enable(nic, &qs->sq[qidx], qidx);
nicvf_enable_intr(nic, NICVF_INTR_CQ, qidx);
}
if_setdrvflagbits(nic->ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
/* Re-enable Qset error interrupt */
nicvf_enable_intr(nic, NICVF_INTR_QS_ERR, 0);
}
static void
nicvf_cmp_task(void *arg, int pending)
{
struct cmp_queue *cq;
struct nicvf *nic;
int cmp_err;
cq = (struct cmp_queue *)arg;
nic = cq->nic;
/* Handle CQ descriptors */
cmp_err = nicvf_cq_intr_handler(nic, cq->idx);
if (__predict_false(cmp_err != 0)) {
/*
* Schedule another thread here since we did not
* process the entire CQ due to Tx or Rx CQ parse error.
*/
taskqueue_enqueue(cq->cmp_taskq, &cq->cmp_task);
}
nicvf_clear_intr(nic, NICVF_INTR_CQ, cq->idx);
/* Re-enable the interrupt (previously disabled in nicvf_intr_handler()) */
nicvf_enable_intr(nic, NICVF_INTR_CQ, cq->idx);
}
/* Initialize completion queue */
static int
nicvf_init_cmp_queue(struct nicvf *nic, struct cmp_queue *cq, int q_len,
int qidx)
{
int err;
/* Initialize lock */
snprintf(cq->mtx_name, sizeof(cq->mtx_name), "%s: CQ(%d) lock",
device_get_nameunit(nic->dev), qidx);
mtx_init(&cq->mtx, cq->mtx_name, NULL, MTX_DEF);
err = nicvf_alloc_q_desc_mem(nic, &cq->dmem, q_len, CMP_QUEUE_DESC_SIZE,
NICVF_CQ_BASE_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Could not allocate DMA memory for CQ\n");
return (err);
}
cq->desc = cq->dmem.base;
cq->thresh = pass1_silicon(nic->dev) ? 0 : CMP_QUEUE_CQE_THRESH;
cq->nic = nic;
cq->idx = qidx;
nic->cq_coalesce_usecs = (CMP_QUEUE_TIMER_THRESH * 0.05) - 1;
cq->rx_br = buf_ring_alloc(CMP_QUEUE_LEN * 8, M_DEVBUF, M_WAITOK,
&cq->mtx);
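/*
 * Received mbufs are staged on rx_br so that they can be handed to
 * the stack outside the CQ lock (see nicvf_cq_intr_handler()).
 */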
/* Allocate taskqueue */
TASK_INIT(&cq->cmp_task, 0, nicvf_cmp_task, cq);
cq->cmp_taskq = taskqueue_create_fast("nicvf_cmp_taskq", M_WAITOK,
taskqueue_thread_enqueue, &cq->cmp_taskq);
taskqueue_start_threads(&cq->cmp_taskq, 1, PI_NET, "%s: cmp_taskq(%d)",
device_get_nameunit(nic->dev), qidx);
return (0);
}
static void
nicvf_free_cmp_queue(struct nicvf *nic, struct cmp_queue *cq)
{
if (cq == NULL)
return;
/*
* The completion queue itself should be disabled by now
* (ref. nicvf_snd_queue_config()).
* Ensure that it is indeed disabled, or panic.
*/
if (cq->enable)
panic("%s: Trying to free working CQ(%d)", __func__, cq->idx);
if (cq->cmp_taskq != NULL) {
/* Remove task */
while (taskqueue_cancel(cq->cmp_taskq, &cq->cmp_task, NULL) != 0)
taskqueue_drain(cq->cmp_taskq, &cq->cmp_task);
taskqueue_free(cq->cmp_taskq);
cq->cmp_taskq = NULL;
}
/*
* The completion task may have re-enabled interrupts,
* so disable them now that it has finished processing.
* It is safe to do so since the corresponding CQ
* was already disabled.
*/
nicvf_disable_intr(nic, NICVF_INTR_CQ, cq->idx);
nicvf_clear_intr(nic, NICVF_INTR_CQ, cq->idx);
NICVF_CMP_LOCK(cq);
nicvf_free_q_desc_mem(nic, &cq->dmem);
drbr_free(cq->rx_br, M_DEVBUF);
NICVF_CMP_UNLOCK(cq);
mtx_destroy(&cq->mtx);
memset(cq->mtx_name, 0, sizeof(cq->mtx_name));
}
int
nicvf_xmit_locked(struct snd_queue *sq)
{
struct nicvf *nic;
struct ifnet *ifp;
struct mbuf *next;
int err;
NICVF_TX_LOCK_ASSERT(sq);
nic = sq->nic;
ifp = nic->ifp;
err = 0;
while ((next = drbr_peek(ifp, sq->br)) != NULL) {
/* Send a copy of the frame to the BPF listener */
ETHER_BPF_MTAP(ifp, next);
err = nicvf_tx_mbuf_locked(sq, &next);
if (err != 0) {
if (next == NULL)
drbr_advance(ifp, sq->br);
else
drbr_putback(ifp, sq->br, next);
break;
}
drbr_advance(ifp, sq->br);
}
return (err);
}
static void
nicvf_snd_task(void *arg, int pending)
{
struct snd_queue *sq = (struct snd_queue *)arg;
struct nicvf *nic;
struct ifnet *ifp;
int err;
nic = sq->nic;
ifp = nic->ifp;
/*
* Skip sending anything if the driver is not running,
* the SQ is full, or the link is down.
*/
if (((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING) || !nic->link_up)
return;
NICVF_TX_LOCK(sq);
err = nicvf_xmit_locked(sq);
NICVF_TX_UNLOCK(sq);
/* Try again */
if (err != 0)
taskqueue_enqueue(sq->snd_taskq, &sq->snd_task);
}
/* Initialize transmit queue */
static int
nicvf_init_snd_queue(struct nicvf *nic, struct snd_queue *sq, int q_len,
int qidx)
{
size_t i;
int err;
/* Initialize TX lock for this queue */
snprintf(sq->mtx_name, sizeof(sq->mtx_name), "%s: SQ(%d) lock",
device_get_nameunit(nic->dev), qidx);
mtx_init(&sq->mtx, sq->mtx_name, NULL, MTX_DEF);
NICVF_TX_LOCK(sq);
/* Allocate buffer ring */
sq->br = buf_ring_alloc(q_len / MIN_SQ_DESC_PER_PKT_XMIT, M_DEVBUF,
M_NOWAIT, &sq->mtx);
if (sq->br == NULL) {
device_printf(nic->dev,
"ERROR: Could not set up buf ring for SQ(%d)\n", qidx);
err = ENOMEM;
goto error;
}
/* Allocate DMA memory for Tx descriptors */
err = nicvf_alloc_q_desc_mem(nic, &sq->dmem, q_len, SND_QUEUE_DESC_SIZE,
NICVF_SQ_BASE_ALIGN_BYTES);
if (err != 0) {
device_printf(nic->dev,
"Could not allocate DMA memory for SQ\n");
goto error;
}
sq->desc = sq->dmem.base;
sq->head = sq->tail = 0;
atomic_store_rel_int(&sq->free_cnt, q_len - 1);
sq->thresh = SND_QUEUE_THRESH;
sq->idx = qidx;
sq->nic = nic;
/*
* Allocate DMA maps for Tx buffers
*/
/* Create DMA tag first */
err = bus_dma_tag_create(
bus_get_dma_tag(nic->dev), /* parent tag */
1, /* alignment */
0, /* boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filtfunc, filtfuncarg */
NICVF_TSO_MAXSIZE, /* maxsize */
NICVF_TSO_NSEGS, /* nsegments */
MCLBYTES, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&sq->snd_buff_dmat); /* dmat */
if (err != 0) {
device_printf(nic->dev,
"Failed to create busdma tag for Tx buffers\n");
goto error;
}
/* Allocate send buffers array */
sq->snd_buff = malloc(sizeof(*sq->snd_buff) * q_len, M_NICVF,
(M_NOWAIT | M_ZERO));
if (sq->snd_buff == NULL) {
device_printf(nic->dev,
"Could not allocate memory for Tx buffers array\n");
err = ENOMEM;
goto error;
}
/* Now populate maps */
for (i = 0; i < q_len; i++) {
err = bus_dmamap_create(sq->snd_buff_dmat, 0,
&sq->snd_buff[i].dmap);
if (err != 0) {
device_printf(nic->dev,
"Failed to create DMA maps for Tx buffers\n");
goto error;
}
}
NICVF_TX_UNLOCK(sq);
/* Allocate taskqueue */
TASK_INIT(&sq->snd_task, 0, nicvf_snd_task, sq);
sq->snd_taskq = taskqueue_create_fast("nicvf_snd_taskq", M_WAITOK,
taskqueue_thread_enqueue, &sq->snd_taskq);
taskqueue_start_threads(&sq->snd_taskq, 1, PI_NET, "%s: snd_taskq(%d)",
device_get_nameunit(nic->dev), qidx);
return (0);
error:
NICVF_TX_UNLOCK(sq);
return (err);
}
static void
nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq)
{
struct queue_set *qs = nic->qs;
size_t i;
int err;
if (sq == NULL)
return;
if (sq->snd_taskq != NULL) {
/* Remove task */
while (taskqueue_cancel(sq->snd_taskq, &sq->snd_task, NULL) != 0)
taskqueue_drain(sq->snd_taskq, &sq->snd_task);
taskqueue_free(sq->snd_taskq);
sq->snd_taskq = NULL;
}
NICVF_TX_LOCK(sq);
if (sq->snd_buff_dmat != NULL) {
if (sq->snd_buff != NULL) {
for (i = 0; i < qs->sq_len; i++) {
m_freem(sq->snd_buff[i].mbuf);
sq->snd_buff[i].mbuf = NULL;
bus_dmamap_unload(sq->snd_buff_dmat,
sq->snd_buff[i].dmap);
err = bus_dmamap_destroy(sq->snd_buff_dmat,
sq->snd_buff[i].dmap);
/*
* If bus_dmamap_destroy fails it can cause
* random panic later if the tag is also
* destroyed in the process.
*/
KASSERT(err == 0,
("%s: Could not destroy DMA map for SQ",
__func__));
}
}
free(sq->snd_buff, M_NICVF);
err = bus_dma_tag_destroy(sq->snd_buff_dmat);
KASSERT(err == 0,
("%s: Trying to destroy BUSY DMA tag", __func__));
}
/* Free private driver ring for this send queue */
if (sq->br != NULL)
drbr_free(sq->br, M_DEVBUF);
if (sq->dmem.base != NULL)
nicvf_free_q_desc_mem(nic, &sq->dmem);
NICVF_TX_UNLOCK(sq);
/* Destroy Tx lock */
mtx_destroy(&sq->mtx);
memset(sq->mtx_name, 0, sizeof(sq->mtx_name));
}
static void
nicvf_reclaim_snd_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
/* Disable send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, 0);
/* Check if SQ is stopped */
if (nicvf_poll_reg(nic, qidx, NIC_QSET_SQ_0_7_STATUS, 21, 1, 0x01))
return;
/* Reset send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, NICVF_SQ_RESET);
}
static void
nicvf_reclaim_rcv_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
union nic_mbx mbx = {};
/* Make sure all packets in the pipeline are written back into mem */
mbx.msg.msg = NIC_MBOX_MSG_RQ_SW_SYNC;
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_reclaim_cmp_queue(struct nicvf *nic, struct queue_set *qs, int qidx)
{
/* Disable timer threshold (doesn't get reset upon CQ reset) */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG2, qidx, 0);
/* Disable completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, 0);
/* Reset completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, NICVF_CQ_RESET);
}
static void
nicvf_reclaim_rbdr(struct nicvf *nic, struct rbdr *rbdr, int qidx)
{
uint64_t tmp, fifo_state;
int timeout = 10;
/* Save head and tail pointers for freeing up buffers */
rbdr->head =
nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_HEAD, qidx) >> 3;
rbdr->tail =
nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_TAIL, qidx) >> 3;
/*
* If RBDR FIFO is in 'FAIL' state then do a reset first
* before reclaiming.
*/
fifo_state = nicvf_queue_reg_read(nic, NIC_QSET_RBDR_0_1_STATUS0, qidx);
if (((fifo_state >> 62) & 0x03) == 0x3) {
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG,
qidx, NICVF_RBDR_RESET);
}
/* Disable RBDR */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx, 0);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x00))
return;
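/*
 * Poll until both 32-bit halves of the prefetch status register
 * match, which presumably means that no prefetched descriptors
 * remain outstanding, before issuing the reset.
 */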
while (1) {
tmp = nicvf_queue_reg_read(nic,
NIC_QSET_RBDR_0_1_PREFETCH_STATUS, qidx);
if ((tmp & 0xFFFFFFFF) == ((tmp >> 32) & 0xFFFFFFFF))
break;
DELAY(1000);
timeout--;
if (!timeout) {
device_printf(nic->dev,
"Failed polling on prefetch status\n");
return;
}
}
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx,
NICVF_RBDR_RESET);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x02))
return;
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx, 0x00);
if (nicvf_poll_reg(nic, qidx, NIC_QSET_RBDR_0_1_STATUS0, 62, 2, 0x00))
return;
}
/* Configures receive queue */
static void
nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
int qidx, bool enable)
{
union nic_mbx mbx = {};
struct rcv_queue *rq;
struct rq_cfg rq_cfg;
struct ifnet *ifp;
struct lro_ctrl *lro;
ifp = nic->ifp;
rq = &qs->rq[qidx];
rq->enable = enable;
lro = &rq->lro;
/* Disable receive queue */
nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx, 0);
if (!rq->enable) {
nicvf_reclaim_rcv_queue(nic, qs, qidx);
/* Free LRO memory */
tcp_lro_free(lro);
rq->lro_enabled = FALSE;
return;
}
/* Configure LRO if enabled */
rq->lro_enabled = FALSE;
if ((if_getcapenable(ifp) & IFCAP_LRO) != 0) {
if (tcp_lro_init(lro) != 0) {
device_printf(nic->dev,
"Failed to initialize LRO for RXQ%d\n", qidx);
} else {
rq->lro_enabled = TRUE;
lro->ifp = nic->ifp;
}
}
rq->cq_qs = qs->vnic_id;
rq->cq_idx = qidx;
rq->start_rbdr_qs = qs->vnic_id;
rq->start_qs_rbdr_idx = qs->rbdr_cnt - 1;
rq->cont_rbdr_qs = qs->vnic_id;
rq->cont_qs_rbdr_idx = qs->rbdr_cnt - 1;
/* All writes of RBDR data are to be loaded into the L2 cache as well */
rq->caching = 1;
/* Send a mailbox msg to PF to config RQ */
mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;
mbx.rq.qs_num = qs->vnic_id;
mbx.rq.rq_num = qidx;
mbx.rq.cfg = (rq->caching << 26) | (rq->cq_qs << 19) |
(rq->cq_idx << 16) | (rq->cont_rbdr_qs << 9) |
(rq->cont_qs_rbdr_idx << 8) | (rq->start_rbdr_qs << 1) |
(rq->start_qs_rbdr_idx);
nicvf_send_msg_to_pf(nic, &mbx);
mbx.rq.msg = NIC_MBOX_MSG_RQ_BP_CFG;
mbx.rq.cfg = (1UL << 63) | (1UL << 62) | (qs->vnic_id << 0);
nicvf_send_msg_to_pf(nic, &mbx);
/*
* RQ drop config
* Enable CQ drop to reserve sufficient CQEs for all tx packets
*/
mbx.rq.msg = NIC_MBOX_MSG_RQ_DROP_CFG;
mbx.rq.cfg = (1UL << 62) | (RQ_CQ_DROP << 8);
nicvf_send_msg_to_pf(nic, &mbx);
nicvf_queue_reg_write(nic, NIC_QSET_RQ_GEN_CFG, 0, 0x00);
/* Enable Receive queue */
rq_cfg.ena = 1;
rq_cfg.tcp_ena = 0;
nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx,
*(uint64_t *)&rq_cfg);
}
/* Configures completion queue */
static void
nicvf_cmp_queue_config(struct nicvf *nic, struct queue_set *qs,
int qidx, boolean_t enable)
{
struct cmp_queue *cq;
struct cq_cfg cq_cfg;
cq = &qs->cq[qidx];
cq->enable = enable;
if (!cq->enable) {
nicvf_reclaim_cmp_queue(nic, qs, qidx);
return;
}
/* Reset completion queue */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, NICVF_CQ_RESET);
/* Set completion queue base address */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_BASE, qidx,
(uint64_t)(cq->dmem.phys_base));
/* Enable Completion queue */
cq_cfg.ena = 1;
cq_cfg.reset = 0;
cq_cfg.caching = 0;
cq_cfg.qsize = CMP_QSIZE;
cq_cfg.avg_con = 0;
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG, qidx, *(uint64_t *)&cq_cfg);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_THRESH, qidx, cq->thresh);
nicvf_queue_reg_write(nic, NIC_QSET_CQ_0_7_CFG2, qidx,
nic->cq_coalesce_usecs);
}
/* Configures transmit queue */
static void
nicvf_snd_queue_config(struct nicvf *nic, struct queue_set *qs, int qidx,
boolean_t enable)
{
union nic_mbx mbx = {};
struct snd_queue *sq;
struct sq_cfg sq_cfg;
sq = &qs->sq[qidx];
sq->enable = enable;
if (!sq->enable) {
nicvf_reclaim_snd_queue(nic, qs, qidx);
return;
}
/* Reset send queue */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, NICVF_SQ_RESET);
sq->cq_qs = qs->vnic_id;
sq->cq_idx = qidx;
/* Send a mailbox msg to PF to config SQ */
mbx.sq.msg = NIC_MBOX_MSG_SQ_CFG;
mbx.sq.qs_num = qs->vnic_id;
mbx.sq.sq_num = qidx;
mbx.sq.sqs_mode = nic->sqs_mode;
mbx.sq.cfg = (sq->cq_qs << 3) | sq->cq_idx;
nicvf_send_msg_to_pf(nic, &mbx);
/* Set queue base address */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_BASE, qidx,
(uint64_t)(sq->dmem.phys_base));
/* Enable send queue & set queue size */
sq_cfg.ena = 1;
sq_cfg.reset = 0;
sq_cfg.ldwb = 0;
sq_cfg.qsize = SND_QSIZE;
sq_cfg.tstmp_bgx_intf = 0;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, *(uint64_t *)&sq_cfg);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_THRESH, qidx, sq->thresh);
}
/* Configures receive buffer descriptor ring */
static void
nicvf_rbdr_config(struct nicvf *nic, struct queue_set *qs, int qidx,
boolean_t enable)
{
struct rbdr *rbdr;
struct rbdr_cfg rbdr_cfg;
rbdr = &qs->rbdr[qidx];
nicvf_reclaim_rbdr(nic, rbdr, qidx);
if (!enable)
return;
/* Set descriptor base address */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_BASE, qidx,
(uint64_t)(rbdr->dmem.phys_base));
/* Enable RBDR & set queue size */
/* Buffer size should be in multiples of 128 bytes */
rbdr_cfg.ena = 1;
rbdr_cfg.reset = 0;
rbdr_cfg.ldwb = 0;
rbdr_cfg.qsize = RBDR_SIZE;
rbdr_cfg.avg_con = 0;
rbdr_cfg.lines = rbdr->dma_size / 128;
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_CFG, qidx,
*(uint64_t *)&rbdr_cfg);
/* Notify HW */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_DOOR, qidx,
qs->rbdr_len - 1);
/* Set threshold value for interrupt generation */
nicvf_queue_reg_write(nic, NIC_QSET_RBDR_0_1_THRESH, qidx,
rbdr->thresh - 1);
}
/* Requests PF to assign and enable Qset */
void
nicvf_qset_config(struct nicvf *nic, boolean_t enable)
{
union nic_mbx mbx = {};
struct queue_set *qs;
struct qs_cfg *qs_cfg;
qs = nic->qs;
if (qs == NULL) {
device_printf(nic->dev,
"Qset is still not allocated, don't init queues\n");
return;
}
qs->enable = enable;
qs->vnic_id = nic->vf_id;
/* Send a mailbox msg to PF to config Qset */
mbx.qs.msg = NIC_MBOX_MSG_QS_CFG;
mbx.qs.num = qs->vnic_id;
mbx.qs.cfg = 0;
qs_cfg = (struct qs_cfg *)&mbx.qs.cfg;
if (qs->enable) {
qs_cfg->ena = 1;
qs_cfg->vnic = qs->vnic_id;
}
nicvf_send_msg_to_pf(nic, &mbx);
}
static void
nicvf_free_resources(struct nicvf *nic)
{
int qidx;
struct queue_set *qs;
qs = nic->qs;
/*
* Remove QS error task first since it has to be dead
* to safely free completion queue tasks.
*/
if (qs->qs_err_taskq != NULL) {
/* Shut down QS error tasks */
while (taskqueue_cancel(qs->qs_err_taskq,
&qs->qs_err_task, NULL) != 0) {
taskqueue_drain(qs->qs_err_taskq, &qs->qs_err_task);
}
taskqueue_free(qs->qs_err_taskq);
qs->qs_err_taskq = NULL;
}
/* Free receive buffer descriptor ring */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_free_rbdr(nic, &qs->rbdr[qidx]);
/* Free completion queue */
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_free_cmp_queue(nic, &qs->cq[qidx]);
/* Free send queue */
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_free_snd_queue(nic, &qs->sq[qidx]);
}
static int
nicvf_alloc_resources(struct nicvf *nic)
{
struct queue_set *qs = nic->qs;
int qidx;
/* Alloc receive buffer descriptor ring */
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++) {
if (nicvf_init_rbdr(nic, &qs->rbdr[qidx], qs->rbdr_len,
DMA_BUFFER_LEN, qidx))
goto alloc_fail;
}
/* Alloc send queue */
for (qidx = 0; qidx < qs->sq_cnt; qidx++) {
if (nicvf_init_snd_queue(nic, &qs->sq[qidx], qs->sq_len, qidx))
goto alloc_fail;
}
/* Alloc completion queue */
for (qidx = 0; qidx < qs->cq_cnt; qidx++) {
if (nicvf_init_cmp_queue(nic, &qs->cq[qidx], qs->cq_len, qidx))
goto alloc_fail;
}
/* Allocate QS error taskqueue */
TASK_INIT(&qs->qs_err_task, 0, nicvf_qs_err_task, nic);
qs->qs_err_taskq = taskqueue_create_fast("nicvf_qs_err_taskq", M_WAITOK,
taskqueue_thread_enqueue, &qs->qs_err_taskq);
taskqueue_start_threads(&qs->qs_err_taskq, 1, PI_NET, "%s: qs_taskq",
device_get_nameunit(nic->dev));
return (0);
alloc_fail:
nicvf_free_resources(nic);
return (ENOMEM);
}
int
nicvf_set_qset_resources(struct nicvf *nic)
{
struct queue_set *qs;
qs = malloc(sizeof(*qs), M_NICVF, (M_ZERO | M_WAITOK));
nic->qs = qs;
/* Set count of each queue */
qs->rbdr_cnt = RBDR_CNT;
qs->rq_cnt = RCV_QUEUE_CNT;
qs->sq_cnt = SND_QUEUE_CNT;
qs->cq_cnt = CMP_QUEUE_CNT;
/* Set queue lengths */
qs->rbdr_len = RCV_BUF_COUNT;
qs->sq_len = SND_QUEUE_LEN;
qs->cq_len = CMP_QUEUE_LEN;
nic->rx_queues = qs->rq_cnt;
nic->tx_queues = qs->sq_cnt;
return (0);
}
int
nicvf_config_data_transfer(struct nicvf *nic, boolean_t enable)
{
boolean_t disable = FALSE;
struct queue_set *qs;
int qidx;
qs = nic->qs;
if (qs == NULL)
return (0);
if (enable) {
if (nicvf_alloc_resources(nic) != 0)
return (ENOMEM);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_snd_queue_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_cmp_queue_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_rbdr_config(nic, qs, qidx, enable);
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_rcv_queue_config(nic, qs, qidx, enable);
} else {
for (qidx = 0; qidx < qs->rq_cnt; qidx++)
nicvf_rcv_queue_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->rbdr_cnt; qidx++)
nicvf_rbdr_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->sq_cnt; qidx++)
nicvf_snd_queue_config(nic, qs, qidx, disable);
for (qidx = 0; qidx < qs->cq_cnt; qidx++)
nicvf_cmp_queue_config(nic, qs, qidx, disable);
nicvf_free_resources(nic);
}
return (0);
}
/*
* Get a free descriptor from the SQ
* and return its index.
*/
static __inline int
nicvf_get_sq_desc(struct snd_queue *sq, int desc_cnt)
{
int qentry;
qentry = sq->tail;
atomic_subtract_int(&sq->free_cnt, desc_cnt);
sq->tail += desc_cnt;
sq->tail &= (sq->dmem.q_len - 1);
return (qentry);
}
/* Free descriptor back to SQ for future use */
static void
nicvf_put_sq_desc(struct snd_queue *sq, int desc_cnt)
{
atomic_add_int(&sq->free_cnt, desc_cnt);
sq->head += desc_cnt;
sq->head &= (sq->dmem.q_len - 1);
}
static __inline int
nicvf_get_nxt_sqentry(struct snd_queue *sq, int qentry)
{
qentry++;
qentry &= (sq->dmem.q_len - 1);
return (qentry);
}
static void
nicvf_sq_enable(struct nicvf *nic, struct snd_queue *sq, int qidx)
{
uint64_t sq_cfg;
sq_cfg = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_CFG, qidx);
sq_cfg |= NICVF_SQ_EN;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, sq_cfg);
/* Ring doorbell so that H/W restarts processing SQEs */
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_DOOR, qidx, 0);
}
static void
nicvf_sq_disable(struct nicvf *nic, int qidx)
{
uint64_t sq_cfg;
sq_cfg = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_CFG, qidx);
sq_cfg &= ~NICVF_SQ_EN;
nicvf_queue_reg_write(nic, NIC_QSET_SQ_0_7_CFG, qidx, sq_cfg);
}
static void
nicvf_sq_free_used_descs(struct nicvf *nic, struct snd_queue *sq, int qidx)
{
- uint64_t head, tail;
+ uint64_t head;
struct snd_buff *snd_buff;
struct sq_hdr_subdesc *hdr;
NICVF_TX_LOCK(sq);
head = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_HEAD, qidx) >> 4;
- tail = nicvf_queue_reg_read(nic, NIC_QSET_SQ_0_7_TAIL, qidx) >> 4;
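/*
 * Advance the software head up to the hardware head, freeing any
 * mbufs still attached to the consumed descriptors.
 */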
while (sq->head != head) {
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, sq->head);
if (hdr->subdesc_type != SQ_DESC_TYPE_HEADER) {
nicvf_put_sq_desc(sq, 1);
continue;
}
snd_buff = &sq->snd_buff[sq->head];
if (snd_buff->mbuf != NULL) {
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
m_freem(snd_buff->mbuf);
sq->snd_buff[sq->head].mbuf = NULL;
}
nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
}
NICVF_TX_UNLOCK(sq);
}
/*
* Add SQ HEADER subdescriptor.
* First subdescriptor for every send descriptor.
*/
static __inline int
nicvf_sq_add_hdr_subdesc(struct snd_queue *sq, int qentry,
int subdesc_cnt, struct mbuf *mbuf, int len)
{
struct nicvf *nic;
struct sq_hdr_subdesc *hdr;
struct ether_vlan_header *eh;
#ifdef INET
struct ip *ip;
struct tcphdr *th;
#endif
uint16_t etype;
int ehdrlen, iphlen, poff, proto;
nic = sq->nic;
hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, qentry);
sq->snd_buff[qentry].mbuf = mbuf;
memset(hdr, 0, SND_QUEUE_DESC_SIZE);
hdr->subdesc_type = SQ_DESC_TYPE_HEADER;
/* Enable notification via CQE after processing SQE */
hdr->post_cqe = 1;
/* Number of subdescriptors following this one */
hdr->subdesc_cnt = subdesc_cnt;
hdr->tot_len = len;
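/*
 * Parse the Ethernet, IP and L4 headers below so that the L3/L4
 * offsets and checksum offload flags can be filled in for the
 * hardware.
 */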
eh = mtod(mbuf, struct ether_vlan_header *);
if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
etype = ntohs(eh->evl_proto);
} else {
ehdrlen = ETHER_HDR_LEN;
etype = ntohs(eh->evl_encap_proto);
}
poff = proto = -1;
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
if (mbuf->m_len < ehdrlen + sizeof(struct ip6_hdr)) {
mbuf = m_pullup(mbuf, ehdrlen + sizeof(struct ip6_hdr));
sq->snd_buff[qentry].mbuf = NULL;
if (mbuf == NULL)
return (ENOBUFS);
}
poff = ip6_lasthdr(mbuf, ehdrlen, IPPROTO_IPV6, &proto);
if (poff < 0)
return (ENOBUFS);
poff += ehdrlen;
break;
#endif
#ifdef INET
case ETHERTYPE_IP:
if (mbuf->m_len < ehdrlen + sizeof(struct ip)) {
mbuf = m_pullup(mbuf, ehdrlen + sizeof(struct ip));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
if (mbuf->m_pkthdr.csum_flags & CSUM_IP)
hdr->csum_l3 = 1; /* Enable IP csum calculation */
ip = (struct ip *)(mbuf->m_data + ehdrlen);
iphlen = ip->ip_hl << 2;
poff = ehdrlen + iphlen;
proto = ip->ip_p;
break;
#endif
}
#if defined(INET6) || defined(INET)
if (poff > 0 && mbuf->m_pkthdr.csum_flags != 0) {
switch (proto) {
case IPPROTO_TCP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_TCP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct tcphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct tcphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_TCP;
break;
case IPPROTO_UDP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_UDP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct udphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct udphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_UDP;
break;
case IPPROTO_SCTP:
if ((mbuf->m_pkthdr.csum_flags & CSUM_SCTP) == 0)
break;
if (mbuf->m_len < (poff + sizeof(struct sctphdr))) {
mbuf = m_pullup(mbuf, poff + sizeof(struct sctphdr));
sq->snd_buff[qentry].mbuf = mbuf;
if (mbuf == NULL)
return (ENOBUFS);
}
hdr->csum_l4 = SEND_L4_CSUM_SCTP;
break;
default:
break;
}
hdr->l3_offset = ehdrlen;
hdr->l4_offset = poff;
}
if ((mbuf->m_pkthdr.tso_segsz != 0) && nic->hw_tso) {
th = (struct tcphdr *)((caddr_t)(mbuf->m_data + poff));
hdr->tso = 1;
hdr->tso_start = poff + (th->th_off * 4);
hdr->tso_max_paysize = mbuf->m_pkthdr.tso_segsz;
hdr->inner_l3_offset = ehdrlen - 2;
nic->drv_stats.tx_tso++;
}
#endif
return (0);
}
/*
* SQ GATHER subdescriptor
* Must follow HDR descriptor
*/
static inline void nicvf_sq_add_gather_subdesc(struct snd_queue *sq, int qentry,
int size, uint64_t data)
{
struct sq_gather_subdesc *gather;
qentry &= (sq->dmem.q_len - 1);
gather = (struct sq_gather_subdesc *)GET_SQ_DESC(sq, qentry);
memset(gather, 0, SND_QUEUE_DESC_SIZE);
gather->subdesc_type = SQ_DESC_TYPE_GATHER;
gather->ld_type = NIC_SEND_LD_TYPE_E_LDD;
gather->size = size;
gather->addr = data;
}
/* Put an mbuf on an SQ for packet transfer. */
static int
nicvf_tx_mbuf_locked(struct snd_queue *sq, struct mbuf **mbufp)
{
bus_dma_segment_t segs[256];
struct snd_buff *snd_buff;
size_t seg;
int nsegs, qentry;
int subdesc_cnt;
int err;
NICVF_TX_LOCK_ASSERT(sq);
if (sq->free_cnt == 0)
return (ENOBUFS);
snd_buff = &sq->snd_buff[sq->tail];
err = bus_dmamap_load_mbuf_sg(sq->snd_buff_dmat, snd_buff->dmap,
*mbufp, segs, &nsegs, BUS_DMA_NOWAIT);
if (__predict_false(err != 0)) {
/* ARM64TODO: Add mbuf defragmenting if we lack maps */
m_freem(*mbufp);
*mbufp = NULL;
return (err);
}
/* Set how many subdescriptors are required: one header plus gather subdescriptors for the DMA segments */
subdesc_cnt = MIN_SQ_DESC_PER_PKT_XMIT + nsegs - 1;
if (subdesc_cnt > sq->free_cnt) {
/* ARM64TODO: Add mbuf defragmentation if we lack descriptors */
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
return (ENOBUFS);
}
qentry = nicvf_get_sq_desc(sq, subdesc_cnt);
/* Add SQ header subdesc */
err = nicvf_sq_add_hdr_subdesc(sq, qentry, subdesc_cnt - 1, *mbufp,
(*mbufp)->m_pkthdr.len);
if (err != 0) {
nicvf_put_sq_desc(sq, subdesc_cnt);
bus_dmamap_unload(sq->snd_buff_dmat, snd_buff->dmap);
if (err == ENOBUFS) {
m_freem(*mbufp);
*mbufp = NULL;
}
return (err);
}
/* Add SQ gather subdescs */
for (seg = 0; seg < nsegs; seg++) {
qentry = nicvf_get_nxt_sqentry(sq, qentry);
nicvf_sq_add_gather_subdesc(sq, qentry, segs[seg].ds_len,
segs[seg].ds_addr);
}
/* make sure all memory stores are done before ringing doorbell */
bus_dmamap_sync(sq->dmem.dmat, sq->dmem.dmap, BUS_DMASYNC_PREWRITE);
dprintf(sq->nic->dev, "%s: sq->idx: %d, subdesc_cnt: %d\n",
__func__, sq->idx, subdesc_cnt);
/* Inform HW to xmit new packet */
nicvf_queue_reg_write(sq->nic, NIC_QSET_SQ_0_7_DOOR,
sq->idx, subdesc_cnt);
return (0);
}
static __inline u_int
frag_num(u_int i)
{
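/*
 * The 16-bit receive buffer lengths appear to be packed four to a
 * 64-bit word, so on big-endian hosts the index within each group
 * of four is reversed.
 */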
#if BYTE_ORDER == BIG_ENDIAN
return ((i & ~3) + 3 - (i & 3));
#else
return (i);
#endif
}
/* Returns MBUF for a received packet */
struct mbuf *
nicvf_get_rcv_mbuf(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
{
int frag;
int payload_len = 0;
struct mbuf *mbuf;
struct mbuf *mbuf_frag;
uint16_t *rb_lens = NULL;
uint64_t *rb_ptrs = NULL;
mbuf = NULL;
rb_lens = (uint16_t *)((uint8_t *)cqe_rx + (3 * sizeof(uint64_t)));
rb_ptrs = (uint64_t *)((uint8_t *)cqe_rx + (6 * sizeof(uint64_t)));
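/*
 * Buffer lengths and pointers follow the fixed part of the CQE:
 * lengths at byte offset 24, buffer pointers at byte offset 48.
 */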
dprintf(nic->dev, "%s rb_cnt %d rb0_ptr %lx rb0_sz %d\n",
__func__, cqe_rx->rb_cnt, cqe_rx->rb0_ptr, cqe_rx->rb0_sz);
for (frag = 0; frag < cqe_rx->rb_cnt; frag++) {
payload_len = rb_lens[frag_num(frag)];
if (frag == 0) {
/* First fragment */
mbuf = nicvf_rb_ptr_to_mbuf(nic,
(*rb_ptrs - cqe_rx->align_pad));
mbuf->m_len = payload_len;
mbuf->m_data += cqe_rx->align_pad;
if_setrcvif(mbuf, nic->ifp);
} else {
/* Add fragments */
mbuf_frag = nicvf_rb_ptr_to_mbuf(nic, *rb_ptrs);
m_append(mbuf, payload_len, mbuf_frag->m_data);
m_freem(mbuf_frag);
}
/* Next buffer pointer */
rb_ptrs++;
}
if (__predict_true(mbuf != NULL)) {
m_fixhdr(mbuf);
mbuf->m_pkthdr.flowid = cqe_rx->rq_idx;
M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE);
if (__predict_true((if_getcapenable(nic->ifp) & IFCAP_RXCSUM) != 0)) {
/*
* HW by default verifies IP & TCP/UDP/SCTP checksums
*/
if (__predict_true(cqe_rx->l3_type == L3TYPE_IPV4)) {
mbuf->m_pkthdr.csum_flags =
(CSUM_IP_CHECKED | CSUM_IP_VALID);
}
switch (cqe_rx->l4_type) {
case L4TYPE_UDP:
case L4TYPE_TCP: /* fall through */
mbuf->m_pkthdr.csum_flags |=
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
mbuf->m_pkthdr.csum_data = 0xffff;
break;
case L4TYPE_SCTP:
mbuf->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
break;
default:
break;
}
}
}
return (mbuf);
}
/* Enable interrupt */
void
nicvf_enable_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val;
reg_val = nicvf_reg_read(nic, NIC_VF_ENA_W1S);
switch (int_type) {
case NICVF_INTR_CQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val |= ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val |= (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val |= (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val |= (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to enable interrupt: unknown type\n");
break;
}
nicvf_reg_write(nic, NIC_VF_ENA_W1S, reg_val);
}
/* Disable interrupt */
void
nicvf_disable_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val = 0;
switch (int_type) {
case NICVF_INTR_CQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val |= ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val |= ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val |= (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val |= (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val |= (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to disable interrupt: unknown type\n");
break;
}
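/* ENA_W1C is a write-1-to-clear register: only the selected bit is disabled. */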
nicvf_reg_write(nic, NIC_VF_ENA_W1C, reg_val);
}
/* Clear interrupt */
void
nicvf_clear_intr(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val = 0;
switch (int_type) {
case NICVF_INTR_CQ:
reg_val = ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
reg_val = ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
reg_val = ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
reg_val = (1UL << NICVF_INTR_PKT_DROP_SHIFT);
break;
case NICVF_INTR_TCP_TIMER:
reg_val = (1UL << NICVF_INTR_TCP_TIMER_SHIFT);
break;
case NICVF_INTR_MBOX:
reg_val = (1UL << NICVF_INTR_MBOX_SHIFT);
break;
case NICVF_INTR_QS_ERR:
reg_val |= (1UL << NICVF_INTR_QS_ERR_SHIFT);
break;
default:
device_printf(nic->dev,
"Failed to clear interrupt: unknown type\n");
break;
}
nicvf_reg_write(nic, NIC_VF_INT, reg_val);
}
/* Check if interrupt is enabled */
int
nicvf_is_intr_enabled(struct nicvf *nic, int int_type, int q_idx)
{
uint64_t reg_val;
uint64_t mask = 0xff;
reg_val = nicvf_reg_read(nic, NIC_VF_ENA_W1S);
switch (int_type) {
case NICVF_INTR_CQ:
mask = ((1UL << q_idx) << NICVF_INTR_CQ_SHIFT);
break;
case NICVF_INTR_SQ:
mask = ((1UL << q_idx) << NICVF_INTR_SQ_SHIFT);
break;
case NICVF_INTR_RBDR:
mask = ((1UL << q_idx) << NICVF_INTR_RBDR_SHIFT);
break;
case NICVF_INTR_PKT_DROP:
mask = NICVF_INTR_PKT_DROP_MASK;
break;
case NICVF_INTR_TCP_TIMER:
mask = NICVF_INTR_TCP_TIMER_MASK;
break;
case NICVF_INTR_MBOX:
mask = NICVF_INTR_MBOX_MASK;
break;
case NICVF_INTR_QS_ERR:
mask = NICVF_INTR_QS_ERR_MASK;
break;
default:
device_printf(nic->dev,
"Failed to check interrupt enable: unknown type\n");
break;
}
return (reg_val & mask);
}
void
nicvf_update_rq_stats(struct nicvf *nic, int rq_idx)
{
struct rcv_queue *rq;
#define GET_RQ_STATS(reg) \
nicvf_reg_read(nic, NIC_QSET_RQ_0_7_STAT_0_1 |\
(rq_idx << NIC_Q_NUM_SHIFT) | (reg << 3))
rq = &nic->qs->rq[rq_idx];
rq->stats.bytes = GET_RQ_STATS(RQ_SQ_STATS_OCTS);
rq->stats.pkts = GET_RQ_STATS(RQ_SQ_STATS_PKTS);
}
void
nicvf_update_sq_stats(struct nicvf *nic, int sq_idx)
{
struct snd_queue *sq;
#define GET_SQ_STATS(reg) \
nicvf_reg_read(nic, NIC_QSET_SQ_0_7_STAT_0_1 |\
(sq_idx << NIC_Q_NUM_SHIFT) | (reg << 3))
sq = &nic->qs->sq[sq_idx];
sq->stats.bytes = GET_SQ_STATS(RQ_SQ_STATS_OCTS);
sq->stats.pkts = GET_SQ_STATS(RQ_SQ_STATS_PKTS);
}
/* Check for errors in the receive cmp.queue entry */
int
nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_rx_t *cqe_rx)
{
struct nicvf_hw_stats *stats = &nic->hw_stats;
struct nicvf_drv_stats *drv_stats = &nic->drv_stats;
if (!cqe_rx->err_level && !cqe_rx->err_opcode) {
drv_stats->rx_frames_ok++;
return (0);
}
switch (cqe_rx->err_opcode) {
case CQ_RX_ERROP_RE_PARTIAL:
stats->rx_bgx_truncated_pkts++;
break;
case CQ_RX_ERROP_RE_JABBER:
stats->rx_jabber_errs++;
break;
case CQ_RX_ERROP_RE_FCS:
stats->rx_fcs_errs++;
break;
case CQ_RX_ERROP_RE_RX_CTL:
stats->rx_bgx_errs++;
break;
case CQ_RX_ERROP_PREL2_ERR:
stats->rx_prel2_errs++;
break;
case CQ_RX_ERROP_L2_MAL:
stats->rx_l2_hdr_malformed++;
break;
case CQ_RX_ERROP_L2_OVERSIZE:
stats->rx_oversize++;
break;
case CQ_RX_ERROP_L2_UNDERSIZE:
stats->rx_undersize++;
break;
case CQ_RX_ERROP_L2_LENMISM:
stats->rx_l2_len_mismatch++;
break;
case CQ_RX_ERROP_L2_PCLP:
stats->rx_l2_pclp++;
break;
case CQ_RX_ERROP_IP_NOT:
stats->rx_ip_ver_errs++;
break;
case CQ_RX_ERROP_IP_CSUM_ERR:
stats->rx_ip_csum_errs++;
break;
case CQ_RX_ERROP_IP_MAL:
stats->rx_ip_hdr_malformed++;
break;
case CQ_RX_ERROP_IP_MALD:
stats->rx_ip_payload_malformed++;
break;
case CQ_RX_ERROP_IP_HOP:
stats->rx_ip_ttl_errs++;
break;
case CQ_RX_ERROP_L3_PCLP:
stats->rx_l3_pclp++;
break;
case CQ_RX_ERROP_L4_MAL:
stats->rx_l4_malformed++;
break;
case CQ_RX_ERROP_L4_CHK:
stats->rx_l4_csum_errs++;
break;
case CQ_RX_ERROP_UDP_LEN:
stats->rx_udp_len_errs++;
break;
case CQ_RX_ERROP_L4_PORT:
stats->rx_l4_port_errs++;
break;
case CQ_RX_ERROP_TCP_FLAG:
stats->rx_tcp_flag_errs++;
break;
case CQ_RX_ERROP_TCP_OFFSET:
stats->rx_tcp_offset_errs++;
break;
case CQ_RX_ERROP_L4_PCLP:
stats->rx_l4_pclp++;
break;
case CQ_RX_ERROP_RBDR_TRUNC:
stats->rx_truncated_pkts++;
break;
}
return (1);
}
/* Check for errors in the send cmp.queue entry */
int
nicvf_check_cqe_tx_errs(struct nicvf *nic, struct cmp_queue *cq,
struct cqe_send_t *cqe_tx)
{
struct cmp_queue_stats *stats = &cq->stats;
switch (cqe_tx->send_status) {
case CQ_TX_ERROP_GOOD:
stats->tx.good++;
return (0);
case CQ_TX_ERROP_DESC_FAULT:
stats->tx.desc_fault++;
break;
case CQ_TX_ERROP_HDR_CONS_ERR:
stats->tx.hdr_cons_err++;
break;
case CQ_TX_ERROP_SUBDC_ERR:
stats->tx.subdesc_err++;
break;
case CQ_TX_ERROP_IMM_SIZE_OFLOW:
stats->tx.imm_size_oflow++;
break;
case CQ_TX_ERROP_DATA_SEQUENCE_ERR:
stats->tx.data_seq_err++;
break;
case CQ_TX_ERROP_MEM_SEQUENCE_ERR:
stats->tx.mem_seq_err++;
break;
case CQ_TX_ERROP_LOCK_VIOL:
stats->tx.lock_viol++;
break;
case CQ_TX_ERROP_DATA_FAULT:
stats->tx.data_fault++;
break;
case CQ_TX_ERROP_TSTMP_CONFLICT:
stats->tx.tstmp_conflict++;
break;
case CQ_TX_ERROP_TSTMP_TIMEOUT:
stats->tx.tstmp_timeout++;
break;
case CQ_TX_ERROP_MEM_FAULT:
stats->tx.mem_fault++;
break;
case CQ_TX_ERROP_CK_OVERLAP:
stats->tx.csum_overlap++;
break;
case CQ_TX_ERROP_CK_OFLOW:
stats->tx.csum_overflow++;
break;
}
return (1);
}
Index: head/sys/fs/cd9660/cd9660_vfsops.c
===================================================================
--- head/sys/fs/cd9660/cd9660_vfsops.c (revision 327172)
+++ head/sys/fs/cd9660/cd9660_vfsops.c (revision 327173)
@@ -1,857 +1,855 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley
* by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
* Support code is derived from software contributed to Berkeley
* by Atsushi Murai (amurai@spec.co.jp).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/cdio.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/iconv.h>
#include <fs/cd9660/iso.h>
#include <fs/cd9660/iso_rrip.h>
#include <fs/cd9660/cd9660_node.h>
#include <fs/cd9660/cd9660_mount.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure");
MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part");
struct iconv_functions *cd9660_iconv = NULL;
static vfs_mount_t cd9660_mount;
static vfs_cmount_t cd9660_cmount;
static vfs_unmount_t cd9660_unmount;
static vfs_root_t cd9660_root;
static vfs_statfs_t cd9660_statfs;
static vfs_vget_t cd9660_vget;
static vfs_fhtovp_t cd9660_fhtovp;
static struct vfsops cd9660_vfsops = {
.vfs_fhtovp = cd9660_fhtovp,
.vfs_mount = cd9660_mount,
.vfs_cmount = cd9660_cmount,
.vfs_root = cd9660_root,
.vfs_statfs = cd9660_statfs,
.vfs_unmount = cd9660_unmount,
.vfs_vget = cd9660_vget,
};
VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY);
MODULE_VERSION(cd9660, 1);
static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino);
static int iso_mountfs(struct vnode *devvp, struct mount *mp);
/*
* VFS Operations.
*/
static int
cd9660_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
struct iso_args args;
struct export_args exp;
int error;
error = copyin(data, &args, sizeof args);
if (error)
return (error);
vfs_oexport_conv(&args.export, &exp);
ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
ma = mount_arg(ma, "export", &exp, sizeof(exp));
ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64);
ma = mount_argsu(ma, "cs_local", args.cs_local, 64);
ma = mount_argf(ma, "ssector", "%u", args.ssector);
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip");
ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens");
ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt");
ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet");
ma = mount_argb(ma,
args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet");
ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv");
error = kernel_mount(ma, flags);
return (error);
}
static int
cd9660_mount(struct mount *mp)
{
struct vnode *devvp;
struct thread *td;
char *fspec;
int error;
accmode_t accmode;
struct nameidata ndp;
struct iso_mnt *imp = NULL;
td = curthread;
/*
* Unconditionally mount as read-only.
*/
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
if (error)
return (error);
imp = VFSTOISOFS(mp);
if (mp->mnt_flag & MNT_UPDATE) {
if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
return (0);
}
/*
* Not an update, or updating the name: look up the name
* and verify that it refers to a sensible block device.
*/
NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
if ((error = namei(&ndp)))
return (error);
NDFREE(&ndp, NDF_ONLY_PNBUF);
devvp = ndp.ni_vp;
if (!vn_isdisk(devvp, &error)) {
vput(devvp);
return (error);
}
/*
* Verify that user has necessary permissions on the device,
* or has superuser abilities
*/
accmode = VREAD;
error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
if (error)
error = priv_check(td, PRIV_VFS_MOUNT_PERM);
if (error) {
vput(devvp);
return (error);
}
if ((mp->mnt_flag & MNT_UPDATE) == 0) {
error = iso_mountfs(devvp, mp);
if (error)
vrele(devvp);
} else {
if (devvp != imp->im_devvp)
error = EINVAL; /* needs translation */
vput(devvp);
}
if (error)
return (error);
vfs_mountedfrom(mp, fspec);
return (0);
}
/*
* Common code for mount and mountroot
*/
static int
iso_mountfs(devvp, mp)
struct vnode *devvp;
struct mount *mp;
{
struct iso_mnt *isomp = NULL;
struct buf *bp = NULL;
struct buf *pribp = NULL, *supbp = NULL;
struct cdev *dev;
int error = EINVAL;
int high_sierra = 0;
int iso_bsize;
int iso_blknum;
int joliet_level;
int isverified = 0;
struct iso_volume_descriptor *vdp = NULL;
struct iso_primary_descriptor *pri = NULL;
struct iso_sierra_primary_descriptor *pri_sierra = NULL;
struct iso_supplementary_descriptor *sup = NULL;
struct iso_directory_record *rootp;
int logical_block_size, ssector;
struct g_consumer *cp;
struct bufobj *bo;
char *cs_local, *cs_disk;
dev = devvp->v_rdev;
dev_ref(dev);
g_topology_lock();
error = g_vfs_open(devvp, &cp, "cd9660", 0);
if (error == 0)
g_getattr("MNT::verified", cp, &isverified);
g_topology_unlock();
VOP_UNLOCK(devvp, 0);
if (error)
goto out;
if (devvp->v_rdev->si_iosize_max != 0)
mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
if (mp->mnt_iosize_max > MAXPHYS)
mp->mnt_iosize_max = MAXPHYS;
bo = &devvp->v_bufobj;
/* This is the "logical sector size". The standard says this
* should be 2048 or the physical sector size on the device,
* whichever is greater.
*/
if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) {
error = EINVAL;
goto out;
}
iso_bsize = cp->provider->sectorsize;
joliet_level = 0;
if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector))
ssector = 0;
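/*
 * ISO 9660 volume descriptors start at logical sector 16 of the
 * session; scan forward until the Volume Descriptor Set Terminator.
 */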
for (iso_blknum = 16 + ssector;
iso_blknum < 100 + ssector;
iso_blknum++) {
if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE),
iso_bsize, NOCRED, &bp)) != 0)
goto out;
vdp = (struct iso_volume_descriptor *)bp->b_data;
if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) {
if (bcmp (vdp->id_sierra, ISO_SIERRA_ID,
sizeof vdp->id_sierra) != 0) {
error = EINVAL;
goto out;
} else
high_sierra = 1;
}
switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){
case ISO_VD_PRIMARY:
if (pribp == NULL) {
pribp = bp;
bp = NULL;
pri = (struct iso_primary_descriptor *)vdp;
pri_sierra =
(struct iso_sierra_primary_descriptor *)vdp;
}
break;
case ISO_VD_SUPPLEMENTARY:
if (supbp == NULL) {
supbp = bp;
bp = NULL;
sup = (struct iso_supplementary_descriptor *)vdp;
if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) {
if (bcmp(sup->escape, "%/@", 3) == 0)
joliet_level = 1;
if (bcmp(sup->escape, "%/C", 3) == 0)
joliet_level = 2;
if (bcmp(sup->escape, "%/E", 3) == 0)
joliet_level = 3;
if ((isonum_711 (sup->flags) & 1) &&
!vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0))
joliet_level = 0;
}
}
break;
case ISO_VD_END:
goto vd_end;
default:
break;
}
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
}
vd_end:
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
if (pri == NULL) {
error = EINVAL;
goto out;
}
logical_block_size =
isonum_723 (high_sierra?
pri_sierra->logical_block_size:
pri->logical_block_size);
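/* The logical block size must be a power of two between DEV_BSIZE and MAXBSIZE. */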
if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE
|| (logical_block_size & (logical_block_size - 1)) != 0) {
error = EINVAL;
goto out;
}
rootp = (struct iso_directory_record *)
(high_sierra?
pri_sierra->root_directory_record:
pri->root_directory_record);
isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO);
isomp->im_cp = cp;
isomp->im_bo = bo;
isomp->logical_block_size = logical_block_size;
isomp->volume_space_size =
isonum_733 (high_sierra?
pri_sierra->volume_space_size:
pri->volume_space_size);
isomp->joliet_level = 0;
/*
* Since an ISO9660 multi-session CD can also access previous
* sessions, we have to include them into the space consider-
* ations. This doesn't yield a very accurate number since
* parts of the old sessions might be inaccessible now, but we
* can't do much better. This is also important for the NFS
* filehandle validation.
*/
isomp->volume_space_size += ssector;
bcopy (rootp, isomp->root, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->im_bmask = logical_block_size - 1;
isomp->im_bshift = ffs(logical_block_size) - 1;
pribp->b_flags |= B_AGE;
brelse(pribp);
pribp = NULL;
rootp = NULL;
pri = NULL;
pri_sierra = NULL;
mp->mnt_data = isomp;
mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
mp->mnt_maxsymlinklen = 0;
MNT_ILOCK(mp);
if (isverified)
mp->mnt_flag |= MNT_VERIFIED;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED;
MNT_IUNLOCK(mp);
isomp->im_mountp = mp;
isomp->im_dev = dev;
isomp->im_devvp = devvp;
vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP);
vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS);
vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT);
vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET);
vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV);
/* Check the Rock Ridge Extension support */
if (!(isomp->im_flags & ISOFSMNT_NORRIP)) {
if ((error = bread(isomp->im_devvp, (isomp->root_extent +
isonum_711(((struct iso_directory_record *)isomp->root)->
ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT),
isomp->logical_block_size, NOCRED, &bp)) != 0)
goto out;
rootp = (struct iso_directory_record *)bp->b_data;
if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) {
isomp->im_flags |= ISOFSMNT_NORRIP;
} else {
isomp->im_flags &= ~ISOFSMNT_GENS;
}
/*
* The contents are valid,
* but they will get reread as part of another vnode, so...
*/
bp->b_flags |= B_AGE;
brelse(bp);
bp = NULL;
rootp = NULL;
}
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error);
if (error)
goto out;
cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error);
if (error)
goto out;
cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l);
cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d);
} else {
isomp->im_d2l = NULL;
isomp->im_l2d = NULL;
}
if (high_sierra) {
/* this effectively ignores all the mount flags */
if (bootverbose)
log(LOG_INFO, "cd9660: High Sierra Format\n");
isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA;
} else
switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) {
default:
isomp->iso_ftype = ISO_FTYPE_DEFAULT;
break;
case ISOFSMNT_GENS|ISOFSMNT_NORRIP:
isomp->iso_ftype = ISO_FTYPE_9660;
break;
case 0:
if (bootverbose)
log(LOG_INFO, "cd9660: RockRidge Extension\n");
isomp->iso_ftype = ISO_FTYPE_RRIP;
break;
}
/* Decide whether to use the Joliet descriptor */
if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) {
if (bootverbose)
log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n",
joliet_level);
rootp = (struct iso_directory_record *)
sup->root_directory_record;
bcopy (rootp, isomp->root, sizeof isomp->root);
isomp->root_extent = isonum_733 (rootp->extent);
isomp->root_size = isonum_733 (rootp->size);
isomp->joliet_level = joliet_level;
supbp->b_flags |= B_AGE;
}
if (supbp) {
brelse(supbp);
supbp = NULL;
sup = NULL;
}
return 0;
out:
if (bp != NULL)
brelse(bp);
if (pribp != NULL)
brelse(pribp);
if (supbp != NULL)
brelse(supbp);
if (cp != NULL) {
g_topology_lock();
g_vfs_close(cp);
g_topology_unlock();
}
if (isomp) {
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
}
dev_rel(dev);
return error;
}
/*
* unmount system call
*/
static int
cd9660_unmount(mp, mntflags)
struct mount *mp;
int mntflags;
{
struct iso_mnt *isomp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags, curthread)))
return (error);
isomp = VFSTOISOFS(mp);
if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
if (isomp->im_d2l)
cd9660_iconv->close(isomp->im_d2l);
if (isomp->im_l2d)
cd9660_iconv->close(isomp->im_l2d);
}
g_topology_lock();
g_vfs_close(isomp->im_cp);
g_topology_unlock();
vrele(isomp->im_devvp);
dev_rel(isomp->im_dev);
free(isomp, M_ISOFSMNT);
mp->mnt_data = NULL;
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
return (error);
}
/*
* Return root of a filesystem
*/
static int
cd9660_root(mp, flags, vpp)
struct mount *mp;
int flags;
struct vnode **vpp;
{
struct iso_mnt *imp = VFSTOISOFS(mp);
struct iso_directory_record *dp =
(struct iso_directory_record *)imp->root;
cd_ino_t ino = isodirino(dp, imp);
/*
* With RRIP we must use the `.' entry of the root directory.
* Simply tell vget, that it's a relocated directory.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
imp->iso_ftype == ISO_FTYPE_RRIP, dp));
}
/*
* Get filesystem statistics.
*/
static int
cd9660_statfs(mp, sbp)
struct mount *mp;
struct statfs *sbp;
{
struct iso_mnt *isomp;
isomp = VFSTOISOFS(mp);
sbp->f_bsize = isomp->logical_block_size;
sbp->f_iosize = sbp->f_bsize; /* XXX */
sbp->f_blocks = isomp->volume_space_size;
sbp->f_bfree = 0; /* total free blocks */
sbp->f_bavail = 0; /* blocks free for non superuser */
sbp->f_files = 0; /* total files */
sbp->f_ffree = 0; /* free file nodes */
return 0;
}
/*
* File handle to vnode
*
* Have to be really careful about stale file handles:
* - check that the inode number is in range
* - call iget() to get the locked inode
* - check for an unallocated inode (i_mode == 0)
* - check that the generation number matches
*/
/* ARGSUSED */
static int
cd9660_fhtovp(mp, fhp, flags, vpp)
struct mount *mp;
struct fid *fhp;
int flags;
struct vnode **vpp;
{
struct ifid ifh;
struct iso_node *ip;
struct vnode *nvp;
int error;
memcpy(&ifh, fhp, sizeof(ifh));
#ifdef ISOFS_DBG
printf("fhtovp: ino %d, start %ld\n",
ifh.ifid_ino, ifh.ifid_start);
#endif
if ((error = VFS_VGET(mp, ifh.ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = VTOI(nvp);
if (ip->inode.iso_mode == 0) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
vnode_create_vobject(*vpp, ip->i_size, curthread);
return (0);
}
/*
* Conform to standard VFS interface; can't vget arbitrary inodes beyond 4GB
* into media with current inode scheme and 32-bit ino_t. This shouldn't be
* needed for anything other than nfsd, and who exports a mounted DVD over NFS?
*/
static int
cd9660_vget(mp, ino, flags, vpp)
struct mount *mp;
ino_t ino;
int flags;
struct vnode **vpp;
{
/*
* XXXX
* It would be nice if we didn't always set the `relocated' flag
* and force the extra read, but I don't want to think about fixing
* that right now.
*/
return (cd9660_vget_internal(mp, ino, flags, vpp,
#if 0
VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP,
#else
0,
#endif
(struct iso_directory_record *)0));
}
/* Use special comparator for full 64-bit ino comparison. */
static int
cd9660_vfs_hash_cmp(vp, pino)
struct vnode *vp;
void *pino;
{
struct iso_node *ip;
cd_ino_t ino;
ip = VTOI(vp);
ino = *(cd_ino_t *)pino;
return (ip->i_number != ino);
}
int
cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir)
struct mount *mp;
cd_ino_t ino;
int flags;
struct vnode **vpp;
int relocated;
struct iso_directory_record *isodir;
{
struct iso_mnt *imp;
struct iso_node *ip;
struct buf *bp;
struct vnode *vp;
- struct cdev *dev;
int error;
struct thread *td;
td = curthread;
error = vfs_hash_get(mp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
/*
* We must promote to an exclusive lock for vnode creation. This
* can happen if lookup is passed LOCKSHARED.
*/
if ((flags & LK_TYPE_MASK) == LK_SHARED) {
flags &= ~LK_TYPE_MASK;
flags |= LK_EXCLUSIVE;
}
/*
* We do not lock vnode creation as it is believed to be too
* expensive for such a rare case as simultaneous creation of a vnode
* for the same ino by different processes. We just allow them to race
* and check later to decide who wins. Let the race begin!
*/
imp = VFSTOISOFS(mp);
- dev = imp->im_dev;
/* Allocate a new vnode/iso_node. */
if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) {
*vpp = NULLVP;
return (error);
}
ip = malloc(sizeof(struct iso_node), M_ISOFSNODE,
M_WAITOK | M_ZERO);
vp->v_data = ip;
ip->i_vnode = vp;
ip->i_number = ino;
lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
error = insmntque(vp, mp);
if (error != 0) {
free(ip, M_ISOFSNODE);
*vpp = NULLVP;
return (error);
}
error = vfs_hash_insert(vp, ino, flags, td, vpp, cd9660_vfs_hash_cmp,
&ino);
if (error || *vpp != NULL)
return (error);
if (isodir == NULL) {
int lbn, off;
lbn = lblkno(imp, ino);
if (lbn >= imp->volume_space_size) {
vput(vp);
printf("fhtovp: lbn exceed volume space %d\n", lbn);
return (ESTALE);
}
off = blkoff(imp, ino);
if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
vput(vp);
printf("fhtovp: crosses block boundary %d\n",
off + ISO_DIRECTORY_RECORD_SIZE);
return (ESTALE);
}
error = bread(imp->im_devvp,
lbn << (imp->im_bshift - DEV_BSHIFT),
imp->logical_block_size, NOCRED, &bp);
if (error) {
vput(vp);
brelse(bp);
printf("fhtovp: bread error %d\n",error);
return (error);
}
isodir = (struct iso_directory_record *)(bp->b_data + off);
if (off + isonum_711(isodir->length) >
imp->logical_block_size) {
vput(vp);
brelse(bp);
printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
off +isonum_711(isodir->length), off,
isonum_711(isodir->length));
return (ESTALE);
}
#if 0
if (isonum_733(isodir->extent) +
isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) {
brelse(bp);
printf("fhtovp: file start miss %d vs %d\n",
isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length),
ifhp->ifid_start);
return (ESTALE);
}
#endif
} else
bp = NULL;
ip->i_mnt = imp;
if (relocated) {
/*
* On relocated directories we must
* read the `.' entry out of a dir.
*/
ip->iso_start = ino >> imp->im_bshift;
if (bp != NULL)
brelse(bp);
if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) {
vput(vp);
return (error);
}
isodir = (struct iso_directory_record *)bp->b_data;
}
ip->iso_extent = isonum_733(isodir->extent);
ip->i_size = isonum_733(isodir->size);
ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent;
/*
* Setup time stamp, attribute
*/
vp->v_type = VNON;
switch (imp->iso_ftype) {
default: /* ISO_FTYPE_9660 */
{
struct buf *bp2;
int off;
if ((imp->im_flags & ISOFSMNT_EXTATT)
&& (off = isonum_711(isodir->ext_attr_length)))
cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL,
&bp2);
else
bp2 = NULL;
cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660);
cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660);
if (bp2)
brelse(bp2);
break;
}
case ISO_FTYPE_RRIP:
cd9660_rrip_analyze(isodir, ip, imp);
break;
}
brelse(bp);
/*
* Initialize the associated vnode
*/
switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) {
case VFIFO:
vp->v_op = &cd9660_fifoops;
break;
default:
VN_LOCK_ASHARE(vp);
break;
}
if (ip->iso_extent == imp->root_extent)
vp->v_vflag |= VV_ROOT;
/*
* XXX need generation number?
*/
*vpp = vp;
return (0);
}
Index: head/sys/fs/nfs/nfs_commonkrpc.c
===================================================================
--- head/sys/fs/nfs/nfs_commonkrpc.c (revision 327172)
+++ head/sys/fs/nfs/nfs_commonkrpc.c (revision 327173)
@@ -1,1347 +1,1344 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1991, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Socket operations for use by nfs
*/
#include "opt_kgssapi.h"
#include "opt_nfs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <rpc/rpc.h>
#include <rpc/krpc.h>
#include <kgssapi/krb5/kcrypto.h>
#include <fs/nfs/nfsport.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_nfsclient_nfs23_start_probe_func_t
dtrace_nfscl_nfs234_start_probe;
dtrace_nfsclient_nfs23_done_probe_func_t
dtrace_nfscl_nfs234_done_probe;
/*
* Registered probes by RPC type.
*/
uint32_t nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
uint32_t nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
#endif
NFSSTATESPINLOCK;
NFSREQSPINLOCK;
NFSDLOCKMUTEX;
NFSCLSTATEMUTEX;
extern struct nfsstatsv1 nfsstatsv1;
extern struct nfsreqhead nfsd_reqq;
extern int nfscl_ticks;
extern void (*ncl_call_invalcaches)(struct vnode *);
extern int nfs_numnfscbd;
extern int nfscl_debuglevel;
SVCPOOL *nfscbd_pool;
static int nfsrv_gsscallbackson = 0;
static int nfs_bufpackets = 4;
static int nfs_reconnects;
static int nfs3_jukebox_delay = 10;
static int nfs_skip_wcc_data_onerr = 1;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
"Buffer reservation size 2 < x < 64");
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
"Number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
"Number of seconds to delay a retry after receiving EJUKEBOX");
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
"Disable weak cache consistency checking when server returns an error");
static void nfs_down(struct nfsmount *, struct thread *, const char *,
int, int);
static void nfs_up(struct nfsmount *, struct thread *, const char *,
int, int);
static int nfs_msg(struct thread *, const char *, const char *, int);
struct nfs_cached_auth {
int ca_refs; /* refcount, including 1 from the cache */
uid_t ca_uid; /* uid that corresponds to this auth */
AUTH *ca_auth; /* RPC auth handle */
};
static int nfsv2_procid[NFS_V3NPROCS] = {
NFSV2PROC_NULL,
NFSV2PROC_GETATTR,
NFSV2PROC_SETATTR,
NFSV2PROC_LOOKUP,
NFSV2PROC_NOOP,
NFSV2PROC_READLINK,
NFSV2PROC_READ,
NFSV2PROC_WRITE,
NFSV2PROC_CREATE,
NFSV2PROC_MKDIR,
NFSV2PROC_SYMLINK,
NFSV2PROC_CREATE,
NFSV2PROC_REMOVE,
NFSV2PROC_RMDIR,
NFSV2PROC_RENAME,
NFSV2PROC_LINK,
NFSV2PROC_READDIR,
NFSV2PROC_NOOP,
NFSV2PROC_STATFS,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
};
/*
* Initialize sockets and congestion for a new NFS connection.
* We do not free the sockaddr if error.
*/
int
newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
struct ucred *cred, NFSPROC_T *p, int callback_retry_mult)
{
int rcvreserve, sndreserve;
int pktscale, pktscalesav;
struct sockaddr *saddr;
struct ucred *origcred;
CLIENT *client;
struct netconfig *nconf;
struct socket *so;
int one = 1, retries, error = 0;
struct thread *td = curthread;
SVCXPRT *xprt;
struct timeval timo;
/*
* We need to establish the socket using the credentials of
* the mountpoint. Some parts of this process (such as
* sobind() and soconnect()) will use the current thread's
* credential instead of the socket credential. To work
* around this, temporarily change the current thread's
* credential to that of the mountpoint.
*
* XXX: It would be better to explicitly pass the correct
* credential to sobind() and soconnect().
*/
origcred = td->td_ucred;
/*
* Use the credential in nr_cred, if not NULL.
*/
if (nrp->nr_cred != NULL)
td->td_ucred = nrp->nr_cred;
else
td->td_ucred = cred;
saddr = nrp->nr_nam;
if (saddr->sa_family == AF_INET)
if (nrp->nr_sotype == SOCK_DGRAM)
nconf = getnetconfigent("udp");
else
nconf = getnetconfigent("tcp");
else if (saddr->sa_family == AF_LOCAL)
nconf = getnetconfigent("local");
else
if (nrp->nr_sotype == SOCK_DGRAM)
nconf = getnetconfigent("udp6");
else
nconf = getnetconfigent("tcp6");
pktscale = nfs_bufpackets;
if (pktscale < 2)
pktscale = 2;
if (pktscale > 64)
pktscale = 64;
pktscalesav = pktscale;
/*
* soreserve() can fail if sb_max is too small, so shrink pktscale
* and try again if there is an error.
* Print a log message suggesting increasing sb_max.
* Creating a socket and doing this is necessary since, if the
* reservation sizes are too large and will make soreserve() fail,
* the connection will work until a large send is attempted and
* then it will loop in the krpc code.
*/
so = NULL;
saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
nrp->nr_soproto, td->td_ucred, td);
if (error) {
td->td_ucred = origcred;
goto out;
}
do {
if (error != 0 && pktscale > 2) {
if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
pktscale == pktscalesav)
printf("Consider increasing kern.ipc.maxsockbuf\n");
pktscale--;
}
if (nrp->nr_sotype == SOCK_DGRAM) {
if (nmp != NULL) {
sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
pktscale;
rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
pktscale;
} else {
sndreserve = rcvreserve = 1024 * pktscale;
}
} else {
if (nrp->nr_sotype != SOCK_STREAM)
panic("nfscon sotype");
if (nmp != NULL) {
sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
sizeof (u_int32_t)) * pktscale;
rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
sizeof (u_int32_t)) * pktscale;
} else {
sndreserve = rcvreserve = 1024 * pktscale;
}
}
error = soreserve(so, sndreserve, rcvreserve);
if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
pktscale <= 2)
printf("Must increase kern.ipc.maxsockbuf or reduce"
" rsize, wsize\n");
} while (error != 0 && pktscale > 2);
soclose(so);
if (error) {
td->td_ucred = origcred;
goto out;
}
client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
nrp->nr_vers, sndreserve, rcvreserve);
CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
if (nmp != NULL) {
if ((nmp->nm_flag & NFSMNT_INT))
CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
if ((nmp->nm_flag & NFSMNT_RESVPORT))
CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
if (NFSHASSOFT(nmp)) {
if (nmp->nm_sotype == SOCK_DGRAM)
/*
* For UDP, the large timeout for a reconnect
* will be set to "nm_retry * nm_timeo / 2", so
* we only want to do 2 reconnect timeout
* retries.
*/
retries = 2;
else
retries = nmp->nm_retry;
} else
retries = INT_MAX;
/* cred == NULL for DS connects. */
if (NFSHASNFSV4N(nmp) && cred != NULL) {
/*
* Make sure the nfscbd_pool doesn't get destroyed
* while doing this.
*/
NFSD_LOCK();
if (nfs_numnfscbd > 0) {
nfs_numnfscbd++;
NFSD_UNLOCK();
xprt = svc_vc_create_backchannel(nfscbd_pool);
CLNT_CONTROL(client, CLSET_BACKCHANNEL, xprt);
NFSD_LOCK();
nfs_numnfscbd--;
if (nfs_numnfscbd == 0)
wakeup(&nfs_numnfscbd);
}
NFSD_UNLOCK();
}
} else {
/*
* Three cases:
* - Null RPC callback to client
* - Non-Null RPC callback to client, wait a little longer
* - upcalls to nfsuserd and gssd (clp == NULL)
*/
if (callback_retry_mult == 0) {
retries = NFSV4_UPCALLRETRY;
CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
} else {
retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
}
}
CLNT_CONTROL(client, CLSET_RETRIES, &retries);
if (nmp != NULL) {
/*
* For UDP, there are 2 timeouts:
* - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
* that does a retransmit of an RPC request using the same
* socket and xid. This is what you normally want to do,
* since NFS servers depend on "same xid" for their
* Duplicate Request Cache.
* - timeout specified in CLNT_CALL_MBUF(), which specifies when
* retransmits on the same socket should fail and a fresh
* socket created. Each of these timeouts counts as one
* CLSET_RETRIES as set above.
* Set the initial retransmit timeout for UDP. This timeout
* doesn't exist for TCP and the following call just fails,
* which is ok.
*/
timo.tv_sec = nmp->nm_timeo / NFS_HZ;
timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
}
mtx_lock(&nrp->nr_mtx);
if (nrp->nr_client != NULL) {
mtx_unlock(&nrp->nr_mtx);
/*
* Someone else already connected.
*/
CLNT_RELEASE(client);
} else {
nrp->nr_client = client;
/*
* Protocols that do not require connections may be optionally
* left unconnected for servers that reply from a port other
* than NFS_PORT.
*/
if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
mtx_unlock(&nrp->nr_mtx);
CLNT_CONTROL(client, CLSET_CONNECT, &one);
} else
mtx_unlock(&nrp->nr_mtx);
}
/* Restore current thread's credentials. */
td->td_ucred = origcred;
out:
NFSEXITCODE(error);
return (error);
}
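/*
 * Illustrative sketch, not part of the source above: the CLSET_RETRY_TIMEOUT
 * value computed in newnfs_connect() is just a tick count (nm_timeo, in
 * NFS_HZ units) converted into a struct timeval.  The standalone helper
 * below shows that arithmetic with the tick rate passed in explicitly;
 * ticks_to_timeval() is a hypothetical name.
 */
#include <stdio.h>
#include <sys/time.h>

static struct timeval
ticks_to_timeval(int ticks, int hz)
{
	struct timeval tv;

	tv.tv_sec = ticks / hz;				/* whole seconds */
	tv.tv_usec = (ticks % hz) * 1000000 / hz;	/* remainder as usec */
	return (tv);
}

int
main(void)
{
	/* 25 ticks at 10 ticks per second is 2.5 seconds. */
	struct timeval tv = ticks_to_timeval(25, 10);

	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return (0);
}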
/*
* NFS disconnect. Clean up and unlink.
*/
void
newnfs_disconnect(struct nfssockreq *nrp)
{
CLIENT *client;
mtx_lock(&nrp->nr_mtx);
if (nrp->nr_client != NULL) {
client = nrp->nr_client;
nrp->nr_client = NULL;
mtx_unlock(&nrp->nr_mtx);
rpc_gss_secpurge_call(client);
CLNT_CLOSE(client);
CLNT_RELEASE(client);
} else {
mtx_unlock(&nrp->nr_mtx);
}
}
static AUTH *
nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
char *srv_principal, gss_OID mech_oid, struct ucred *cred)
{
rpc_gss_service_t svc;
AUTH *auth;
switch (secflavour) {
case RPCSEC_GSS_KRB5:
case RPCSEC_GSS_KRB5I:
case RPCSEC_GSS_KRB5P:
if (!mech_oid) {
if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
return (NULL);
}
if (secflavour == RPCSEC_GSS_KRB5)
svc = rpc_gss_svc_none;
else if (secflavour == RPCSEC_GSS_KRB5I)
svc = rpc_gss_svc_integrity;
else
svc = rpc_gss_svc_privacy;
if (clnt_principal == NULL)
auth = rpc_gss_secfind_call(nrp->nr_client, cred,
srv_principal, mech_oid, svc);
else {
auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
clnt_principal, srv_principal, "kerberosv5",
svc, NULL, NULL, NULL);
return (auth);
}
if (auth != NULL)
return (auth);
/* fallthrough */
case AUTH_SYS:
default:
return (authunix_create(cred));
}
}
/*
* Callback from the RPC code to generate up/down notifications.
*/
struct nfs_feedback_arg {
struct nfsmount *nf_mount;
int nf_lastmsg; /* last tprintf */
int nf_tprintfmsg;
struct thread *nf_td;
};
static void
nfs_feedback(int type, int proc, void *arg)
{
struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
struct nfsmount *nmp = nf->nf_mount;
time_t now;
switch (type) {
case FEEDBACK_REXMIT2:
case FEEDBACK_RECONNECT:
now = NFSD_MONOSEC;
if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
nfs_down(nmp, nf->nf_td,
"not responding", 0, NFSSTA_TIMEO);
nf->nf_tprintfmsg = TRUE;
nf->nf_lastmsg = now;
}
break;
case FEEDBACK_OK:
nfs_up(nf->nf_mount, nf->nf_td,
"is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
break;
}
}
/*
* newnfs_request - goes something like this
* - does the rpc by calling the krpc layer
* - break down rpc header and return with nfs reply
* nb: always frees up nd_mreq mbuf list
*/
int
newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
{
uint32_t retseq, retval, slotseq, *tl;
time_t waituntil;
int i = 0, j = 0, opcnt, set_sigset = 0, slot;
- int trycnt, error = 0, usegssname = 0, secflavour = AUTH_SYS;
+ int error = 0, usegssname = 0, secflavour = AUTH_SYS;
int freeslot, maxslot, reterr, slotpos, timeo;
u_int16_t procnum;
u_int trylater_delay = 1;
struct nfs_feedback_arg nf;
struct timeval timo;
AUTH *auth;
struct rpc_callextra ext;
enum clnt_stat stat;
struct nfsreq *rep = NULL;
char *srv_principal = NULL, *clnt_principal = NULL;
sigset_t oldset;
struct ucred *authcred;
struct nfsclsession *sep;
uint8_t sessionid[NFSX_V4SESSIONID];
sep = dssep;
if (xidp != NULL)
*xidp = 0;
/* Reject requests while attempting a forced unmount. */
if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
m_freem(nd->nd_mreq);
return (ESTALE);
}
/*
* Set authcred, which is used to acquire RPC credentials to
* the cred argument, by default. The crhold() should not be
* necessary, but will ensure that some future code change
* doesn't result in the credential being free'd prematurely.
*/
authcred = crhold(cred);
/* For client side interruptible mounts, mask off the signals. */
if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
newnfs_set_sigmask(td, &oldset);
set_sigset = 1;
}
/*
* XXX if not already connected call nfs_connect now. Longer
* term, change nfs_mount to call nfs_connect unconditionally
* and let clnt_reconnect_create handle reconnects.
*/
if (nrp->nr_client == NULL)
newnfs_connect(nmp, nrp, cred, td, 0);
/*
* For a client side mount, nmp is != NULL and clp == NULL. For
* server calls (callbacks or upcalls), nmp == NULL.
*/
if (clp != NULL) {
NFSLOCKSTATE();
if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
secflavour = RPCSEC_GSS_KRB5;
if (nd->nd_procnum != NFSPROC_NULL) {
if (clp->lc_flags & LCL_GSSINTEGRITY)
secflavour = RPCSEC_GSS_KRB5I;
else if (clp->lc_flags & LCL_GSSPRIVACY)
secflavour = RPCSEC_GSS_KRB5P;
}
}
NFSUNLOCKSTATE();
} else if (nmp != NULL && NFSHASKERB(nmp) &&
nd->nd_procnum != NFSPROC_NULL) {
if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
nd->nd_flag |= ND_USEGSSNAME;
if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
/*
* If there is a client side host based credential,
* use that, otherwise use the system uid, if set.
* The system uid is in the nmp->nm_sockreq.nr_cred
* credentials.
*/
if (nmp->nm_krbnamelen > 0) {
usegssname = 1;
clnt_principal = nmp->nm_krbname;
} else if (nmp->nm_uid != (uid_t)-1) {
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
} else if (nmp->nm_krbnamelen == 0 &&
nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
/*
* If there is no host based principal name and
* the system uid is set and this is root, use the
* system uid, since root won't have user
* credentials in a credentials cache file.
* The system uid is in the nmp->nm_sockreq.nr_cred
* credentials.
*/
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
if (NFSHASINTEGRITY(nmp))
secflavour = RPCSEC_GSS_KRB5I;
else if (NFSHASPRIVACY(nmp))
secflavour = RPCSEC_GSS_KRB5P;
else
secflavour = RPCSEC_GSS_KRB5;
srv_principal = NFSMNT_SRVKRBNAME(nmp);
} else if (nmp != NULL && !NFSHASKERB(nmp) &&
nd->nd_procnum != NFSPROC_NULL &&
(nd->nd_flag & ND_USEGSSNAME) != 0) {
/*
* Use the uid that did the mount when the RPC is doing
* NFSv4 system operations, as indicated by the
* ND_USEGSSNAME flag, for the AUTH_SYS case.
* The credentials in nm_sockreq.nr_cred were used for the
* mount.
*/
KASSERT(nmp->nm_sockreq.nr_cred != NULL,
("newnfs_request: NULL nr_cred"));
crfree(authcred);
authcred = crhold(nmp->nm_sockreq.nr_cred);
}
if (nmp != NULL) {
bzero(&nf, sizeof(struct nfs_feedback_arg));
nf.nf_mount = nmp;
nf.nf_td = td;
nf.nf_lastmsg = NFSD_MONOSEC -
((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
}
if (nd->nd_procnum == NFSPROC_NULL)
auth = authnone_create();
else if (usegssname) {
/*
* For this case, the authenticator is held in the
* nfssockreq structure, so don't release the reference count
* held on it. --> Don't AUTH_DESTROY() it in this function.
*/
if (nrp->nr_auth == NULL)
nrp->nr_auth = nfs_getauth(nrp, secflavour,
clnt_principal, srv_principal, NULL, authcred);
else
rpc_gss_refresh_auth_call(nrp->nr_auth);
auth = nrp->nr_auth;
} else
auth = nfs_getauth(nrp, secflavour, NULL,
srv_principal, NULL, authcred);
crfree(authcred);
if (auth == NULL) {
m_freem(nd->nd_mreq);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (EACCES);
}
bzero(&ext, sizeof(ext));
ext.rc_auth = auth;
if (nmp != NULL) {
ext.rc_feedback = nfs_feedback;
ext.rc_feedback_arg = &nf;
}
procnum = nd->nd_procnum;
if ((nd->nd_flag & ND_NFSV4) &&
nd->nd_procnum != NFSPROC_NULL &&
nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
procnum = NFSV4PROC_COMPOUND;
if (nmp != NULL) {
NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
/* Map the procnum to the old NFSv2 one, as required. */
if ((nd->nd_flag & ND_NFSV2) != 0) {
if (nd->nd_procnum < NFS_V3NPROCS)
procnum = nfsv2_procid[nd->nd_procnum];
else
procnum = NFSV2PROC_NOOP;
}
/*
* Now only used for the R_DONTRECOVER case, but until that is
* supported within the krpc code, I need to keep a queue of
* outstanding RPCs for nfsv4 client requests.
*/
if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq),
M_NFSDREQ, M_WAITOK);
#ifdef KDTRACE_HOOKS
if (dtrace_nfscl_nfs234_start_probe != NULL) {
uint32_t probe_id;
int probe_procnum;
if (nd->nd_flag & ND_NFSV4) {
probe_id =
nfscl_nfs4_start_probes[nd->nd_procnum];
probe_procnum = nd->nd_procnum;
} else if (nd->nd_flag & ND_NFSV3) {
probe_id = nfscl_nfs3_start_probes[procnum];
probe_procnum = procnum;
} else {
probe_id =
nfscl_nfs2_start_probes[nd->nd_procnum];
probe_procnum = procnum;
}
if (probe_id != 0)
(dtrace_nfscl_nfs234_start_probe)
(probe_id, vp, nd->nd_mreq, cred,
probe_procnum);
}
#endif
}
- trycnt = 0;
freeslot = -1; /* Set to slot that needs to be free'd */
tryagain:
slot = -1; /* Slot that needs a sequence# increment. */
/*
* This timeout specifies when a new socket should be created,
* along with new xid values. For UDP, this should be done
* infrequently, since retransmits of RPC requests should normally
* use the same xid.
*/
if (nmp == NULL) {
timo.tv_usec = 0;
if (clp == NULL)
timo.tv_sec = NFSV4_UPCALLTIMEO;
else
timo.tv_sec = NFSV4_CALLBACKTIMEO;
} else {
if (nrp->nr_sotype != SOCK_DGRAM) {
timo.tv_usec = 0;
if ((nmp->nm_flag & NFSMNT_NFSV4))
timo.tv_sec = INT_MAX;
else
timo.tv_sec = NFS_TCPTIMEO;
} else {
if (NFSHASSOFT(nmp)) {
/*
* CLSET_RETRIES is set to 2, so this should be
* half of the total timeout required.
*/
timeo = nmp->nm_retry * nmp->nm_timeo / 2;
if (timeo < 1)
timeo = 1;
timo.tv_sec = timeo / NFS_HZ;
timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
NFS_HZ;
} else {
/* For UDP hard mounts, use a large value. */
timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
timo.tv_usec = 0;
}
}
if (rep != NULL) {
rep->r_flags = 0;
rep->r_nmp = nmp;
/*
* Chain request into list of outstanding requests.
*/
NFSLOCKREQ();
TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
NFSUNLOCKREQ();
}
}
nd->nd_mrep = NULL;
if (clp != NULL && sep != NULL)
stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
else
stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
nd->nd_mreq, &nd->nd_mrep, timo);
if (rep != NULL) {
/*
* RPC done, unlink the request.
*/
NFSLOCKREQ();
TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
NFSUNLOCKREQ();
}
/*
* If there was a successful reply and a tprintf msg,
* tprintf a response.
*/
if (stat == RPC_SUCCESS) {
error = 0;
} else if (stat == RPC_TIMEDOUT) {
NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
error = ETIMEDOUT;
} else if (stat == RPC_VERSMISMATCH) {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EOPNOTSUPP;
} else if (stat == RPC_PROGVERSMISMATCH) {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EPROTONOSUPPORT;
} else if (stat == RPC_INTR) {
error = EINTR;
} else {
NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
error = EACCES;
}
if (error) {
m_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (error);
}
KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
/*
* Search for any mbufs that are not a multiple of 4 bytes long
* or with m_data not longword aligned.
* These could cause pointer alignment problems, so copy them to
* well aligned mbufs.
*/
newnfs_realign(&nd->nd_mrep, M_WAITOK);
nd->nd_md = nd->nd_mrep;
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
nd->nd_repstat = 0;
if (nd->nd_procnum != NFSPROC_NULL &&
nd->nd_procnum != NFSV4PROC_CBNULL) {
/* If sep == NULL, set it to the default in nmp. */
if (sep == NULL && nmp != NULL)
sep = nfsmnt_mdssession(nmp);
/*
* and now the actual NFS xdr.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
if (nd->nd_repstat >= 10000)
NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
(int)nd->nd_repstat);
/*
* Get rid of the tag, return count and SEQUENCE result for
* NFSv4.
*/
if ((nd->nd_flag & ND_NFSV4) != 0) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
opcnt = fxdr_unsigned(int, *tl++);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
if (j >= 10000)
NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
/*
* If the first op is Sequence, free up the slot.
*/
if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
(clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0))
NFSCL_DEBUG(1, "failed seq=%d\n", j);
if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
(clp != NULL && i == NFSV4OP_CBSEQUENCE && j == 0)
) {
if (i == NFSV4OP_SEQUENCE)
NFSM_DISSECT(tl, uint32_t *,
NFSX_V4SESSIONID +
5 * NFSX_UNSIGNED);
else
NFSM_DISSECT(tl, uint32_t *,
NFSX_V4SESSIONID +
4 * NFSX_UNSIGNED);
mtx_lock(&sep->nfsess_mtx);
if (bcmp(tl, sep->nfsess_sessionid,
NFSX_V4SESSIONID) == 0) {
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
retseq = fxdr_unsigned(uint32_t, *tl++);
slot = fxdr_unsigned(int, *tl++);
freeslot = slot;
if (retseq != sep->nfsess_slotseq[slot])
printf("retseq diff 0x%x\n",
retseq);
retval = fxdr_unsigned(uint32_t, *++tl);
if ((retval + 1) < sep->nfsess_foreslots
)
sep->nfsess_foreslots = (retval
+ 1);
else if ((retval + 1) >
sep->nfsess_foreslots)
sep->nfsess_foreslots = (retval
< 64) ? (retval + 1) : 64;
}
mtx_unlock(&sep->nfsess_mtx);
/* Grab the op and status for the next one. */
if (opcnt > 1) {
NFSM_DISSECT(tl, uint32_t *,
2 * NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
}
}
}
if (nd->nd_repstat != 0) {
if (nd->nd_repstat == NFSERR_BADSESSION &&
nmp != NULL && dssep == NULL) {
/*
* If this is a client side MDS RPC, mark
* the MDS session defunct and initiate
* recovery, as required.
* The nfsess_defunct field is protected by
* the NFSLOCKMNT()/nm_mtx lock and not the
* nfsess_mtx lock to simplify its handling,
* for the MDS session. This lock is also
* sufficient for nfsess_sessionid, since it
* never changes in the structure.
*/
NFSCL_DEBUG(1, "Got badsession\n");
NFSLOCKCLSTATE();
NFSLOCKMNT(nmp);
sep = NFSMNT_MDSSESSION(nmp);
if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
NFSX_V4SESSIONID) == 0) {
/* Initiate recovery. */
sep->nfsess_defunct = 1;
NFSCL_DEBUG(1, "Marked defunct\n");
if (nmp->nm_clp != NULL) {
nmp->nm_clp->nfsc_flags |=
NFSCLFLAGS_RECOVER;
wakeup(nmp->nm_clp);
}
}
NFSUNLOCKCLSTATE();
/*
* Sleep for up to 1sec waiting for a new
* session.
*/
mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
"nfsbadsess", hz);
/*
* Get the session again, in case a new one
* has been created during the sleep.
*/
sep = NFSMNT_MDSSESSION(nmp);
NFSUNLOCKMNT(nmp);
if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
reterr = nfsv4_sequencelookup(nmp, sep,
&slotpos, &maxslot, &slotseq,
sessionid);
if (reterr == 0) {
/* Fill in new session info. */
NFSCL_DEBUG(1,
"Filling in new sequence\n");
tl = nd->nd_sequence;
bcopy(sessionid, tl,
NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID /
NFSX_UNSIGNED;
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl = txdr_unsigned(maxslot);
}
if (reterr == NFSERR_BADSESSION ||
reterr == 0) {
NFSCL_DEBUG(1,
"Badsession looping\n");
m_freem(nd->nd_mrep);
nd->nd_mrep = NULL;
goto tryagain;
}
nd->nd_repstat = reterr;
NFSCL_DEBUG(1, "Got err=%d\n", reterr);
}
}
if (((nd->nd_repstat == NFSERR_DELAY ||
nd->nd_repstat == NFSERR_GRACE) &&
(nd->nd_flag & ND_NFSV4) &&
nd->nd_procnum != NFSPROC_DELEGRETURN &&
nd->nd_procnum != NFSPROC_SETATTR &&
nd->nd_procnum != NFSPROC_READ &&
nd->nd_procnum != NFSPROC_READDS &&
nd->nd_procnum != NFSPROC_WRITE &&
nd->nd_procnum != NFSPROC_WRITEDS &&
nd->nd_procnum != NFSPROC_OPEN &&
nd->nd_procnum != NFSPROC_CREATE &&
nd->nd_procnum != NFSPROC_OPENCONFIRM &&
nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
nd->nd_procnum != NFSPROC_CLOSE &&
nd->nd_procnum != NFSPROC_LOCK &&
nd->nd_procnum != NFSPROC_LOCKU) ||
(nd->nd_repstat == NFSERR_DELAY &&
(nd->nd_flag & ND_NFSV4) == 0) ||
nd->nd_repstat == NFSERR_RESOURCE) {
if (trylater_delay > NFS_TRYLATERDEL)
trylater_delay = NFS_TRYLATERDEL;
waituntil = NFSD_MONOSEC + trylater_delay;
while (NFSD_MONOSEC < waituntil)
(void) nfs_catnap(PZERO, 0, "nfstry");
trylater_delay *= 2;
if (slot != -1) {
mtx_lock(&sep->nfsess_mtx);
sep->nfsess_slotseq[slot]++;
*nd->nd_slotseq = txdr_unsigned(
sep->nfsess_slotseq[slot]);
mtx_unlock(&sep->nfsess_mtx);
}
m_freem(nd->nd_mrep);
nd->nd_mrep = NULL;
goto tryagain;
}
/*
* If the File Handle was stale, invalidate the
* lookup cache, just in case.
* (vp != NULL implies a client side call)
*/
if (nd->nd_repstat == ESTALE && vp != NULL) {
cache_purge(vp);
if (ncl_call_invalcaches != NULL)
(*ncl_call_invalcaches)(vp);
}
}
if ((nd->nd_flag & ND_NFSV4) != 0) {
/* Free the slot, as required. */
if (freeslot != -1)
nfsv4_freeslot(sep, freeslot);
/*
* If this op is Putfh, throw its results away.
*/
if (j >= 10000)
NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl++);
j = fxdr_unsigned(int, *tl);
if (j >= 10000)
NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
j);
/*
* All Compounds that do an Op that must
* be in sequence consist of NFSV4OP_PUTFH
* followed by one of these. As such, we
* can determine if the seqid# should be
* incremented, here.
*/
if ((i == NFSV4OP_OPEN ||
i == NFSV4OP_OPENCONFIRM ||
i == NFSV4OP_OPENDOWNGRADE ||
i == NFSV4OP_CLOSE ||
i == NFSV4OP_LOCK ||
i == NFSV4OP_LOCKU) &&
(j == 0 ||
(j != NFSERR_STALECLIENTID &&
j != NFSERR_STALESTATEID &&
j != NFSERR_BADSTATEID &&
j != NFSERR_BADSEQID &&
j != NFSERR_BADXDR &&
j != NFSERR_RESOURCE &&
j != NFSERR_NOFILEHANDLE)))
nd->nd_flag |= ND_INCRSEQID;
}
/*
* If this op's status is non-zero, mark
* that there is no more data to process.
* The exception is Setattr, which always has xdr
* when it has failed.
*/
if (j != 0 && i != NFSV4OP_SETATTR)
nd->nd_flag |= ND_NOMOREDATA;
/*
* If R_DONTRECOVER is set, replace the stale error
* reply, so that recovery isn't initiated.
*/
if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
nd->nd_repstat == NFSERR_BADSESSION ||
nd->nd_repstat == NFSERR_STALESTATEID) &&
rep != NULL && (rep->r_flags & R_DONTRECOVER))
nd->nd_repstat = NFSERR_STALEDONTRECOVER;
}
}
#ifdef KDTRACE_HOOKS
if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
uint32_t probe_id;
int probe_procnum;
if (nd->nd_flag & ND_NFSV4) {
probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
probe_procnum = nd->nd_procnum;
} else if (nd->nd_flag & ND_NFSV3) {
probe_id = nfscl_nfs3_done_probes[procnum];
probe_procnum = procnum;
} else {
probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
probe_procnum = procnum;
}
if (probe_id != 0)
(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
nd->nd_mreq, cred, probe_procnum, 0);
}
#endif
m_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (0);
nfsmout:
mbuf_freem(nd->nd_mrep);
mbuf_freem(nd->nd_mreq);
if (usegssname == 0)
AUTH_DESTROY(auth);
if (rep != NULL)
FREE((caddr_t)rep, M_NFSDREQ);
if (set_sigset)
newnfs_restore_sigmask(td, &oldset);
return (error);
}
/*
* Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
* wait for all requests to complete. This is used by forced unmounts
* to terminate any outstanding RPCs.
*/
int
newnfs_nmcancelreqs(struct nfsmount *nmp)
{
struct nfsclds *dsp;
struct __rpc_client *cl;
if (nmp->nm_sockreq.nr_client != NULL)
CLNT_CLOSE(nmp->nm_sockreq.nr_client);
lookformore:
NFSLOCKMNT(nmp);
TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
NFSLOCKDS(dsp);
if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
(dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
dsp->nfsclds_sockp != NULL &&
dsp->nfsclds_sockp->nr_client != NULL) {
dsp->nfsclds_flags |= NFSCLDS_CLOSED;
cl = dsp->nfsclds_sockp->nr_client;
NFSUNLOCKDS(dsp);
NFSUNLOCKMNT(nmp);
CLNT_CLOSE(cl);
goto lookformore;
}
NFSUNLOCKDS(dsp);
}
NFSUNLOCKMNT(nmp);
return (0);
}
/*
* Any signal that can interrupt an NFS operation in an intr mount
* should be added to this set. SIGSTOP and SIGKILL cannot be masked.
*/
int newnfs_sig_set[] = {
SIGINT,
SIGTERM,
SIGHUP,
SIGKILL,
SIGQUIT
};
/*
* Check to see if one of the signals in our subset is pending on
* the process (in an intr mount).
*/
static int
nfs_sig_pending(sigset_t set)
{
int i;
for (i = 0 ; i < nitems(newnfs_sig_set); i++)
if (SIGISMEMBER(set, newnfs_sig_set[i]))
return (1);
return (0);
}
/*
* The set/restore sigmask functions are used to (temporarily) overwrite
* the thread td_sigmask during an RPC call (for example). These are also
* used in other places in the NFS client that might tsleep().
*/
void
newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
{
sigset_t newset;
int i;
struct proc *p;
SIGFILLSET(newset);
if (td == NULL)
td = curthread; /* XXX */
p = td->td_proc;
/* Remove the NFS set of signals from newset */
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
/*
* But make sure we leave the ones already masked
* by the process, i.e. remove the signal from the
* temporary signalmask only if it wasn't already
* in p_sigmask.
*/
if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
!SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
SIGDELSET(newset, newnfs_sig_set[i]);
}
mtx_unlock(&p->p_sigacts->ps_mtx);
kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
SIGPROCMASK_PROC_LOCKED);
PROC_UNLOCK(p);
}
void
newnfs_restore_sigmask(struct thread *td, sigset_t *set)
{
if (td == NULL)
td = curthread; /* XXX */
kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
}
/*
* NFS wrapper to msleep() that shoves a new p_sigmask and restores the
* old one after msleep() returns.
*/
int
newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
{
sigset_t oldset;
int error;
- struct proc *p;
-
+
if ((priority & PCATCH) == 0)
return msleep(ident, mtx, priority, wmesg, timo);
if (td == NULL)
td = curthread; /* XXX */
newnfs_set_sigmask(td, &oldset);
error = msleep(ident, mtx, priority, wmesg, timo);
newnfs_restore_sigmask(td, &oldset);
- p = td->td_proc;
return (error);
}
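/*
 * Illustrative sketch, not part of the source above: newnfs_msleep() wraps
 * a sleep in the same "block everything except the interruptible set, wait,
 * then restore" pattern that newnfs_set_sigmask()/newnfs_restore_sigmask()
 * implement.  The userland analogue below uses sigprocmask(2); the helper
 * name blocking_call_with_intr() is hypothetical.
 */
#include <signal.h>
#include <stddef.h>
#include <unistd.h>

static const int intr_sigs[] = { SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT };

static void
blocking_call_with_intr(void)
{
	sigset_t newset, oldset;
	size_t i;

	/* Block every signal except those allowed to interrupt the wait. */
	sigfillset(&newset);
	for (i = 0; i < sizeof(intr_sigs) / sizeof(intr_sigs[0]); i++)
		sigdelset(&newset, intr_sigs[i]);
	sigprocmask(SIG_SETMASK, &newset, &oldset);

	sleep(1);	/* stand-in for the interruptible msleep() */

	/* Restore the caller's original mask. */
	sigprocmask(SIG_SETMASK, &oldset, NULL);
}

int
main(void)
{
	blocking_call_with_intr();
	return (0);
}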
/*
* Test for a termination condition pending on the process.
* This is used for NFSMNT_INT mounts.
*/
int
newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
{
struct proc *p;
sigset_t tmpset;
/* Terminate all requests while attempting a forced unmount. */
if (NFSCL_FORCEDISM(nmp->nm_mountp))
return (EIO);
if (!(nmp->nm_flag & NFSMNT_INT))
return (0);
if (td == NULL)
return (0);
p = td->td_proc;
PROC_LOCK(p);
tmpset = p->p_siglist;
SIGSETOR(tmpset, td->td_siglist);
SIGSETNAND(tmpset, td->td_sigmask);
mtx_lock(&p->p_sigacts->ps_mtx);
SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
mtx_unlock(&p->p_sigacts->ps_mtx);
if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
&& nfs_sig_pending(tmpset)) {
PROC_UNLOCK(p);
return (EINTR);
}
PROC_UNLOCK(p);
return (0);
}
static int
nfs_msg(struct thread *td, const char *server, const char *msg, int error)
{
struct proc *p;
p = td ? td->td_proc : NULL;
if (error) {
tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
server, msg, error);
} else {
tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
}
return (0);
}
static void
nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
int error, int flags)
{
if (nmp == NULL)
return;
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
nmp->nm_state |= NFSSTA_TIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 0);
} else
mtx_unlock(&nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
nmp->nm_state |= NFSSTA_LOCKTIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 0);
} else
mtx_unlock(&nmp->nm_mtx);
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
}
static void
nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
int flags, int tprintfmsg)
{
if (nmp == NULL)
return;
if (tprintfmsg) {
nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
}
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
nmp->nm_state &= ~NFSSTA_TIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESP, 1);
} else
mtx_unlock(&nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
mtx_unlock(&nmp->nm_mtx);
vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
VQ_NOTRESPLOCK, 1);
} else
mtx_unlock(&nmp->nm_mtx);
}
Index: head/sys/fs/nfs/nfs_commonsubs.c
===================================================================
--- head/sys/fs/nfs/nfs_commonsubs.c (revision 327172)
+++ head/sys/fs/nfs/nfs_commonsubs.c (revision 327173)
@@ -1,4242 +1,4241 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* These functions support the macros and help fiddle mbuf chains for
* the nfs op functions. They do things like create the rpc header and
* copy data between mbuf chains and uio lists.
*/
#ifndef APPLEKEXT
#include "opt_inet6.h"
#include <fs/nfs/nfsport.h>
#include <security/mac/mac_framework.h>
/*
* Data items converted to xdr at startup, since they are constant
* This is kinda hokey, but may save a little time doing byte swaps
*/
u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
/* And other global data */
nfstype nfsv34_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFSOCK,
NFFIFO, NFNON };
enum vtype newnv2tov_type[8] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON };
enum vtype nv34tov_type[8]={ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO };
struct timeval nfsboottime; /* Copy boottime once, so it never changes */
int nfscl_ticks;
int nfsrv_useacl = 1;
struct nfssockreq nfsrv_nfsuserdsock;
int nfsrv_nfsuserd = 0;
struct nfsreqhead nfsd_reqq;
uid_t nfsrv_defaultuid = UID_NOBODY;
gid_t nfsrv_defaultgid = GID_NOGROUP;
int nfsrv_lease = NFSRV_LEASE;
int ncl_mbuf_mlen = MLEN;
int nfsd_enable_stringtouid = 0;
static int nfs_enable_uidtostring = 0;
NFSNAMEIDMUTEX;
NFSSOCKMUTEX;
extern int nfsrv_lughashsize;
SYSCTL_DECL(_vfs_nfs);
SYSCTL_INT(_vfs_nfs, OID_AUTO, enable_uidtostring, CTLFLAG_RW,
&nfs_enable_uidtostring, 0, "Make nfs always send numeric owner_names");
/*
* This array of structures indicates, for V4:
* retfh - which of 3 types of calling args are used
* 0 - doesn't change cfh or use a sfh
* 1 - replaces cfh with a new one (unless it returns an error status)
* 2 - uses cfh and sfh
* needscfh - if the op wants a cfh and premtime
* 0 - doesn't use a cfh
* 1 - uses a cfh, but doesn't want pre-op attributes
* 2 - uses a cfh and wants pre-op attributes
* savereply - indicates a non-idempotent Op
* 0 - not non-idempotent
* 1 - non-idempotent
* Ops that are ordered via seqid# are handled separately from these
* non-idempotent Ops.
* Define it here, since it is used by both the client and server.
*/
struct nfsv4_opflag nfsv4_opflag[NFSV41_NOPS] = {
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* undef */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Access */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Close */
{ 0, 2, 0, 1, LK_EXCLUSIVE, 1, 1 }, /* Commit */
{ 1, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Create */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegpurge */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Delegreturn */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Getattr */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* GetFH */
{ 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Link */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Lock */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockT */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* LockU */
{ 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookup */
{ 1, 2, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Lookupp */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* NVerify */
{ 1, 1, 0, 1, LK_EXCLUSIVE, 1, 0 }, /* Open */
{ 1, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenAttr */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenConfirm */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* OpenDowngrade */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutFH */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutPubFH */
{ 1, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* PutRootFH */
{ 0, 1, 0, 0, LK_SHARED, 1, 0 }, /* Read */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* Readdir */
{ 0, 1, 0, 0, LK_SHARED, 1, 1 }, /* ReadLink */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Remove */
{ 2, 1, 1, 1, LK_EXCLUSIVE, 1, 1 }, /* Rename */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Renew */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* RestoreFH */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SaveFH */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SecInfo */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Setattr */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* SetClientIDConfirm */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Verify */
{ 0, 2, 1, 1, LK_EXCLUSIVE, 1, 0 }, /* Write */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* ReleaseLockOwner */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Backchannel Ctrl */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Bind Conn to Sess */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Exchange ID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Create Session */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy Session */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Free StateID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Dir Deleg */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device Info */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Get Device List */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Layout Commit */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Layout Get */
{ 0, 1, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Layout Return */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Secinfo No name */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Sequence */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Set SSV */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Test StateID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 1 }, /* Want Delegation */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 0, 0 }, /* Destroy ClientID */
{ 0, 0, 0, 0, LK_EXCLUSIVE, 1, 0 }, /* Reclaim Complete */
};
#endif /* !APPLEKEXT */
static int ncl_mbuf_mhlen = MHLEN;
static int nfsrv_usercnt = 0;
static int nfsrv_dnsnamelen;
static u_char *nfsrv_dnsname = NULL;
static int nfsrv_usermax = 999999999;
struct nfsrv_lughash {
struct mtx mtx;
struct nfsuserhashhead lughead;
};
static struct nfsrv_lughash *nfsuserhash;
static struct nfsrv_lughash *nfsusernamehash;
static struct nfsrv_lughash *nfsgrouphash;
static struct nfsrv_lughash *nfsgroupnamehash;
/*
* This static array indicates whether or not the RPC generates a large
* reply. This is used by nfs_reply() to decide whether or not an mbuf
* cluster should be allocated. (If a cluster is required by an RPC
* marked 0 in this array, the code will still work, just not quite as
* efficiently.)
*/
int nfs_bigreply[NFSV41_NPROCS] = { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 };
/* local functions */
static int nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep);
static void nfsv4_wanted(struct nfsv4lock *lp);
static int nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len);
static int nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name,
NFSPROC_T *p);
static void nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser);
static int nfsrv_getrefstr(struct nfsrv_descript *, u_char **, u_char **,
int *, int *);
static void nfsrv_refstrbigenough(int, u_char **, u_char **, int *);
#ifndef APPLE
/*
* copies mbuf chain to the uio scatter/gather list
*/
int
nfsm_mbufuio(struct nfsrv_descript *nd, struct uio *uiop, int siz)
{
char *mbufcp, *uiocp;
int xfer, left, len;
mbuf_t mp;
long uiosiz, rem;
int error = 0;
mp = nd->nd_md;
mbufcp = nd->nd_dpos;
len = NFSMTOD(mp, caddr_t) + mbuf_len(mp) - mbufcp;
rem = NFSM_RNDUP(siz) - siz;
while (siz > 0) {
if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) {
error = EBADRPC;
goto out;
}
left = uiop->uio_iov->iov_len;
uiocp = uiop->uio_iov->iov_base;
if (left > siz)
left = siz;
uiosiz = left;
while (left > 0) {
while (len == 0) {
mp = mbuf_next(mp);
if (mp == NULL) {
error = EBADRPC;
goto out;
}
mbufcp = NFSMTOD(mp, caddr_t);
len = mbuf_len(mp);
KASSERT(len >= 0,
("len %d, corrupted mbuf?", len));
}
xfer = (left > len) ? len : left;
#ifdef notdef
/* Not Yet.. */
if (uiop->uio_iov->iov_op != NULL)
(*(uiop->uio_iov->iov_op))
(mbufcp, uiocp, xfer);
else
#endif
if (uiop->uio_segflg == UIO_SYSSPACE)
NFSBCOPY(mbufcp, uiocp, xfer);
else
copyout(mbufcp, CAST_USER_ADDR_T(uiocp), xfer);
left -= xfer;
len -= xfer;
mbufcp += xfer;
uiocp += xfer;
uiop->uio_offset += xfer;
uiop->uio_resid -= xfer;
}
if (uiop->uio_iov->iov_len <= siz) {
uiop->uio_iovcnt--;
uiop->uio_iov++;
} else {
uiop->uio_iov->iov_base = (void *)
((char *)uiop->uio_iov->iov_base + uiosiz);
uiop->uio_iov->iov_len -= uiosiz;
}
siz -= uiosiz;
}
nd->nd_dpos = mbufcp;
nd->nd_md = mp;
if (rem > 0) {
if (len < rem)
error = nfsm_advance(nd, rem, len);
else
nd->nd_dpos += rem;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
#endif /* !APPLE */
/*
* Help break down an mbuf chain by making the first siz bytes contiguous,
* pointed to by the returned val.
* This is used by the macro NFSM_DISSECT for tough
* cases.
*/
APPLESTATIC void *
nfsm_dissct(struct nfsrv_descript *nd, int siz, int how)
{
mbuf_t mp2;
int siz2, xfer;
caddr_t p;
int left;
caddr_t retp;
retp = NULL;
left = NFSMTOD(nd->nd_md, caddr_t) + mbuf_len(nd->nd_md) - nd->nd_dpos;
while (left == 0) {
nd->nd_md = mbuf_next(nd->nd_md);
if (nd->nd_md == NULL)
return (retp);
left = mbuf_len(nd->nd_md);
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
}
if (left >= siz) {
retp = nd->nd_dpos;
nd->nd_dpos += siz;
} else if (mbuf_next(nd->nd_md) == NULL) {
return (retp);
} else if (siz > ncl_mbuf_mhlen) {
panic("nfs S too big");
} else {
MGET(mp2, MT_DATA, how);
if (mp2 == NULL)
return (NULL);
mbuf_setnext(mp2, mbuf_next(nd->nd_md));
mbuf_setnext(nd->nd_md, mp2);
mbuf_setlen(nd->nd_md, mbuf_len(nd->nd_md) - left);
nd->nd_md = mp2;
retp = p = NFSMTOD(mp2, caddr_t);
NFSBCOPY(nd->nd_dpos, p, left); /* Copy what was left */
siz2 = siz - left;
p += left;
mp2 = mbuf_next(mp2);
/* Loop around copying up the siz2 bytes */
while (siz2 > 0) {
if (mp2 == NULL)
return (NULL);
xfer = (siz2 > mbuf_len(mp2)) ? mbuf_len(mp2) : siz2;
if (xfer > 0) {
NFSBCOPY(NFSMTOD(mp2, caddr_t), p, xfer);
NFSM_DATAP(mp2, xfer);
mbuf_setlen(mp2, mbuf_len(mp2) - xfer);
p += xfer;
siz2 -= xfer;
}
if (siz2 > 0)
mp2 = mbuf_next(mp2);
}
mbuf_setlen(nd->nd_md, siz);
nd->nd_md = mp2;
nd->nd_dpos = NFSMTOD(mp2, caddr_t);
}
return (retp);
}
/*
* Advance the position in the mbuf chain.
* If offs == 0, this is a no-op, but it is simpler to just return from
* here than check for offs > 0 for all calls to nfsm_advance.
* If left == -1, it should be calculated here.
*/
APPLESTATIC int
nfsm_advance(struct nfsrv_descript *nd, int offs, int left)
{
int error = 0;
if (offs == 0)
goto out;
/*
* A negative offs should be considered a serious problem.
*/
if (offs < 0)
panic("nfsrv_advance");
/*
* If left == -1, calculate it here.
*/
if (left == -1)
left = NFSMTOD(nd->nd_md, caddr_t) + mbuf_len(nd->nd_md) -
nd->nd_dpos;
/*
* Loop around, advancing over the mbuf data.
*/
while (offs > left) {
offs -= left;
nd->nd_md = mbuf_next(nd->nd_md);
if (nd->nd_md == NULL) {
error = EBADRPC;
goto out;
}
left = mbuf_len(nd->nd_md);
nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
}
nd->nd_dpos += offs;
out:
NFSEXITCODE(error);
return (error);
}
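/*
 * Illustrative sketch, not part of the source above: the core of
 * nfsm_advance() is moving a (buffer, offset) cursor forward across a
 * singly linked chain of buffers.  The minimal standalone version below
 * uses a hypothetical struct chainbuf in place of an mbuf.
 */
#include <stddef.h>

struct chainbuf {
	struct chainbuf	*next;
	char		*data;
	size_t		 len;
};

struct cursor {
	struct chainbuf	*buf;
	size_t		 off;	/* offset into buf->data */
};

/* Advance by nbytes; returns 0 on success, -1 if the chain runs out. */
static int
cursor_advance(struct cursor *c, size_t nbytes)
{
	size_t left = c->buf->len - c->off;

	while (nbytes > left) {
		nbytes -= left;
		c->buf = c->buf->next;
		if (c->buf == NULL)
			return (-1);	/* ran off the end, like EBADRPC */
		c->off = 0;
		left = c->buf->len;
	}
	c->off += nbytes;
	return (0);
}

int
main(void)
{
	char a[] = "abcd", b[] = "efgh";
	struct chainbuf b2 = { NULL, b, 4 };
	struct chainbuf b1 = { &b2, a, 4 };
	struct cursor c = { &b1, 0 };

	/* Skipping 6 bytes lands at offset 2 of the second buffer. */
	return (cursor_advance(&c, 6) == 0 && c.buf == &b2 && c.off == 2 ?
	    0 : 1);
}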
/*
* Copy a string into mbuf(s).
* Return the number of bytes output, including XDR overheads.
*/
APPLESTATIC int
nfsm_strtom(struct nfsrv_descript *nd, const char *cp, int siz)
{
mbuf_t m2;
int xfer, left;
mbuf_t m1;
int rem, bytesize;
u_int32_t *tl;
char *cp2;
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(siz);
rem = NFSM_RNDUP(siz) - siz;
bytesize = NFSX_UNSIGNED + siz + rem;
m2 = nd->nd_mb;
cp2 = nd->nd_bpos;
left = M_TRAILINGSPACE(m2);
/*
* Loop around copying the string to mbuf(s).
*/
while (siz > 0) {
if (left == 0) {
if (siz > ncl_mbuf_mlen)
NFSMCLGET(m1, M_WAITOK);
else
NFSMGET(m1);
mbuf_setlen(m1, 0);
mbuf_setnext(m2, m1);
m2 = m1;
cp2 = NFSMTOD(m2, caddr_t);
left = M_TRAILINGSPACE(m2);
}
if (left >= siz)
xfer = siz;
else
xfer = left;
NFSBCOPY(cp, cp2, xfer);
cp += xfer;
mbuf_setlen(m2, mbuf_len(m2) + xfer);
siz -= xfer;
left -= xfer;
if (siz == 0 && rem) {
if (left < rem)
panic("nfsm_strtom");
NFSBZERO(cp2 + xfer, rem);
mbuf_setlen(m2, mbuf_len(m2) + rem);
}
}
nd->nd_mb = m2;
nd->nd_bpos = NFSMTOD(m2, caddr_t) + mbuf_len(m2);
return (bytesize);
}
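/*
 * Illustrative sketch, not part of the source above: nfsm_strtom() emits
 * the XDR opaque layout, a 4-byte big-endian length word, the data, and
 * zero padding up to the next 4-byte boundary (rem = NFSM_RNDUP(siz) - siz).
 * The standalone encoder below writes that layout into a flat buffer;
 * xdr_put_opaque() and XDR_RNDUP() are hypothetical names.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define	XDR_RNDUP(n)	(((n) + 3) & ~3u)

/* Returns the number of bytes written: length word + data + padding. */
static size_t
xdr_put_opaque(uint8_t *dst, const void *src, uint32_t len)
{
	uint32_t belen = htonl(len);
	size_t pad = XDR_RNDUP(len) - len;

	memcpy(dst, &belen, 4);			/* length word */
	memcpy(dst + 4, src, len);		/* payload */
	memset(dst + 4 + len, 0, pad);		/* zero fill to the boundary */
	return (4 + len + pad);
}

int
main(void)
{
	uint8_t buf[32];

	/* "abc" encodes as 4 (length) + 3 (data) + 1 (pad) = 8 bytes. */
	return (xdr_put_opaque(buf, "abc", 3) == 8 ? 0 : 1);
}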
/*
* Called once to initialize data structures...
*/
APPLESTATIC void
newnfs_init(void)
{
static int nfs_inited = 0;
if (nfs_inited)
return;
nfs_inited = 1;
newnfs_true = txdr_unsigned(TRUE);
newnfs_false = txdr_unsigned(FALSE);
newnfs_xdrneg1 = txdr_unsigned(-1);
nfscl_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
if (nfscl_ticks < 1)
nfscl_ticks = 1;
NFSSETBOOTTIME(nfsboottime);
/*
* Initialize reply list and start timer
*/
TAILQ_INIT(&nfsd_reqq);
NFS_TIMERINIT;
}
/*
* Put a file handle in an mbuf list.
* If the size argument == 0, just use the default size.
* set_true == 1 if there should be a newnfs_true prepended on the file handle.
* Return the number of bytes output, including XDR overhead.
*/
APPLESTATIC int
nfsm_fhtom(struct nfsrv_descript *nd, u_int8_t *fhp, int size, int set_true)
{
u_int32_t *tl;
u_int8_t *cp;
- int fullsiz, rem, bytesize = 0;
+ int fullsiz, bytesize = 0;
if (size == 0)
size = NFSX_MYFH;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
if (size > NFSX_V2FH)
panic("fh size > NFSX_V2FH for NFSv2");
NFSM_BUILD(cp, u_int8_t *, NFSX_V2FH);
NFSBCOPY(fhp, cp, size);
if (size < NFSX_V2FH)
NFSBZERO(cp + size, NFSX_V2FH - size);
bytesize = NFSX_V2FH;
break;
case ND_NFSV3:
case ND_NFSV4:
fullsiz = NFSM_RNDUP(size);
- rem = fullsiz - size;
if (set_true) {
bytesize = 2 * NFSX_UNSIGNED + fullsiz;
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
} else {
bytesize = NFSX_UNSIGNED + fullsiz;
}
(void) nfsm_strtom(nd, fhp, size);
break;
}
return (bytesize);
}
/*
* This function compares two net addresses by family and returns TRUE
* if they are the same host.
* If there is any doubt, return FALSE.
* The AF_INET family is handled as a special case so that address mbufs
* don't need to be saved to store "struct in_addr", which is only 4 bytes.
*/
APPLESTATIC int
nfsaddr_match(int family, union nethostaddr *haddr, NFSSOCKADDR_T nam)
{
struct sockaddr_in *inetaddr;
switch (family) {
case AF_INET:
inetaddr = NFSSOCKADDR(nam, struct sockaddr_in *);
if (inetaddr->sin_family == AF_INET &&
inetaddr->sin_addr.s_addr == haddr->had_inet.s_addr)
return (1);
break;
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *inetaddr6;
inetaddr6 = NFSSOCKADDR(nam, struct sockaddr_in6 *);
/* XXX - should test sin6_scope_id ? */
if (inetaddr6->sin6_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&inetaddr6->sin6_addr,
&haddr->had_inet6))
return (1);
}
break;
#endif
}
return (0);
}
/*
* Similar to the above, but takes two NFSSOCKADDR_T args.
*/
APPLESTATIC int
nfsaddr2_match(NFSSOCKADDR_T nam1, NFSSOCKADDR_T nam2)
{
struct sockaddr_in *addr1, *addr2;
struct sockaddr *inaddr;
inaddr = NFSSOCKADDR(nam1, struct sockaddr *);
switch (inaddr->sa_family) {
case AF_INET:
addr1 = NFSSOCKADDR(nam1, struct sockaddr_in *);
addr2 = NFSSOCKADDR(nam2, struct sockaddr_in *);
if (addr2->sin_family == AF_INET &&
addr1->sin_addr.s_addr == addr2->sin_addr.s_addr)
return (1);
break;
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *inet6addr1, *inet6addr2;
inet6addr1 = NFSSOCKADDR(nam1, struct sockaddr_in6 *);
inet6addr2 = NFSSOCKADDR(nam2, struct sockaddr_in6 *);
/* XXX - should test sin6_scope_id ? */
if (inet6addr2->sin6_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&inet6addr1->sin6_addr,
&inet6addr2->sin6_addr))
return (1);
}
break;
#endif
}
return (0);
}
/*
* Trim the stuff already dissected off the mbuf list.
*/
APPLESTATIC void
newnfs_trimleading(nd)
struct nfsrv_descript *nd;
{
mbuf_t m, n;
int offs;
/*
* First, free up leading mbufs.
*/
if (nd->nd_mrep != nd->nd_md) {
m = nd->nd_mrep;
while (mbuf_next(m) != nd->nd_md) {
if (mbuf_next(m) == NULL)
panic("nfsm trim leading");
m = mbuf_next(m);
}
mbuf_setnext(m, NULL);
mbuf_freem(nd->nd_mrep);
}
m = nd->nd_md;
/*
* Now, adjust this mbuf, based on nd_dpos.
*/
offs = nd->nd_dpos - NFSMTOD(m, caddr_t);
if (offs == mbuf_len(m)) {
n = m;
m = mbuf_next(m);
if (m == NULL)
panic("nfsm trim leading2");
mbuf_setnext(n, NULL);
mbuf_freem(n);
} else if (offs > 0) {
mbuf_setlen(m, mbuf_len(m) - offs);
NFSM_DATAP(m, offs);
} else if (offs < 0)
panic("nfsm trimleading offs");
nd->nd_mrep = m;
nd->nd_md = m;
nd->nd_dpos = NFSMTOD(m, caddr_t);
}
/*
* Trim trailing data off the mbuf list being built.
*/
APPLESTATIC void
newnfs_trimtrailing(nd, mb, bpos)
struct nfsrv_descript *nd;
mbuf_t mb;
caddr_t bpos;
{
if (mbuf_next(mb)) {
mbuf_freem(mbuf_next(mb));
mbuf_setnext(mb, NULL);
}
mbuf_setlen(mb, bpos - NFSMTOD(mb, caddr_t));
nd->nd_mb = mb;
nd->nd_bpos = bpos;
}
/*
* Dissect a file handle on the client.
*/
APPLESTATIC int
nfsm_getfh(struct nfsrv_descript *nd, struct nfsfh **nfhpp)
{
u_int32_t *tl;
struct nfsfh *nfhp;
int error, len;
*nfhpp = NULL;
if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if ((len = fxdr_unsigned(int, *tl)) <= 0 ||
len > NFSX_FHMAX) {
error = EBADRPC;
goto nfsmout;
}
} else
len = NFSX_V2FH;
MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) + len,
M_NFSFH, M_WAITOK);
error = nfsrv_mtostr(nd, nfhp->nfh_fh, len);
if (error) {
FREE((caddr_t)nfhp, M_NFSFH);
goto nfsmout;
}
nfhp->nfh_len = len;
*nfhpp = nfhp;
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Break down the nfsv4 acl.
* If the aclp == NULL or won't fit in an acl, just discard the acl info.
*/
APPLESTATIC int
nfsrv_dissectacl(struct nfsrv_descript *nd, NFSACL_T *aclp, int *aclerrp,
int *aclsizep, __unused NFSPROC_T *p)
{
u_int32_t *tl;
int i, aclsize;
int acecnt, error = 0, aceerr = 0, acesize;
*aclerrp = 0;
if (aclp)
aclp->acl_cnt = 0;
/*
* Parse out the ace entries and expect them to conform to
* what can be supported by R/W/X bits.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
aclsize = NFSX_UNSIGNED;
acecnt = fxdr_unsigned(int, *tl);
if (acecnt > ACL_MAX_ENTRIES)
aceerr = NFSERR_ATTRNOTSUPP;
if (nfsrv_useacl == 0)
aceerr = NFSERR_ATTRNOTSUPP;
for (i = 0; i < acecnt; i++) {
if (aclp && !aceerr)
error = nfsrv_dissectace(nd, &aclp->acl_entry[i],
&aceerr, &acesize, p);
else
error = nfsrv_skipace(nd, &acesize);
if (error)
goto nfsmout;
aclsize += acesize;
}
if (aclp && !aceerr)
aclp->acl_cnt = acecnt;
if (aceerr)
*aclerrp = aceerr;
if (aclsizep)
*aclsizep = aclsize;
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Skip over an NFSv4 ace entry. Just dissect the xdr and discard it.
*/
static int
nfsrv_skipace(struct nfsrv_descript *nd, int *acesizep)
{
u_int32_t *tl;
int error, len = 0;
NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
len = fxdr_unsigned(int, *(tl + 3));
error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
nfsmout:
*acesizep = NFSM_RNDUP(len) + (4 * NFSX_UNSIGNED);
NFSEXITCODE2(error, nd);
return (error);
}
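/*
 * The four words dissected above are assumed to be the fixed portion of an
 * nfsace4 (type, flag, access mask and the length of the "who" string);
 * the opaque "who" data, rounded up to NFSM_RNDUP(len) bytes, is skipped.
 */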
/*
* Get attribute bits from an mbuf list.
* Returns EBADRPC for a parsing error, 0 otherwise.
* If retnotsupp is non-NULL, it is set to NFSERR_ATTRNOTSUPP when a
* bitmap word beyond the supported ones is non-zero.
*/
APPLESTATIC int
nfsrv_getattrbits(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp, int *cntp,
int *retnotsupp)
{
u_int32_t *tl;
int cnt, i, outcnt;
int error = 0;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
cnt = fxdr_unsigned(int, *tl);
if (cnt < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (cnt > NFSATTRBIT_MAXWORDS)
outcnt = NFSATTRBIT_MAXWORDS;
else
outcnt = cnt;
NFSZERO_ATTRBIT(attrbitp);
if (outcnt > 0) {
NFSM_DISSECT(tl, u_int32_t *, outcnt * NFSX_UNSIGNED);
for (i = 0; i < outcnt; i++)
attrbitp->bits[i] = fxdr_unsigned(u_int32_t, *tl++);
}
for (i = 0; i < (cnt - outcnt); i++) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (retnotsupp != NULL && *tl != 0)
*retnotsupp = NFSERR_ATTRNOTSUPP;
}
if (cntp)
*cntp = NFSX_UNSIGNED + (cnt * NFSX_UNSIGNED);
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
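/*
 * Sketch of the wire form consumed above: a word count followed by that
 * many 32 bit bitmap words. Only the first NFSATTRBIT_MAXWORDS words are
 * kept; any extra words are dissected and, if non-zero, reported through
 * *retnotsupp.
 */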
/*
* Get the attributes for V4.
* If the compare flag is true, test for any attribute changes,
* otherwise return the attribute values.
* These attributes cover fields in "struct vattr", "struct statfs",
* "struct nfsfsinfo", the file handle and the lease duration.
* When comparing, *retcmpp is left as 0 if all attributes are the same
* and set to a non-zero error (e.g. NFSERR_NOTSAME) otherwise.
* Returns EBADRPC if it can't be parsed, 0 otherwise.
*/
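/*
 * Sketch of the wire form walked below: an attribute bitmap (dissected by
 * nfsrv_getattrbits()), one word giving the byte length of the attribute
 * data, and then the attribute values packed in bit order.
 */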
APPLESTATIC int
nfsv4_loadattr(struct nfsrv_descript *nd, vnode_t vp,
struct nfsvattr *nap, struct nfsfh **nfhpp, fhandle_t *fhp, int fhsize,
struct nfsv3_pathconf *pc, struct statfs *sbp, struct nfsstatfs *sfp,
struct nfsfsinfo *fsp, NFSACL_T *aclp, int compare, int *retcmpp,
u_int32_t *leasep, u_int32_t *rderrp, NFSPROC_T *p, struct ucred *cred)
{
u_int32_t *tl;
int i = 0, j, k, l = 0, m, bitpos, attrsum = 0;
int error, tfhsize, aceerr, attrsize, cnt, retnotsup;
u_char *cp, *cp2, namestr[NFSV4_SMALLSTR + 1];
nfsattrbit_t attrbits, retattrbits, checkattrbits;
struct nfsfh *tnfhp;
struct nfsreferral *refp;
u_quad_t tquad;
nfsquad_t tnfsquad;
struct timespec temptime;
uid_t uid;
gid_t gid;
u_int32_t freenum = 0, tuint;
u_int64_t uquad = 0, thyp, thyp2;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
#endif
CTASSERT(sizeof(ino_t) == sizeof(uint64_t));
if (compare) {
retnotsup = 0;
error = nfsrv_getattrbits(nd, &attrbits, NULL, &retnotsup);
} else {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
}
if (error)
goto nfsmout;
if (compare) {
*retcmpp = retnotsup;
} else {
/*
* Just set default values to some of the important ones.
*/
if (nap != NULL) {
nap->na_type = VREG;
nap->na_mode = 0;
nap->na_rdev = (NFSDEV_T)0;
nap->na_mtime.tv_sec = 0;
nap->na_mtime.tv_nsec = 0;
nap->na_gen = 0;
nap->na_flags = 0;
nap->na_blocksize = NFS_FABLKSIZE;
}
if (sbp != NULL) {
sbp->f_bsize = NFS_FABLKSIZE;
sbp->f_blocks = 0;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
}
if (fsp != NULL) {
fsp->fs_rtmax = 8192;
fsp->fs_rtpref = 8192;
fsp->fs_maxname = NFS_MAXNAMLEN;
fsp->fs_wtmax = 8192;
fsp->fs_wtpref = 8192;
fsp->fs_wtmult = NFS_FABLKSIZE;
fsp->fs_dtpref = 8192;
fsp->fs_maxfilesize = 0xffffffffffffffffull;
fsp->fs_timedelta.tv_sec = 0;
fsp->fs_timedelta.tv_nsec = 1;
fsp->fs_properties = (NFSV3_FSFLINK | NFSV3_FSFSYMLINK |
NFSV3_FSFHOMOGENEOUS | NFSV3_FSFCANSETTIME);
}
if (pc != NULL) {
pc->pc_linkmax = NFS_LINK_MAX;
pc->pc_namemax = NAME_MAX;
pc->pc_notrunc = 0;
pc->pc_chownrestricted = 0;
pc->pc_caseinsensitive = 0;
pc->pc_casepreserving = 1;
}
if (sfp != NULL) {
sfp->sf_ffiles = UINT64_MAX;
sfp->sf_tfiles = UINT64_MAX;
sfp->sf_afiles = UINT64_MAX;
sfp->sf_fbytes = UINT64_MAX;
sfp->sf_tbytes = UINT64_MAX;
sfp->sf_abytes = UINT64_MAX;
}
}
/*
* Loop around getting the attributes.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(&attrbits, bitpos))
switch (bitpos) {
case NFSATTRBIT_SUPPORTEDATTRS:
retnotsup = 0;
if (compare || nap == NULL)
error = nfsrv_getattrbits(nd, &retattrbits,
&cnt, &retnotsup);
else
error = nfsrv_getattrbits(nd, &nap->na_suppattr,
&cnt, &retnotsup);
if (error)
goto nfsmout;
if (compare && !(*retcmpp)) {
NFSSETSUPP_ATTRBIT(&checkattrbits);
/* Some filesystems do not support NFSv4 ACLs. */
if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0) {
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACL);
NFSCLRBIT_ATTRBIT(&checkattrbits, NFSATTRBIT_ACLSUPPORT);
}
if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits)
|| retnotsup)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += cnt;
break;
case NFSATTRBIT_TYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_type != nfsv34tov_type(*tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_type = nfsv34tov_type(*tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FHEXPIRETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (fxdr_unsigned(int, *tl) !=
NFSV4FHTYPE_PERSISTENT)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHANGE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_filerev != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_filerev = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_size != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_size = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_LINKSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFLINK) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFLINK;
else
fsp->fs_properties &= ~NFSV3_FSFLINK;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_SYMLINKSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFSYMLINK) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFSYMLINK;
else
fsp->fs_properties &= ~NFSV3_FSFSYMLINK;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (*tl != newnfs_false)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FSID:
NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
thyp = fxdr_hyper(tl);
tl += 2;
thyp2 = fxdr_hyper(tl);
if (compare) {
if (*retcmpp == 0) {
if (thyp != (u_int64_t)
vfs_statfs(vnode_mount(vp))->f_fsid.val[0] ||
thyp2 != (u_int64_t)
vfs_statfs(vnode_mount(vp))->f_fsid.val[1])
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_filesid[0] = thyp;
nap->na_filesid[1] = thyp2;
}
attrsum += (4 * NFSX_UNSIGNED);
break;
case NFSATTRBIT_UNIQUEHANDLES:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_LEASETIME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (fxdr_unsigned(int, *tl) != nfsrv_lease &&
!(*retcmpp))
*retcmpp = NFSERR_NOTSAME;
} else if (leasep != NULL) {
*leasep = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_RDATTRERROR:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp))
*retcmpp = NFSERR_INVAL;
} else if (rderrp != NULL) {
*rderrp = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_ACL:
if (compare) {
if (!(*retcmpp)) {
if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) {
NFSACL_T *naclp;
naclp = acl_alloc(M_WAITOK);
error = nfsrv_dissectacl(nd, naclp, &aceerr,
&cnt, p);
if (error) {
acl_free(naclp);
goto nfsmout;
}
if (aceerr || aclp == NULL ||
nfsrv_compareacl(aclp, naclp))
*retcmpp = NFSERR_NOTSAME;
acl_free(naclp);
} else {
error = nfsrv_dissectacl(nd, NULL, &aceerr,
&cnt, p);
*retcmpp = NFSERR_ATTRNOTSUPP;
}
}
} else {
if (vp != NULL && aclp != NULL)
error = nfsrv_dissectacl(nd, aclp, &aceerr,
&cnt, p);
else
error = nfsrv_dissectacl(nd, NULL, &aceerr,
&cnt, p);
if (error)
goto nfsmout;
}
attrsum += cnt;
break;
case NFSATTRBIT_ACLSUPPORT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp)) {
if (nfsrv_useacl && nfs_supportsnfsv4acls(vp)) {
if (fxdr_unsigned(u_int32_t, *tl) !=
NFSV4ACE_SUPTYPES)
*retcmpp = NFSERR_NOTSAME;
} else {
*retcmpp = NFSERR_ATTRNOTSUPP;
}
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CANSETTIME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties & NFSV3_FSFCANSETTIME) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFCANSETTIME;
else
fsp->fs_properties &= ~NFSV3_FSFCANSETTIME;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEINSENSITIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_false)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_caseinsensitive =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEPRESERVING:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_casepreserving =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHOWNRESTRICTED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_chownrestricted =
fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FILEHANDLE:
error = nfsm_getfh(nd, &tnfhp);
if (error)
goto nfsmout;
tfhsize = tnfhp->nfh_len;
if (compare) {
if (!(*retcmpp) &&
!NFSRV_CMPFH(tnfhp->nfh_fh, tfhsize,
fhp, fhsize))
*retcmpp = NFSERR_NOTSAME;
FREE((caddr_t)tnfhp, M_NFSFH);
} else if (nfhpp != NULL) {
*nfhpp = tnfhp;
} else {
FREE((caddr_t)tnfhp, M_NFSFH);
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(tfhsize));
break;
case NFSATTRBIT_FILEID:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_fileid != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL)
nap->na_fileid = thyp;
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESAVAIL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_afiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_afiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESFREE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_ffiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_ffiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESTOTAL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_tfiles != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_tfiles = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_FSLOCATIONS:
error = nfsrv_getrefstr(nd, &cp, &cp2, &l, &m);
if (error)
goto nfsmout;
attrsum += l;
if (compare && !(*retcmpp)) {
refp = nfsv4root_getreferral(vp, NULL, 0);
if (refp != NULL) {
if (cp == NULL || cp2 == NULL ||
strcmp(cp, "/") ||
strcmp(cp2, refp->nfr_srvlist))
*retcmpp = NFSERR_NOTSAME;
} else if (m == 0) {
*retcmpp = NFSERR_NOTSAME;
}
}
if (cp != NULL)
free(cp, M_NFSSTRING);
if (cp2 != NULL)
free(cp2, M_NFSSTRING);
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HOMOGENEOUS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_properties &
NFSV3_FSFHOMOGENEOUS) {
if (*tl == newnfs_false)
*retcmpp = NFSERR_NOTSAME;
} else {
if (*tl == newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
}
} else if (fsp != NULL) {
if (*tl == newnfs_true)
fsp->fs_properties |= NFSV3_FSFHOMOGENEOUS;
else
fsp->fs_properties &= ~NFSV3_FSFHOMOGENEOUS;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXFILESIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
tnfsquad.qval = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
tquad = NFSRV_MAXFILESIZE;
if (tquad != tnfsquad.qval)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_maxfilesize = tnfsquad.qval;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXLINK:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fxdr_unsigned(int, *tl) != NFS_LINK_MAX)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXNAME:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_maxname !=
fxdr_unsigned(u_int32_t, *tl))
*retcmpp = NFSERR_NOTSAME;
}
} else {
tuint = fxdr_unsigned(u_int32_t, *tl);
/*
* Some Linux NFSv4 servers report this
* as 0 or 4 billion, so I'll set it to
* NFS_MAXNAMLEN. If a server actually creates
* a name longer than NFS_MAXNAMLEN, it will
* get an error back.
*/
if (tuint == 0 || tuint > NFS_MAXNAMLEN)
tuint = NFS_MAXNAMLEN;
if (fsp != NULL)
fsp->fs_maxname = tuint;
if (pc != NULL)
pc->pc_namemax = tuint;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXREAD:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_rtmax != fxdr_unsigned(u_int32_t,
*(tl + 1)) || *tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_rtmax = fxdr_unsigned(u_int32_t, *++tl);
fsp->fs_rtpref = fsp->fs_rtmax;
fsp->fs_dtpref = fsp->fs_rtpref;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXWRITE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp)) {
if (fsp->fs_wtmax != fxdr_unsigned(u_int32_t,
*(tl + 1)) || *tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else if (fsp != NULL) {
fsp->fs_wtmax = fxdr_unsigned(int, *++tl);
fsp->fs_wtpref = fsp->fs_wtmax;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_mode != nfstov_mode(*tl))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_mode = nfstov_mode(*tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NOTRUNC:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare) {
if (!(*retcmpp)) {
if (*tl != newnfs_true)
*retcmpp = NFSERR_NOTSAME;
}
} else if (pc != NULL) {
pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl);
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NUMLINKS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
tuint = fxdr_unsigned(u_int32_t, *tl);
if (compare) {
if (!(*retcmpp)) {
if ((u_int32_t)nap->na_nlink != tuint)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_nlink = tuint;
}
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (compare) {
if (!(*retcmpp)) {
if (nfsv4_strtouid(nd, cp, j, &uid, p) ||
nap->na_uid != uid)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
if (nfsv4_strtouid(nd, cp, j, &uid, p))
nap->na_uid = nfsrv_defaultuid;
else
nap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (compare) {
if (!(*retcmpp)) {
if (nfsv4_strtogid(nd, cp, j, &gid, p) ||
nap->na_gid != gid)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
if (nfsv4_strtogid(nd, cp, j, &gid, p))
nap->na_gid = nfsrv_defaultgid;
else
nap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_QUOTAHARD:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = sbp->f_bfree;
else
freenum = sbp->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bhardlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTASOFT:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = sbp->f_bfree;
else
freenum = sbp->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bsoftlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTAUSED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (sbp != NULL) {
freenum = 0;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(vnode_mount(vp),QCMD(Q_GETQUOTA,
USRQUOTA), cred->cr_uid, (caddr_t)&dqb))
freenum = dqb.dqb_curblocks;
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, sbp->f_bsize);
}
if (compare && !(*retcmpp)) {
if (uquad != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_RAWDEV:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4SPECDATA);
j = fxdr_unsigned(int, *tl++);
k = fxdr_unsigned(int, *tl);
if (compare) {
if (!(*retcmpp)) {
if (nap->na_rdev != NFSMAKEDEV(j, k))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_rdev = NFSMAKEDEV(j, k);
}
attrsum += NFSX_V4SPECDATA;
break;
case NFSATTRBIT_SPACEAVAIL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_abytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_abytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEFREE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_fbytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_fbytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACETOTAL:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (compare) {
if (!(*retcmpp) &&
sfp->sf_tbytes != fxdr_hyper(tl))
*retcmpp = NFSERR_NOTSAME;
} else if (sfp != NULL) {
sfp->sf_tbytes = fxdr_hyper(tl);
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEUSED:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if ((u_int64_t)nap->na_bytes != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_bytes = thyp;
}
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESS:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_atime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_atime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
i = fxdr_unsigned(int, *tl);
if (i == NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
attrsum += NFSX_V4TIME;
}
if (compare && !(*retcmpp))
*retcmpp = NFSERR_INVAL;
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEDELTA:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (fsp != NULL) {
if (compare) {
if (!(*retcmpp)) {
if ((u_int32_t)fsp->fs_timedelta.tv_sec !=
fxdr_unsigned(u_int32_t, *(tl + 1)) ||
(u_int32_t)fsp->fs_timedelta.tv_nsec !=
(fxdr_unsigned(u_int32_t, *(tl + 2)) %
1000000000) ||
*tl != 0)
*retcmpp = NFSERR_NOTSAME;
}
} else {
fxdr_nfsv4time(tl, &fsp->fs_timedelta);
}
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMETADATA:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_ctime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_ctime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFY:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &temptime);
if (compare) {
if (!(*retcmpp)) {
if (!NFS_CMPTIME(temptime, nap->na_mtime))
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL) {
nap->na_mtime = temptime;
}
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
i = fxdr_unsigned(int, *tl);
if (i == NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
attrsum += NFSX_V4TIME;
}
if (compare && !(*retcmpp))
*retcmpp = NFSERR_INVAL;
break;
case NFSATTRBIT_MOUNTEDONFILEID:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
thyp = fxdr_hyper(tl);
if (compare) {
if (!(*retcmpp)) {
if (!vp || !nfsrv_atroot(vp, &thyp2))
thyp2 = nap->na_fileid;
if (thyp2 != thyp)
*retcmpp = NFSERR_NOTSAME;
}
} else if (nap != NULL)
nap->na_mntonfileno = thyp;
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_SUPPATTREXCLCREAT:
retnotsup = 0;
error = nfsrv_getattrbits(nd, &retattrbits,
&cnt, &retnotsup);
if (error)
goto nfsmout;
if (compare && !(*retcmpp)) {
NFSSETSUPP_ATTRBIT(&checkattrbits);
NFSCLRNOTSETABLE_ATTRBIT(&checkattrbits);
NFSCLRBIT_ATTRBIT(&checkattrbits,
NFSATTRBIT_TIMEACCESSSET);
if (!NFSEQUAL_ATTRBIT(&retattrbits, &checkattrbits)
|| retnotsup)
*retcmpp = NFSERR_NOTSAME;
}
attrsum += cnt;
break;
default:
printf("EEK! nfsv4_loadattr unknown attr=%d\n",
bitpos);
if (compare && !(*retcmpp))
*retcmpp = NFSERR_ATTRNOTSUPP;
/*
* and get out of the loop, since we can't parse
* the unknown attribute data.
*/
bitpos = NFSATTRBIT_MAX;
break;
}
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Implement sleep locks for newnfs. The nfslock_usecnt allows for a
* shared lock and the NFSXXX_LOCK flag permits an exclusive lock.
* The first argument is a pointer to an nfsv4lock structure.
* The second argument is 1 iff a blocking lock is wanted.
* If this argument is 0, the call waits until no thread either wants or
* holds an exclusive lock.
* It returns 1 if the lock was acquired, 0 otherwise.
* If several processes call this function concurrently wanting the exclusive
* lock, one will get the lock and the rest will return without getting the
* lock. (If the caller must have the lock, it simply calls this function in a
* loop until the function returns 1 to indicate the lock was acquired.)
* Any usecnt must be decremented by calling nfsv4_relref() before
* calling nfsv4_lock(). It was done this way, so nfsv4_lock() could
* be called in a loop.
* If isleptp is not NULL, it is set to indicate whether the call slept.
* If mp is not NULL, a forced dismount is checked for.
*/
APPLESTATIC int
nfsv4_lock(struct nfsv4lock *lp, int iwantlock, int *isleptp,
void *mutex, struct mount *mp)
{
if (isleptp)
*isleptp = 0;
/*
* If a lock is wanted, loop around until the lock is acquired by
* someone and then released. If I want the lock, try to acquire it.
* For a lock to be issued, no lock must be in force and the usecnt
* must be zero.
*/
if (iwantlock) {
if (!(lp->nfslock_lock & NFSV4LOCK_LOCK) &&
lp->nfslock_usecnt == 0) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
lp->nfslock_lock |= NFSV4LOCK_LOCK;
return (1);
}
lp->nfslock_lock |= NFSV4LOCK_LOCKWANTED;
}
while (lp->nfslock_lock & (NFSV4LOCK_LOCK | NFSV4LOCK_LOCKWANTED)) {
if (mp != NULL && NFSCL_FORCEDISM(mp)) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
return (0);
}
lp->nfslock_lock |= NFSV4LOCK_WANTED;
if (isleptp)
*isleptp = 1;
(void) nfsmsleep(&lp->nfslock_lock, mutex,
PZERO - 1, "nfsv4lck", NULL);
if (iwantlock && !(lp->nfslock_lock & NFSV4LOCK_LOCK) &&
lp->nfslock_usecnt == 0) {
lp->nfslock_lock &= ~NFSV4LOCK_LOCKWANTED;
lp->nfslock_lock |= NFSV4LOCK_LOCK;
return (1);
}
}
return (0);
}
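/*
 * A minimal usage sketch for the exclusive lock, following the rules in the
 * comment above (drop any reference with nfsv4_relref() first, then retry
 * until the function returns 1). The names lck, lck_mutex and igotlock are
 * placeholders, not taken from this file:
 *
 *	nfsv4_relref(&lck);
 *	do {
 *		igotlock = nfsv4_lock(&lck, 1, NULL, &lck_mutex, NULL);
 *	} while (!igotlock);
 */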
/*
* Release the lock acquired by nfsv4_lock().
* The second argument is set to 1 to indicate the nfslock_usecnt should be
* incremented, as well.
*/
APPLESTATIC void
nfsv4_unlock(struct nfsv4lock *lp, int incref)
{
lp->nfslock_lock &= ~NFSV4LOCK_LOCK;
if (incref)
lp->nfslock_usecnt++;
nfsv4_wanted(lp);
}
/*
* Release a reference cnt.
*/
APPLESTATIC void
nfsv4_relref(struct nfsv4lock *lp)
{
if (lp->nfslock_usecnt <= 0)
panic("nfsv4root ref cnt");
lp->nfslock_usecnt--;
if (lp->nfslock_usecnt == 0)
nfsv4_wanted(lp);
}
/*
* Get a reference cnt.
* This function will wait for any exclusive lock to be released, but will
* not wait for threads that want the exclusive lock. If priority needs
* to be given to threads that need the exclusive lock, a call to nfsv4_lock()
* with the 2nd argument == 0 should be done before calling nfsv4_getref().
* If the mp argument is not NULL, check for NFSCL_FORCEDISM() being set and
* return without getting a refcnt for that case.
*/
APPLESTATIC void
nfsv4_getref(struct nfsv4lock *lp, int *isleptp, void *mutex,
struct mount *mp)
{
if (isleptp)
*isleptp = 0;
/*
* Wait for any exclusive lock that is held to be released.
*/
while (lp->nfslock_lock & NFSV4LOCK_LOCK) {
if (mp != NULL && NFSCL_FORCEDISM(mp))
return;
lp->nfslock_lock |= NFSV4LOCK_WANTED;
if (isleptp)
*isleptp = 1;
(void) nfsmsleep(&lp->nfslock_lock, mutex,
PZERO - 1, "nfsv4gr", NULL);
}
if (mp != NULL && NFSCL_FORCEDISM(mp))
return;
lp->nfslock_usecnt++;
}
/*
* Get a reference as above, but return failure instead of sleeping if
* an exclusive lock is held.
*/
APPLESTATIC int
nfsv4_getref_nonblock(struct nfsv4lock *lp)
{
if ((lp->nfslock_lock & NFSV4LOCK_LOCK) != 0)
return (0);
lp->nfslock_usecnt++;
return (1);
}
/*
* Test for a lock. Return 1 if locked, 0 otherwise.
*/
APPLESTATIC int
nfsv4_testlock(struct nfsv4lock *lp)
{
if ((lp->nfslock_lock & NFSV4LOCK_LOCK) == 0 &&
lp->nfslock_usecnt == 0)
return (0);
return (1);
}
/*
* Wake up anyone sleeping, waiting for this lock.
*/
static void
nfsv4_wanted(struct nfsv4lock *lp)
{
if (lp->nfslock_lock & NFSV4LOCK_WANTED) {
lp->nfslock_lock &= ~NFSV4LOCK_WANTED;
wakeup((caddr_t)&lp->nfslock_lock);
}
}
/*
* Copy a string from an mbuf list into a character array.
* Return EBADRPC if there is an mbuf error,
* 0 otherwise.
*/
APPLESTATIC int
nfsrv_mtostr(struct nfsrv_descript *nd, char *str, int siz)
{
char *cp;
int xfer, len;
mbuf_t mp;
int rem, error = 0;
mp = nd->nd_md;
cp = nd->nd_dpos;
len = NFSMTOD(mp, caddr_t) + mbuf_len(mp) - cp;
rem = NFSM_RNDUP(siz) - siz;
while (siz > 0) {
if (len > siz)
xfer = siz;
else
xfer = len;
NFSBCOPY(cp, str, xfer);
str += xfer;
siz -= xfer;
if (siz > 0) {
mp = mbuf_next(mp);
if (mp == NULL) {
error = EBADRPC;
goto out;
}
cp = NFSMTOD(mp, caddr_t);
len = mbuf_len(mp);
} else {
cp += xfer;
len -= xfer;
}
}
*str = '\0';
nd->nd_dpos = cp;
nd->nd_md = mp;
if (rem > 0) {
if (len < rem)
error = nfsm_advance(nd, rem, len);
else
nd->nd_dpos += rem;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Fill in the attributes as marked by the bitmap (V4).
*/
APPLESTATIC int
nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp,
NFSACL_T *saclp, struct vattr *vap, fhandle_t *fhp, int rderror,
nfsattrbit_t *attrbitp, struct ucred *cred, NFSPROC_T *p, int isdgram,
int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int bitpos, retnum = 0;
u_int32_t *tl;
int siz, prefixnum, error;
u_char *cp, namestr[NFSV4_SMALLSTR];
nfsattrbit_t attrbits, retbits;
nfsattrbit_t *retbitp = &retbits;
u_int32_t freenum, *retnump;
u_int64_t uquad;
struct statfs *fs;
struct nfsfsinfo fsinf;
struct timespec temptime;
NFSACL_T *aclp, *naclp = NULL;
#ifdef QUOTA
struct dqblk dqb;
uid_t savuid;
#endif
/*
* First, set the bits that can be filled and get fsinfo.
*/
NFSSET_ATTRBIT(retbitp, attrbitp);
/*
* If both p and cred are NULL, it is a client side setattr call.
* If both p and cred are not NULL, it is a server side reply call.
* If p is not NULL and cred is NULL, it is a client side callback
* reply call.
*/
if (p == NULL && cred == NULL) {
NFSCLRNOTSETABLE_ATTRBIT(retbitp);
aclp = saclp;
} else {
NFSCLRNOTFILLABLE_ATTRBIT(retbitp);
naclp = acl_alloc(M_WAITOK);
aclp = naclp;
}
nfsvno_getfs(&fsinf, isdgram);
#ifndef APPLE
/*
* Get the VFS_STATFS(), since some attributes need them.
*/
fs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
if (NFSISSETSTATFS_ATTRBIT(retbitp)) {
error = VFS_STATFS(mp, fs);
if (error != 0) {
if (reterr) {
nd->nd_repstat = NFSERR_ACCES;
free(fs, M_STATFS);
return (0);
}
NFSCLRSTATFS_ATTRBIT(retbitp);
}
}
#endif
/*
* And the NFSv4 ACL...
*/
if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT) &&
(nfsrv_useacl == 0 || ((cred != NULL || p != NULL) &&
supports_nfsv4acls == 0))) {
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACLSUPPORT);
}
if (NFSISSET_ATTRBIT(retbitp, NFSATTRBIT_ACL)) {
if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL) &&
supports_nfsv4acls == 0)) {
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL);
} else if (naclp != NULL) {
if (NFSVOPLOCK(vp, LK_SHARED) == 0) {
error = VOP_ACCESSX(vp, VREAD_ACL, cred, p);
if (error == 0)
error = VOP_GETACL(vp, ACL_TYPE_NFS4,
naclp, cred, p);
NFSVOPUNLOCK(vp, 0);
} else
error = NFSERR_PERM;
if (error != 0) {
if (reterr) {
nd->nd_repstat = NFSERR_ACCES;
free(fs, M_STATFS);
return (0);
}
NFSCLRBIT_ATTRBIT(retbitp, NFSATTRBIT_ACL);
}
}
}
/*
* Put out the attribute bitmap for the ones being filled in
* and get the field for the number of attributes returned.
*/
prefixnum = nfsrv_putattrbit(nd, retbitp);
NFSM_BUILD(retnump, u_int32_t *, NFSX_UNSIGNED);
prefixnum += NFSX_UNSIGNED;
/*
* Now, loop around filling in the attributes for each bit set.
*/
for (bitpos = 0; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (NFSISSET_ATTRBIT(retbitp, bitpos)) {
switch (bitpos) {
case NFSATTRBIT_SUPPORTEDATTRS:
NFSSETSUPP_ATTRBIT(&attrbits);
if (nfsrv_useacl == 0 || ((cred != NULL || p != NULL)
&& supports_nfsv4acls == 0)) {
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACLSUPPORT);
NFSCLRBIT_ATTRBIT(&attrbits,NFSATTRBIT_ACL);
}
retnum += nfsrv_putattrbit(nd, &attrbits);
break;
case NFSATTRBIT_TYPE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = vtonfsv34_type(vap->va_type);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FHEXPIRETYPE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4FHTYPE_PERSISTENT);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHANGE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_filerev, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SIZE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_size, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_LINKSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_LINK)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_SYMLINKSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_SYMLINK)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NAMEDATTR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FSID:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4FSID);
*tl++ = 0;
*tl++ = txdr_unsigned(mp->mnt_stat.f_fsid.val[0]);
*tl++ = 0;
*tl = txdr_unsigned(mp->mnt_stat.f_fsid.val[1]);
retnum += NFSX_V4FSID;
break;
case NFSATTRBIT_UNIQUEHANDLES:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_LEASETIME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(nfsrv_lease);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_RDATTRERROR:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(rderror);
retnum += NFSX_UNSIGNED;
break;
/*
* Recommended Attributes. (Only the supported ones.)
*/
case NFSATTRBIT_ACL:
retnum += nfsrv_buildacl(nd, aclp, vnode_vtype(vp), p);
break;
case NFSATTRBIT_ACLSUPPORT:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4ACE_SUPTYPES);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CANSETTIME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_CANSETTIME)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEINSENSITIVE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CASEPRESERVING:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_CHOWNRESTRICTED:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_FILEHANDLE:
retnum += nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0);
break;
case NFSATTRBIT_FILEID:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = vap->va_fileid;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESAVAIL:
/*
* Check quota and use min(quota, f_ffree).
*/
freenum = fs->f_ffree;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_isoftlimit-dqb.dqb_curinodes,
freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(freenum);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESFREE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fs->f_ffree);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FILESTOTAL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fs->f_files);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_FSLOCATIONS:
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = 0;
retnum += 2 * NFSX_UNSIGNED;
break;
case NFSATTRBIT_HOMOGENEOUS:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (fsinf.fs_properties & NFSV3FSINFO_HOMOGENEOUS)
*tl = newnfs_true;
else
*tl = newnfs_false;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXFILESIZE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = NFSRV_MAXFILESIZE;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXLINK:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFS_LINK_MAX);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXNAME:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFS_MAXNAMLEN);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MAXREAD:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fsinf.fs_rtmax);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MAXWRITE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
*tl++ = 0;
*tl = txdr_unsigned(fsinf.fs_wtmax);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_MODE:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = vtonfsv34_mode(vap->va_mode);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NOTRUNC:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = newnfs_true;
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_NUMLINKS:
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(vap->va_nlink);
retnum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
cp = namestr;
nfsv4_uidtostr(vap->va_uid, &cp, &siz, p);
retnum += nfsm_strtom(nd, cp, siz);
if (cp != namestr)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_OWNERGROUP:
cp = namestr;
nfsv4_gidtostr(vap->va_gid, &cp, &siz, p);
retnum += nfsm_strtom(nd, cp, siz);
if (cp != namestr)
free(cp, M_NFSSTRING);
break;
case NFSATTRBIT_QUOTAHARD:
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = fs->f_bfree;
else
freenum = fs->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bhardlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTASOFT:
if (priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
freenum = fs->f_bfree;
else
freenum = fs->f_bavail;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = min(dqb.dqb_bsoftlimit, freenum);
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_QUOTAUSED:
freenum = 0;
#ifdef QUOTA
/*
* ufs_quotactl() insists that the uid argument
* equal p_ruid for non-root quota access, so
* we'll just make sure that's the case.
*/
savuid = p->p_cred->p_ruid;
p->p_cred->p_ruid = cred->cr_uid;
if (!VFS_QUOTACTL(mp, QCMD(Q_GETQUOTA,USRQUOTA),
cred->cr_uid, (caddr_t)&dqb))
freenum = dqb.dqb_curblocks;
p->p_cred->p_ruid = savuid;
#endif /* QUOTA */
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)freenum;
NFSQUOTABLKTOBYTE(uquad, fs->f_bsize);
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_RAWDEV:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SPECDATA);
*tl++ = txdr_unsigned(NFSMAJOR(vap->va_rdev));
*tl = txdr_unsigned(NFSMINOR(vap->va_rdev));
retnum += NFSX_V4SPECDATA;
break;
case NFSATTRBIT_SPACEAVAIL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
uquad = (u_int64_t)fs->f_bfree;
else
uquad = (u_int64_t)fs->f_bavail;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEFREE:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)fs->f_bfree;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACETOTAL:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
uquad = (u_int64_t)fs->f_blocks;
uquad *= fs->f_bsize;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SPACEUSED:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
txdr_hyper(vap->va_bytes, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_TIMEACCESS:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_atime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEACCESSSET:
if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME);
*tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT);
txdr_nfsv4time(&vap->va_atime, tl);
retnum += NFSX_V4SETTIME;
} else {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER);
retnum += NFSX_UNSIGNED;
}
break;
case NFSATTRBIT_TIMEDELTA:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
temptime.tv_sec = 0;
temptime.tv_nsec = 1000000000 / hz;
txdr_nfsv4time(&temptime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMETADATA:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_ctime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFY:
NFSM_BUILD(tl, u_int32_t *, NFSX_V4TIME);
txdr_nfsv4time(&vap->va_mtime, tl);
retnum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
if ((vap->va_vaflags & VA_UTIMES_NULL) == 0) {
NFSM_BUILD(tl, u_int32_t *, NFSX_V4SETTIME);
*tl++ = txdr_unsigned(NFSV4SATTRTIME_TOCLIENT);
txdr_nfsv4time(&vap->va_mtime, tl);
retnum += NFSX_V4SETTIME;
} else {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(NFSV4SATTRTIME_TOSERVER);
retnum += NFSX_UNSIGNED;
}
break;
case NFSATTRBIT_MOUNTEDONFILEID:
NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER);
if (at_root != 0)
uquad = mounted_on_fileno;
else
uquad = vap->va_fileid;
txdr_hyper(uquad, tl);
retnum += NFSX_HYPER;
break;
case NFSATTRBIT_SUPPATTREXCLCREAT:
NFSSETSUPP_ATTRBIT(&attrbits);
NFSCLRNOTSETABLE_ATTRBIT(&attrbits);
NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET);
retnum += nfsrv_putattrbit(nd, &attrbits);
break;
default:
printf("EEK! Bad V4 attribute bitpos=%d\n", bitpos);
}
}
}
if (naclp != NULL)
acl_free(naclp);
free(fs, M_STATFS);
*retnump = txdr_unsigned(retnum);
return (retnum + prefixnum);
}
/*
* Put the attribute bits onto an mbuf list.
* Return the number of bytes of output generated.
*/
APPLESTATIC int
nfsrv_putattrbit(struct nfsrv_descript *nd, nfsattrbit_t *attrbitp)
{
u_int32_t *tl;
int cnt, i, bytesize;
for (cnt = NFSATTRBIT_MAXWORDS; cnt > 0; cnt--)
if (attrbitp->bits[cnt - 1])
break;
bytesize = (cnt + 1) * NFSX_UNSIGNED;
NFSM_BUILD(tl, u_int32_t *, bytesize);
*tl++ = txdr_unsigned(cnt);
for (i = 0; i < cnt; i++)
*tl++ = txdr_unsigned(attrbitp->bits[i]);
return (bytesize);
}
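/*
 * Example of the encoding above: if only bits[1] is non-zero, cnt ends up
 * as 2 and three words go out, the count (2), bits[0] and bits[1], so
 * bytesize is 3 * NFSX_UNSIGNED.
 */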
/*
* Convert a uid to a string.
* If the lookup fails, just output the digits.
* uid - the user id
* cpp - points to a buffer of size NFSV4_SMALLSTR
* (malloc a larger one, as required)
* retlenp - pointer to length to be returned
*/
APPLESTATIC void
nfsv4_uidtostr(uid_t uid, u_char **cpp, int *retlenp, NFSPROC_T *p)
{
int i;
struct nfsusrgrp *usrp;
u_char *cp = *cpp;
uid_t tmp;
int cnt, hasampersand, len = NFSV4_SMALLSTR, ret;
struct nfsrv_lughash *hp;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) {
/*
* Always map nfsrv_defaultuid to "nobody".
*/
if (uid == nfsrv_defaultuid) {
i = nfsrv_dnsnamelen + 7;
if (i > len) {
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY("nobody@", cp, 7);
cp += 7;
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
return;
}
hasampersand = 0;
hp = NFSUSERHASH(uid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_uid == uid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
/*
* If the name doesn't already have an '@'
* in it, append @domainname to it.
*/
for (i = 0; i < usrp->lug_namelen; i++) {
if (usrp->lug_name[i] == '@') {
hasampersand = 1;
break;
}
}
if (hasampersand)
i = usrp->lug_namelen;
else
i = usrp->lug_namelen +
nfsrv_dnsnamelen + 1;
if (i > len) {
mtx_unlock(&hp->mtx);
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen);
if (!hasampersand) {
cp += usrp->lug_namelen;
*cp++ = '@';
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
}
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0,
NULL, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
/*
* No match, just return a string of digits.
*/
tmp = uid;
i = 0;
while (tmp || i == 0) {
tmp /= 10;
i++;
}
len = (i > len) ? len : i;
*retlenp = len;
cp += (len - 1);
tmp = uid;
for (i = 0; i < len; i++) {
*cp-- = '0' + (tmp % 10);
tmp /= 10;
}
return;
}
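/*
 * Note on the result above: the buffer at *cpp receives the user name
 * (with "@domain" appended when the cached name has no '@' of its own),
 * "nobody@domain" or a plain string of decimal digits; the length comes
 * back through *retlenp and no NUL terminator is appended.
 */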
/*
* Get a credential for the uid with the server's group list.
* If none is found, just return the credential passed in.
*/
struct ucred *
nfsrv_getgrpscred(struct ucred *oldcred)
{
struct nfsusrgrp *usrp;
struct ucred *newcred;
int cnt, ret;
uid_t uid;
struct nfsrv_lughash *hp;
cnt = 0;
uid = oldcred->cr_uid;
tryagain:
if (nfsrv_dnsnamelen > 0) {
hp = NFSUSERHASH(uid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_uid == uid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
if (usrp->lug_cred != NULL) {
newcred = crhold(usrp->lug_cred);
crfree(oldcred);
} else
newcred = oldcred;
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return (newcred);
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUID, uid, (gid_t)0,
NULL, curthread);
if (ret == 0 && cnt < 2)
goto tryagain;
}
return (oldcred);
}
/*
* Convert a string to a uid.
* If no conversion is possible return NFSERR_BADOWNER, otherwise
* return 0.
* If this is called from a client side mount using AUTH_SYS and the
* string is made up entirely of digits, just convert the string to
* a number.
*/
APPLESTATIC int
nfsv4_strtouid(struct nfsrv_descript *nd, u_char *str, int len, uid_t *uidp,
NFSPROC_T *p)
{
int i;
char *cp, *endstr, *str0;
struct nfsusrgrp *usrp;
int cnt, ret;
int error = 0;
uid_t tuid;
struct nfsrv_lughash *hp, *hp2;
if (len == 0) {
error = NFSERR_BADOWNER;
goto out;
}
/* If a string of digits and an AUTH_SYS mount, just convert it. */
str0 = str;
tuid = (uid_t)strtoul(str0, &endstr, 10);
if ((endstr - str0) == len) {
/* A numeric string. */
if ((nd->nd_flag & ND_KERBV) == 0 &&
((nd->nd_flag & ND_NFSCL) != 0 ||
nfsd_enable_stringtouid != 0))
*uidp = tuid;
else
error = NFSERR_BADOWNER;
goto out;
}
/*
* Look for an '@'.
*/
cp = strchr(str0, '@');
if (cp != NULL)
i = (int)(cp++ - str0);
else
i = len;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0) {
/*
* If an '@' is found and the domain name matches, search for
* the name with dns stripped off.
* Mixed case alphabetics will match for the domain name, but
* all upper case will not.
*/
if (cnt == 0 && i < len && i > 0 &&
(len - 1 - i) == nfsrv_dnsnamelen &&
!nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) {
len -= (nfsrv_dnsnamelen + 1);
*(cp - 1) = '\0';
}
/*
* Check for the special case of "nobody".
*/
if (len == 6 && !NFSBCMP(str, "nobody", 6)) {
*uidp = nfsrv_defaultuid;
error = 0;
goto out;
}
hp = NFSUSERNAMEHASH(str, len);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) {
if (usrp->lug_namelen == len &&
!NFSBCMP(usrp->lug_name, str, len)) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
hp2 = NFSUSERHASH(usrp->lug_uid);
mtx_lock(&hp2->mtx);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp2->lughead, usrp,
lug_numhash);
*uidp = usrp->lug_uid;
mtx_unlock(&hp2->mtx);
mtx_unlock(&hp->mtx);
error = 0;
goto out;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETUSER, (uid_t)0, (gid_t)0,
str, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
error = NFSERR_BADOWNER;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Convert a gid to a string.
* gid - the group id
* cpp - points to a buffer of size NFSV4_SMALLSTR
* (malloc a larger one, as required)
* retlenp - pointer to length to be returned
*/
APPLESTATIC void
nfsv4_gidtostr(gid_t gid, u_char **cpp, int *retlenp, NFSPROC_T *p)
{
int i;
struct nfsusrgrp *usrp;
u_char *cp = *cpp;
gid_t tmp;
int cnt, hasampersand, len = NFSV4_SMALLSTR, ret;
struct nfsrv_lughash *hp;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0 && !nfs_enable_uidtostring) {
/*
* Always map nfsrv_defaultgid to "nogroup".
*/
if (gid == nfsrv_defaultgid) {
i = nfsrv_dnsnamelen + 8;
if (i > len) {
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY("nogroup@", cp, 8);
cp += 8;
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
return;
}
hasampersand = 0;
hp = NFSGROUPHASH(gid);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_numhash) {
if (usrp->lug_gid == gid) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
/*
* If the name doesn't already have an '@'
* in it, append @domainname to it.
*/
for (i = 0; i < usrp->lug_namelen; i++) {
if (usrp->lug_name[i] == '@') {
hasampersand = 1;
break;
}
}
if (hasampersand)
i = usrp->lug_namelen;
else
i = usrp->lug_namelen +
nfsrv_dnsnamelen + 1;
if (i > len) {
mtx_unlock(&hp->mtx);
if (len > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
cp = malloc(i, M_NFSSTRING, M_WAITOK);
*cpp = cp;
len = i;
goto tryagain;
}
*retlenp = i;
NFSBCOPY(usrp->lug_name, cp, usrp->lug_namelen);
if (!hasampersand) {
cp += usrp->lug_namelen;
*cp++ = '@';
NFSBCOPY(nfsrv_dnsname, cp, nfsrv_dnsnamelen);
}
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp->lughead, usrp,
lug_numhash);
mtx_unlock(&hp->mtx);
return;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETGID, (uid_t)0, gid,
NULL, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
/*
* No match, just return a string of digits.
*/
tmp = gid;
i = 0;
while (tmp || i == 0) {
tmp /= 10;
i++;
}
len = (i > len) ? len : i;
*retlenp = len;
cp += (len - 1);
tmp = gid;
for (i = 0; i < len; i++) {
*cp-- = '0' + (tmp % 10);
tmp /= 10;
}
return;
}
/*
* Convert a string to a gid.
* If no conversion is possible return NFSERR_BADOWNER, otherwise
* return 0.
* If this is called from a client side mount using AUTH_SYS and the
* string is made up entirely of digits, just convert the string to
* a number.
*/
APPLESTATIC int
nfsv4_strtogid(struct nfsrv_descript *nd, u_char *str, int len, gid_t *gidp,
NFSPROC_T *p)
{
int i;
char *cp, *endstr, *str0;
struct nfsusrgrp *usrp;
int cnt, ret;
int error = 0;
gid_t tgid;
struct nfsrv_lughash *hp, *hp2;
if (len == 0) {
error = NFSERR_BADOWNER;
goto out;
}
/* If a string of digits and an AUTH_SYS mount, just convert it. */
str0 = str;
tgid = (gid_t)strtoul(str0, &endstr, 10);
if ((endstr - str0) == len) {
/* A numeric string. */
if ((nd->nd_flag & ND_KERBV) == 0 &&
((nd->nd_flag & ND_NFSCL) != 0 ||
nfsd_enable_stringtouid != 0))
*gidp = tgid;
else
error = NFSERR_BADOWNER;
goto out;
}
/*
* Look for an '@'.
*/
cp = strchr(str0, '@');
if (cp != NULL)
i = (int)(cp++ - str0);
else
i = len;
cnt = 0;
tryagain:
if (nfsrv_dnsnamelen > 0) {
/*
* If an '@' is found and the dns name matches, search for the
* name with the dns stripped off.
*/
if (cnt == 0 && i < len && i > 0 &&
(len - 1 - i) == nfsrv_dnsnamelen &&
!nfsrv_cmpmixedcase(cp, nfsrv_dnsname, nfsrv_dnsnamelen)) {
len -= (nfsrv_dnsnamelen + 1);
*(cp - 1) = '\0';
}
/*
* Check for the special case of "nogroup".
*/
if (len == 7 && !NFSBCMP(str, "nogroup", 7)) {
*gidp = nfsrv_defaultgid;
error = 0;
goto out;
}
hp = NFSGROUPNAMEHASH(str, len);
mtx_lock(&hp->mtx);
TAILQ_FOREACH(usrp, &hp->lughead, lug_namehash) {
if (usrp->lug_namelen == len &&
!NFSBCMP(usrp->lug_name, str, len)) {
if (usrp->lug_expiry < NFSD_MONOSEC)
break;
hp2 = NFSGROUPHASH(usrp->lug_gid);
mtx_lock(&hp2->mtx);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_numhash);
TAILQ_INSERT_TAIL(&hp2->lughead, usrp,
lug_numhash);
*gidp = usrp->lug_gid;
mtx_unlock(&hp2->mtx);
mtx_unlock(&hp->mtx);
error = 0;
goto out;
}
}
mtx_unlock(&hp->mtx);
cnt++;
ret = nfsrv_getuser(RPCNFSUSERD_GETGROUP, (uid_t)0, (gid_t)0,
str, p);
if (ret == 0 && cnt < 2)
goto tryagain;
}
error = NFSERR_BADOWNER;
out:
NFSEXITCODE(error);
return (error);
}
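
The strtoul()/end-pointer test above is how an all-digits owner string is recognized before any cache or upcall work is done: the conversion consumed the whole string iff the end pointer advanced exactly len characters. A stand-alone illustration (is_numeric is a made-up helper, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Return 1 iff the first len characters of str are all decimal digits. */
static int
is_numeric(const char *str, int len, unsigned long *valp)
{
	char *endstr;

	*valp = strtoul(str, &endstr, 10);
	return ((endstr - str) == len);
}

int
main(void)
{
	unsigned long v;

	printf("%d\n", is_numeric("1001", 4, &v));			/* 1 */
	printf("%d\n", is_numeric("staff@example.org", 17, &v));	/* 0 */
	return (0);
}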
/*
* Cmp len chars, allowing mixed case in the first argument to match lower
* case in the second, but not if the first argument is all upper case.
* Return 0 for a match, 1 otherwise.
*/
static int
nfsrv_cmpmixedcase(u_char *cp, u_char *cp2, int len)
{
int i;
u_char tmp;
int fndlower = 0;
for (i = 0; i < len; i++) {
if (*cp >= 'A' && *cp <= 'Z') {
tmp = *cp++ + ('a' - 'A');
} else {
tmp = *cp++;
if (tmp >= 'a' && tmp <= 'z')
fndlower = 1;
}
if (tmp != *cp2++)
return (1);
}
if (fndlower)
return (0);
else
return (1);
}
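
A self-contained copy of the comparison above shows why a mixed-case domain such as "Example.ORG" matches "example.org" while an all-upper-case candidate is rejected (cmpmixedcase here is an illustrative duplicate, not the kernel symbol):

#include <stdio.h>

/*
 * Compare len chars: upper case in s1 matches lower case in s2, but an
 * all-upper-case s1 is rejected.  Returns 0 on match, 1 otherwise.
 */
static int
cmpmixedcase(const unsigned char *s1, const unsigned char *s2, int len)
{
	int i, fndlower = 0;
	unsigned char tmp;

	for (i = 0; i < len; i++) {
		if (*s1 >= 'A' && *s1 <= 'Z') {
			tmp = *s1++ + ('a' - 'A');	/* fold to lower */
		} else {
			tmp = *s1++;
			if (tmp >= 'a' && tmp <= 'z')
				fndlower = 1;
		}
		if (tmp != *s2++)
			return (1);
	}
	return (fndlower ? 0 : 1);
}

int
main(void)
{
	printf("%d\n", cmpmixedcase((const unsigned char *)"Example.ORG",
	    (const unsigned char *)"example.org", 11));	/* 0: match */
	printf("%d\n", cmpmixedcase((const unsigned char *)"EXAMPLE.ORG",
	    (const unsigned char *)"example.org", 11));	/* 1: all upper */
	return (0);
}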
/*
* Set the port for the nfsuserd.
*/
APPLESTATIC int
nfsrv_nfsuserdport(struct sockaddr *sad, u_short port, NFSPROC_T *p)
{
struct nfssockreq *rp;
struct sockaddr_in *ad;
int error;
NFSLOCKNAMEID();
if (nfsrv_nfsuserd) {
NFSUNLOCKNAMEID();
error = EPERM;
NFSSOCKADDRFREE(sad);
goto out;
}
nfsrv_nfsuserd = 1;
NFSUNLOCKNAMEID();
/*
* Set up the socket record and connect.
*/
rp = &nfsrv_nfsuserdsock;
rp->nr_client = NULL;
rp->nr_cred = NULL;
rp->nr_lock = (NFSR_RESERVEDPORT | NFSR_LOCALHOST);
if (sad != NULL) {
/* Use the AF_LOCAL socket address passed in. */
rp->nr_sotype = SOCK_STREAM;
rp->nr_soproto = 0;
rp->nr_nam = sad;
} else {
/* Use the port# for a UDP socket (old nfsuserd). */
rp->nr_sotype = SOCK_DGRAM;
rp->nr_soproto = IPPROTO_UDP;
NFSSOCKADDRALLOC(rp->nr_nam);
NFSSOCKADDRSIZE(rp->nr_nam, sizeof (struct sockaddr_in));
ad = NFSSOCKADDR(rp->nr_nam, struct sockaddr_in *);
ad->sin_family = AF_INET;
ad->sin_addr.s_addr = htonl((u_int32_t)0x7f000001);
ad->sin_port = port;
}
rp->nr_prog = RPCPROG_NFSUSERD;
rp->nr_vers = RPCNFSUSERD_VERS;
error = newnfs_connect(NULL, rp, NFSPROCCRED(p), p, 0);
if (error) {
NFSSOCKADDRFREE(rp->nr_nam);
nfsrv_nfsuserd = 0;
}
out:
NFSEXITCODE(error);
return (error);
}
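
In the UDP fallback above the daemon is always reached on loopback; 0x7f000001 is 127.0.0.1 in host byte order, converted with htonl() before being stored. A hedged user-space sketch of filling in an equivalent sockaddr_in (the port 765 is only an example value):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
	struct sockaddr_in ad;
	char buf[INET_ADDRSTRLEN];

	memset(&ad, 0, sizeof(ad));
	ad.sin_family = AF_INET;
	ad.sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */
	ad.sin_port = htons(765);		/* example port only */
	printf("%s:%u\n", inet_ntop(AF_INET, &ad.sin_addr, buf, sizeof(buf)),
	    ntohs(ad.sin_port));
	return (0);
}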
/*
* Delete the nfsuserd port.
*/
APPLESTATIC void
nfsrv_nfsuserddelport(void)
{
NFSLOCKNAMEID();
if (nfsrv_nfsuserd == 0) {
NFSUNLOCKNAMEID();
return;
}
nfsrv_nfsuserd = 0;
NFSUNLOCKNAMEID();
newnfs_disconnect(&nfsrv_nfsuserdsock);
NFSSOCKADDRFREE(nfsrv_nfsuserdsock.nr_nam);
}
/*
* Do upcalls to the nfsuserd, for cache misses of the owner/ownergroup
* name<-->id cache.
* Returns 0 upon success, non-zero otherwise.
*/
static int
nfsrv_getuser(int procnum, uid_t uid, gid_t gid, char *name, NFSPROC_T *p)
{
u_int32_t *tl;
struct nfsrv_descript *nd;
int len;
struct nfsrv_descript nfsd;
struct ucred *cred;
int error;
NFSLOCKNAMEID();
if (nfsrv_nfsuserd == 0) {
NFSUNLOCKNAMEID();
error = EPERM;
goto out;
}
NFSUNLOCKNAMEID();
nd = &nfsd;
cred = newnfs_getcred();
nd->nd_flag = ND_GSSINITREPLY;
nfsrvd_rephead(nd);
nd->nd_procnum = procnum;
if (procnum == RPCNFSUSERD_GETUID || procnum == RPCNFSUSERD_GETGID) {
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
if (procnum == RPCNFSUSERD_GETUID)
*tl = txdr_unsigned(uid);
else
*tl = txdr_unsigned(gid);
} else {
len = strlen(name);
(void) nfsm_strtom(nd, name, len);
}
error = newnfs_request(nd, NULL, NULL, &nfsrv_nfsuserdsock, NULL, NULL,
cred, RPCPROG_NFSUSERD, RPCNFSUSERD_VERS, NULL, 0, NULL, NULL);
NFSFREECRED(cred);
if (!error) {
mbuf_freem(nd->nd_mrep);
error = nd->nd_repstat;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function is called from the nfssvc(2) system call, to update the
* kernel user/group name list(s) for the V4 owner and ownergroup attributes.
*/
APPLESTATIC int
nfssvc_idname(struct nfsd_idargs *nidp)
{
struct nfsusrgrp *nusrp, *usrp, *newusrp;
struct nfsrv_lughash *hp_name, *hp_idnum, *thp;
int i, group_locked, groupname_locked, user_locked, username_locked;
int error = 0;
u_char *cp;
gid_t *grps;
struct ucred *cr;
static int onethread = 0;
static time_t lasttime = 0;
if (nidp->nid_namelen <= 0 || nidp->nid_namelen > MAXHOSTNAMELEN) {
error = EINVAL;
goto out;
}
if (nidp->nid_flag & NFSID_INITIALIZE) {
cp = malloc(nidp->nid_namelen + 1, M_NFSSTRING, M_WAITOK);
error = copyin(CAST_USER_ADDR_T(nidp->nid_name), cp,
nidp->nid_namelen);
if (error != 0) {
free(cp, M_NFSSTRING);
goto out;
}
if (atomic_cmpset_acq_int(&nfsrv_dnsnamelen, 0, 0) == 0) {
/*
* Free up all the old stuff and reinitialize hash
* lists. All mutexes for both lists must be locked,
* with the user/group name ones before the uid/gid
* ones, to avoid a LOR.
*/
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsuserhash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_FOREACH_SAFE(usrp,
&nfsuserhash[i].lughead, lug_numhash, nusrp)
nfsrv_removeuser(usrp, 1);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsuserhash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsusernamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgrouphash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_FOREACH_SAFE(usrp,
&nfsgrouphash[i].lughead, lug_numhash,
nusrp)
nfsrv_removeuser(usrp, 0);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgrouphash[i].mtx);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgroupnamehash[i].mtx);
free(nfsrv_dnsname, M_NFSSTRING);
nfsrv_dnsname = NULL;
}
if (nfsuserhash == NULL) {
/* Allocate the hash tables. */
nfsuserhash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsuserhash[i].mtx, "nfsuidhash",
NULL, MTX_DEF | MTX_DUPOK);
nfsusernamehash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsusernamehash[i].mtx,
"nfsusrhash", NULL, MTX_DEF |
MTX_DUPOK);
nfsgrouphash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsgrouphash[i].mtx, "nfsgidhash",
NULL, MTX_DEF | MTX_DUPOK);
nfsgroupnamehash = malloc(sizeof(struct nfsrv_lughash) *
nfsrv_lughashsize, M_NFSUSERGROUP, M_WAITOK |
M_ZERO);
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_init(&nfsgroupnamehash[i].mtx,
"nfsgrphash", NULL, MTX_DEF | MTX_DUPOK);
}
/* (Re)initialize the list heads. */
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsuserhash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsusernamehash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsgrouphash[i].lughead);
for (i = 0; i < nfsrv_lughashsize; i++)
TAILQ_INIT(&nfsgroupnamehash[i].lughead);
/*
* Put name in "DNS" string.
*/
nfsrv_dnsname = cp;
nfsrv_defaultuid = nidp->nid_uid;
nfsrv_defaultgid = nidp->nid_gid;
nfsrv_usercnt = 0;
nfsrv_usermax = nidp->nid_usermax;
atomic_store_rel_int(&nfsrv_dnsnamelen, nidp->nid_namelen);
goto out;
}
/*
* malloc the new one now, so any potential sleep occurs before
* manipulation of the lists.
*/
newusrp = malloc(sizeof(struct nfsusrgrp) + nidp->nid_namelen,
M_NFSUSERGROUP, M_WAITOK | M_ZERO);
error = copyin(CAST_USER_ADDR_T(nidp->nid_name), newusrp->lug_name,
nidp->nid_namelen);
if (error == 0 && nidp->nid_ngroup > 0 &&
(nidp->nid_flag & NFSID_ADDUID) != 0) {
grps = malloc(sizeof(gid_t) * nidp->nid_ngroup, M_TEMP,
M_WAITOK);
error = copyin(CAST_USER_ADDR_T(nidp->nid_grps), grps,
sizeof(gid_t) * nidp->nid_ngroup);
if (error == 0) {
/*
* Create a credential just like svc_getcred(),
* but using the group list provided.
*/
cr = crget();
cr->cr_uid = cr->cr_ruid = cr->cr_svuid = nidp->nid_uid;
crsetgroups(cr, nidp->nid_ngroup, grps);
cr->cr_rgid = cr->cr_svgid = cr->cr_groups[0];
cr->cr_prison = &prison0;
prison_hold(cr->cr_prison);
#ifdef MAC
mac_cred_associate_nfsd(cr);
#endif
newusrp->lug_cred = cr;
}
free(grps, M_TEMP);
}
if (error) {
free(newusrp, M_NFSUSERGROUP);
goto out;
}
newusrp->lug_namelen = nidp->nid_namelen;
/*
* The lock order is username[0]->[nfsrv_lughashsize - 1] followed
* by uid[0]->[nfsrv_lughashsize - 1], with the same for group.
* The flags user_locked, username_locked, group_locked and
* groupname_locked are set to indicate all of those hash lists are
* locked. hp_name != NULL and hp_idnum != NULL indicate that
* the mutex for that single hash chain is locked.
*/
user_locked = username_locked = group_locked = groupname_locked = 0;
hp_name = hp_idnum = NULL;
/*
* Delete old entries, as required.
*/
if (nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID)) {
/* Must lock all username hash lists first, to avoid a LOR. */
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
username_locked = 1;
hp_idnum = NFSUSERHASH(nidp->nid_uid);
mtx_lock(&hp_idnum->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash,
nusrp) {
if (usrp->lug_uid == nidp->nid_uid)
nfsrv_removeuser(usrp, 1);
}
} else if (nidp->nid_flag & (NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) {
hp_name = NFSUSERNAMEHASH(newusrp->lug_name,
newusrp->lug_namelen);
mtx_lock(&hp_name->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash,
nusrp) {
if (usrp->lug_namelen == newusrp->lug_namelen &&
!NFSBCMP(usrp->lug_name, newusrp->lug_name,
usrp->lug_namelen)) {
thp = NFSUSERHASH(usrp->lug_uid);
mtx_lock(&thp->mtx);
nfsrv_removeuser(usrp, 1);
mtx_unlock(&thp->mtx);
}
}
hp_idnum = NFSUSERHASH(nidp->nid_uid);
mtx_lock(&hp_idnum->mtx);
} else if (nidp->nid_flag & (NFSID_DELGID | NFSID_ADDGID)) {
/* Must lock all groupname hash lists first, to avoid a LOR. */
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
groupname_locked = 1;
hp_idnum = NFSGROUPHASH(nidp->nid_gid);
mtx_lock(&hp_idnum->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_idnum->lughead, lug_numhash,
nusrp) {
if (usrp->lug_gid == nidp->nid_gid)
nfsrv_removeuser(usrp, 0);
}
} else if (nidp->nid_flag & (NFSID_DELGROUPNAME | NFSID_ADDGROUPNAME)) {
hp_name = NFSGROUPNAMEHASH(newusrp->lug_name,
newusrp->lug_namelen);
mtx_lock(&hp_name->mtx);
TAILQ_FOREACH_SAFE(usrp, &hp_name->lughead, lug_namehash,
nusrp) {
if (usrp->lug_namelen == newusrp->lug_namelen &&
!NFSBCMP(usrp->lug_name, newusrp->lug_name,
usrp->lug_namelen)) {
thp = NFSGROUPHASH(usrp->lug_gid);
mtx_lock(&thp->mtx);
nfsrv_removeuser(usrp, 0);
mtx_unlock(&thp->mtx);
}
}
hp_idnum = NFSGROUPHASH(nidp->nid_gid);
mtx_lock(&hp_idnum->mtx);
}
/*
* Now, we can add the new one.
*/
if (nidp->nid_usertimeout)
newusrp->lug_expiry = NFSD_MONOSEC + nidp->nid_usertimeout;
else
newusrp->lug_expiry = NFSD_MONOSEC + 5;
if (nidp->nid_flag & (NFSID_ADDUID | NFSID_ADDUSERNAME)) {
newusrp->lug_uid = nidp->nid_uid;
thp = NFSUSERHASH(newusrp->lug_uid);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash);
thp = NFSUSERNAMEHASH(newusrp->lug_name, newusrp->lug_namelen);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash);
atomic_add_int(&nfsrv_usercnt, 1);
} else if (nidp->nid_flag & (NFSID_ADDGID | NFSID_ADDGROUPNAME)) {
newusrp->lug_gid = nidp->nid_gid;
thp = NFSGROUPHASH(newusrp->lug_gid);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_numhash);
thp = NFSGROUPNAMEHASH(newusrp->lug_name, newusrp->lug_namelen);
mtx_assert(&thp->mtx, MA_OWNED);
TAILQ_INSERT_TAIL(&thp->lughead, newusrp, lug_namehash);
atomic_add_int(&nfsrv_usercnt, 1);
} else {
if (newusrp->lug_cred != NULL)
crfree(newusrp->lug_cred);
free(newusrp, M_NFSUSERGROUP);
}
/*
* Once per second, allow one thread to trim the cache.
*/
if (lasttime < NFSD_MONOSEC &&
atomic_cmpset_acq_int(&onethread, 0, 1) != 0) {
/*
* First, unlock the single mutexes, so that all entries
* can be locked and any LOR is avoided.
*/
if (hp_name != NULL) {
mtx_unlock(&hp_name->mtx);
hp_name = NULL;
}
if (hp_idnum != NULL) {
mtx_unlock(&hp_idnum->mtx);
hp_idnum = NULL;
}
if ((nidp->nid_flag & (NFSID_DELUID | NFSID_ADDUID |
NFSID_DELUSERNAME | NFSID_ADDUSERNAME)) != 0) {
if (username_locked == 0) {
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsusernamehash[i].mtx);
username_locked = 1;
}
KASSERT(user_locked == 0,
("nfssvc_idname: user_locked"));
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsuserhash[i].mtx);
user_locked = 1;
for (i = 0; i < nfsrv_lughashsize; i++) {
TAILQ_FOREACH_SAFE(usrp,
&nfsuserhash[i].lughead, lug_numhash,
nusrp)
if (usrp->lug_expiry < NFSD_MONOSEC)
nfsrv_removeuser(usrp, 1);
}
for (i = 0; i < nfsrv_lughashsize; i++) {
/*
* Trim the cache using an approximate LRU
* algorithm. This code deletes the least
* recently used entry on each hash list.
*/
if (nfsrv_usercnt <= nfsrv_usermax)
break;
usrp = TAILQ_FIRST(&nfsuserhash[i].lughead);
if (usrp != NULL)
nfsrv_removeuser(usrp, 1);
}
} else {
if (groupname_locked == 0) {
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgroupnamehash[i].mtx);
groupname_locked = 1;
}
KASSERT(group_locked == 0,
("nfssvc_idname: group_locked"));
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_lock(&nfsgrouphash[i].mtx);
group_locked = 1;
for (i = 0; i < nfsrv_lughashsize; i++) {
TAILQ_FOREACH_SAFE(usrp,
&nfsgrouphash[i].lughead, lug_numhash,
nusrp)
if (usrp->lug_expiry < NFSD_MONOSEC)
nfsrv_removeuser(usrp, 0);
}
for (i = 0; i < nfsrv_lughashsize; i++) {
/*
* Trim the cache using an approximate LRU
* algorithm. This code deletes the least
* recently used entry on each hash list.
*/
if (nfsrv_usercnt <= nfsrv_usermax)
break;
usrp = TAILQ_FIRST(&nfsgrouphash[i].lughead);
if (usrp != NULL)
nfsrv_removeuser(usrp, 0);
}
}
lasttime = NFSD_MONOSEC;
atomic_store_rel_int(&onethread, 0);
}
/* Now, unlock all locked mutexes. */
if (hp_idnum != NULL)
mtx_unlock(&hp_idnum->mtx);
if (hp_name != NULL)
mtx_unlock(&hp_name->mtx);
if (user_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsuserhash[i].mtx);
if (username_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsusernamehash[i].mtx);
if (group_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgrouphash[i].mtx);
if (groupname_locked != 0)
for (i = 0; i < nfsrv_lughashsize; i++)
mtx_unlock(&nfsgroupnamehash[i].mtx);
out:
NFSEXITCODE(error);
return (error);
}
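
The trim passes above depend on a convention used throughout this cache: a lookup hit moves the entry to the tail of its TAILQ, so TAILQ_FIRST() is always the (approximately) least recently used entry and trimming simply pops the head. A small user-space sketch of that idiom with <sys/queue.h> (struct entry and its fields are invented for the demo):

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	int id;
};
TAILQ_HEAD(entryhead, entry);

int
main(void)
{
	struct entryhead head = TAILQ_HEAD_INITIALIZER(head);
	struct entry *e;
	int i;

	for (i = 0; i < 4; i++) {
		e = malloc(sizeof(*e));
		e->id = i;
		TAILQ_INSERT_TAIL(&head, e, link);
	}
	/* A lookup hit on entry 0 moves it to the tail... */
	e = TAILQ_FIRST(&head);
	TAILQ_REMOVE(&head, e, link);
	TAILQ_INSERT_TAIL(&head, e, link);
	/* ...so trimming the head now discards entry 1, the LRU. */
	e = TAILQ_FIRST(&head);
	printf("trimming entry %d\n", e->id);
	TAILQ_REMOVE(&head, e, link);
	free(e);
	while ((e = TAILQ_FIRST(&head)) != NULL) {	/* clean up the rest */
		TAILQ_REMOVE(&head, e, link);
		free(e);
	}
	return (0);
}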
/*
* Remove a user/group name element.
*/
static void
nfsrv_removeuser(struct nfsusrgrp *usrp, int isuser)
{
struct nfsrv_lughash *hp;
if (isuser != 0) {
hp = NFSUSERHASH(usrp->lug_uid);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp = NFSUSERNAMEHASH(usrp->lug_name, usrp->lug_namelen);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash);
} else {
hp = NFSGROUPHASH(usrp->lug_gid);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp = NFSGROUPNAMEHASH(usrp->lug_name, usrp->lug_namelen);
mtx_assert(&hp->mtx, MA_OWNED);
TAILQ_REMOVE(&hp->lughead, usrp, lug_namehash);
}
atomic_add_int(&nfsrv_usercnt, -1);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
/*
* Free up all the allocations related to the name<-->id cache.
* This function should only be called when the nfsuserd daemon isn't
* running, since it doesn't do any locking.
* This function is meant to be used when the nfscommon module is unloaded.
*/
APPLESTATIC void
nfsrv_cleanusergroup(void)
{
struct nfsrv_lughash *hp, *hp2;
struct nfsusrgrp *nusrp, *usrp;
int i;
if (nfsuserhash == NULL)
return;
for (i = 0; i < nfsrv_lughashsize; i++) {
hp = &nfsuserhash[i];
TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) {
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp2 = NFSUSERNAMEHASH(usrp->lug_name,
usrp->lug_namelen);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
hp = &nfsgrouphash[i];
TAILQ_FOREACH_SAFE(usrp, &hp->lughead, lug_numhash, nusrp) {
TAILQ_REMOVE(&hp->lughead, usrp, lug_numhash);
hp2 = NFSGROUPNAMEHASH(usrp->lug_name,
usrp->lug_namelen);
TAILQ_REMOVE(&hp2->lughead, usrp, lug_namehash);
if (usrp->lug_cred != NULL)
crfree(usrp->lug_cred);
free(usrp, M_NFSUSERGROUP);
}
mtx_destroy(&nfsuserhash[i].mtx);
mtx_destroy(&nfsusernamehash[i].mtx);
mtx_destroy(&nfsgroupnamehash[i].mtx);
mtx_destroy(&nfsgrouphash[i].mtx);
}
free(nfsuserhash, M_NFSUSERGROUP);
free(nfsusernamehash, M_NFSUSERGROUP);
free(nfsgrouphash, M_NFSUSERGROUP);
free(nfsgroupnamehash, M_NFSUSERGROUP);
free(nfsrv_dnsname, M_NFSSTRING);
}
/*
* This function scans a byte string and checks for UTF-8 compliance.
* It returns 0 if it conforms and NFSERR_INVAL if not.
*/
APPLESTATIC int
nfsrv_checkutf8(u_int8_t *cp, int len)
{
u_int32_t val = 0x0;
int cnt = 0, gotd = 0, shift = 0;
u_int8_t byte;
static int utf8_shift[5] = { 7, 11, 16, 21, 26 };
int error = 0;
/*
* Here are what the variables are used for:
* val - the calculated value of a multibyte char, used to check
* that it was coded with the correct range
* cnt - the number of 10xxxxxx bytes to follow
* gotd - set for a char of Dxxx, so D800<->DFFF can be checked for
* shift - lower order bits of range (ie. "val >> shift" should
* not be 0, in other words, dividing by the lower bound
* of the range should get a non-zero value)
* byte - used to calculate cnt
*/
while (len > 0) {
if (cnt > 0) {
/* This handles the 10xxxxxx bytes */
if ((*cp & 0xc0) != 0x80 ||
(gotd && (*cp & 0x20))) {
error = NFSERR_INVAL;
goto out;
}
gotd = 0;
val <<= 6;
val |= (*cp & 0x3f);
cnt--;
if (cnt == 0 && (val >> shift) == 0x0) {
error = NFSERR_INVAL;
goto out;
}
} else if (*cp & 0x80) {
/* first byte of multi byte char */
byte = *cp;
while ((byte & 0x40) && cnt < 6) {
cnt++;
byte <<= 1;
}
if (cnt == 0 || cnt == 6) {
error = NFSERR_INVAL;
goto out;
}
val = (*cp & (0x3f >> cnt));
shift = utf8_shift[cnt - 1];
if (cnt == 2 && val == 0xd)
/* Check for the 0xd800-0xdfff case */
gotd = 1;
}
cp++;
len--;
}
if (cnt > 0)
error = NFSERR_INVAL;
out:
NFSEXITCODE(error);
return (error);
}
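
The leading-byte case above derives the number of expected 10xxxxxx continuation bytes by counting the high-order one bits that follow the first. A stand-alone helper showing just that step (utf8_contbytes is an invented name, not a routine from this file):

#include <stdio.h>

/*
 * For a UTF-8 leading byte, return the number of 10xxxxxx continuation
 * bytes expected (0 for ASCII, -1 for an invalid leading byte).
 */
static int
utf8_contbytes(unsigned char byte)
{
	int cnt = 0;

	if ((byte & 0x80) == 0)
		return (0);			/* plain ASCII */
	while ((byte & 0x40) && cnt < 6) {
		cnt++;
		byte <<= 1;
	}
	if (cnt == 0 || cnt == 6)
		return (-1);			/* bare continuation or too long */
	return (cnt);
}

int
main(void)
{
	printf("%d\n", utf8_contbytes(0x41));	/* 'A' -> 0 */
	printf("%d\n", utf8_contbytes(0xc3));	/* 2-byte lead -> 1 */
	printf("%d\n", utf8_contbytes(0xe2));	/* 3-byte lead -> 2 */
	printf("%d\n", utf8_contbytes(0x80));	/* bare continuation -> -1 */
	return (0);
}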
/*
* Parse the xdr for an NFSv4 FsLocations attribute. Return two malloc'd
* strings, one with the root path in it and the other with the list of
* locations. The list is in the same format as is found in nfr_refs.
* It is a "," separated list of entries, where each of them is of the
* form <server>:<rootpath>. For example
* "nfsv4-test:/sub2,nfsv4-test2:/user/mnt,nfsv4-test2:/user/mnt2"
* The nilp argument is set to 1 for the special case of a null fs_root
* and an empty server list.
* It returns NFSERR_BADXDR, if the xdr can't be parsed and returns the
* number of xdr bytes parsed in sump.
*/
static int
nfsrv_getrefstr(struct nfsrv_descript *nd, u_char **fsrootp, u_char **srvp,
int *sump, int *nilp)
{
u_int32_t *tl;
u_char *cp = NULL, *cp2 = NULL, *cp3, *str;
int i, j, len, stringlen, cnt, slen, siz, xdrsum, error = 0, nsrv;
struct list {
SLIST_ENTRY(list) next;
int len;
u_char host[1];
} *lsp, *nlsp;
SLIST_HEAD(, list) head;
*fsrootp = NULL;
*srvp = NULL;
*nilp = 0;
/*
* Get the fs_root path and check for the special case of null path
* and 0 length server list.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len < 0 || len > 10240) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (len == 0) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl != 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
*nilp = 1;
*sump = 2 * NFSX_UNSIGNED;
error = 0;
goto nfsmout;
}
cp = malloc(len + 1, M_NFSSTRING, M_WAITOK);
error = nfsrv_mtostr(nd, cp, len);
if (!error) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
cnt = fxdr_unsigned(int, *tl);
if (cnt <= 0)
error = NFSERR_BADXDR;
}
if (error)
goto nfsmout;
/*
* Now, loop through the location list and make up the srvlist.
*/
xdrsum = (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len);
cp2 = cp3 = malloc(1024, M_NFSSTRING, M_WAITOK);
slen = 1024;
siz = 0;
for (i = 0; i < cnt; i++) {
SLIST_INIT(&head);
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nsrv = fxdr_unsigned(int, *tl);
if (nsrv <= 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
/*
* Handle the first server by putting it in the srvstr.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
nfsrv_refstrbigenough(siz + len + 3, &cp2, &cp3, &slen);
if (cp3 != cp2) {
*cp3++ = ',';
siz++;
}
error = nfsrv_mtostr(nd, cp3, len);
if (error)
goto nfsmout;
cp3 += len;
*cp3++ = ':';
siz += (len + 1);
xdrsum += (2 * NFSX_UNSIGNED) + NFSM_RNDUP(len);
for (j = 1; j < nsrv; j++) {
/*
* Yuck, put them in an slist and process them later.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
lsp = (struct list *)malloc(sizeof (struct list)
+ len, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, lsp->host, len);
if (error)
goto nfsmout;
xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len);
lsp->len = len;
SLIST_INSERT_HEAD(&head, lsp, next);
}
/*
* Finally, we can get the path.
*/
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len <= 0 || len > 1024) {
error = NFSERR_BADXDR;
goto nfsmout;
}
nfsrv_refstrbigenough(siz + len + 1, &cp2, &cp3, &slen);
error = nfsrv_mtostr(nd, cp3, len);
if (error)
goto nfsmout;
xdrsum += NFSX_UNSIGNED + NFSM_RNDUP(len);
str = cp3;
stringlen = len;
cp3 += len;
siz += len;
SLIST_FOREACH_SAFE(lsp, &head, next, nlsp) {
nfsrv_refstrbigenough(siz + lsp->len + stringlen + 3,
&cp2, &cp3, &slen);
*cp3++ = ',';
NFSBCOPY(lsp->host, cp3, lsp->len);
cp3 += lsp->len;
*cp3++ = ':';
NFSBCOPY(str, cp3, stringlen);
cp3 += stringlen;
*cp3 = '\0';
siz += (lsp->len + stringlen + 2);
free((caddr_t)lsp, M_TEMP);
}
}
*fsrootp = cp;
*srvp = cp2;
*sump = xdrsum;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (cp != NULL)
free(cp, M_NFSSTRING);
if (cp2 != NULL)
free(cp2, M_NFSSTRING);
NFSEXITCODE2(error, nd);
return (error);
}
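
The srvlist string assembled above is the comma-separated "<server>:<rootpath>" list described in the comment before nfsrv_getrefstr(). A quick user-space sketch of walking such a string, purely to illustrate the format (none of this is code from this file):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char list[] = "nfsv4-test:/sub2,nfsv4-test2:/user/mnt";
	char *entry, *colon, *last;

	for (entry = strtok_r(list, ",", &last); entry != NULL;
	    entry = strtok_r(NULL, ",", &last)) {
		colon = strchr(entry, ':');
		if (colon == NULL)
			continue;		/* malformed entry */
		*colon = '\0';
		printf("server=%s rootpath=%s\n", entry, colon + 1);
	}
	return (0);
}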
/*
* Make the malloc'd space large enough. This is a pain, but the xdr
* doesn't set an upper bound on the size, so...
*/
static void
nfsrv_refstrbigenough(int siz, u_char **cpp, u_char **cpp2, int *slenp)
{
u_char *cp;
int i;
if (siz <= *slenp)
return;
cp = malloc(siz + 1024, M_NFSSTRING, M_WAITOK);
NFSBCOPY(*cpp, cp, *slenp);
free(*cpp, M_NFSSTRING);
i = *cpp2 - *cpp;
*cpp = cp;
*cpp2 = cp + i;
*slenp = siz + 1024;
}
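
The helper above must keep the caller's current write position valid across the reallocation, which it does by preserving the pointer's offset into the old buffer. A minimal user-space sketch of the same pattern (the 1024-byte slack mirrors the code above; everything else is invented for the demo):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
bigenough(int siz, char **cpp, char **cpp2, int *slenp)
{
	char *cp;
	int i;

	if (siz <= *slenp)
		return;
	cp = malloc(siz + 1024);
	memcpy(cp, *cpp, *slenp);
	i = *cpp2 - *cpp;		/* offset of the write position */
	free(*cpp);
	*cpp = cp;
	*cpp2 = cp + i;			/* rebase it into the new buffer */
	*slenp = siz + 1024;
}

int
main(void)
{
	int slen = 8;
	char *buf = malloc(slen), *pos;

	strcpy(buf, "abc");
	pos = buf + 3;			/* next write goes here */
	bigenough(100, &buf, &pos, &slen);
	strcpy(pos, "def");		/* still appends after "abc" */
	printf("%s (len %d)\n", buf, slen);
	free(buf);
	return (0);
}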
/*
* Initialize the reply header data structures.
*/
APPLESTATIC void
nfsrvd_rephead(struct nfsrv_descript *nd)
{
mbuf_t mreq;
/*
* If this is a big reply, use a cluster.
*/
if ((nd->nd_flag & ND_GSSINITREPLY) == 0 &&
nfs_bigreply[nd->nd_procnum]) {
NFSMCLGET(mreq, M_WAITOK);
nd->nd_mreq = mreq;
nd->nd_mb = mreq;
} else {
NFSMGET(mreq);
nd->nd_mreq = mreq;
nd->nd_mb = mreq;
}
nd->nd_bpos = NFSMTOD(mreq, caddr_t);
mbuf_setlen(mreq, 0);
if ((nd->nd_flag & ND_GSSINITREPLY) == 0)
NFSM_BUILD(nd->nd_errp, int *, NFSX_UNSIGNED);
}
/*
* Lock a socket against others.
* Currently used to serialize connect/disconnect attempts.
*/
int
newnfs_sndlock(int *flagp)
{
struct timespec ts;
NFSLOCKSOCK();
while (*flagp & NFSR_SNDLOCK) {
*flagp |= NFSR_WANTSND;
ts.tv_sec = 0;
ts.tv_nsec = 0;
(void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR,
PZERO - 1, "nfsndlck", &ts);
}
*flagp |= NFSR_SNDLOCK;
NFSUNLOCKSOCK();
return (0);
}
/*
* Unlock the stream socket for others.
*/
void
newnfs_sndunlock(int *flagp)
{
NFSLOCKSOCK();
if ((*flagp & NFSR_SNDLOCK) == 0)
panic("nfs sndunlock");
*flagp &= ~NFSR_SNDLOCK;
if (*flagp & NFSR_WANTSND) {
*flagp &= ~NFSR_WANTSND;
wakeup((caddr_t)flagp);
}
NFSUNLOCKSOCK();
}
APPLESTATIC int
nfsv4_getipaddr(struct nfsrv_descript *nd, struct sockaddr_in *sin,
struct sockaddr_in6 *sin6, sa_family_t *saf, int *isudp)
{
struct in_addr saddr;
uint32_t portnum, *tl;
int i, j, k;
sa_family_t af = AF_UNSPEC;
char addr[64], protocol[5], *cp;
int cantparse = 0, error = 0;
uint16_t portv;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i >= 3 && i <= 4) {
error = nfsrv_mtostr(nd, protocol, i);
if (error)
goto nfsmout;
if (strcmp(protocol, "tcp") == 0) {
af = AF_INET;
*isudp = 0;
} else if (strcmp(protocol, "udp") == 0) {
af = AF_INET;
*isudp = 1;
} else if (strcmp(protocol, "tcp6") == 0) {
af = AF_INET6;
*isudp = 0;
} else if (strcmp(protocol, "udp6") == 0) {
af = AF_INET6;
*isudp = 1;
} else
cantparse = 1;
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
} else if (cantparse == 0 && i >= 11 && i < 64) {
/*
* The shortest address is 11 chars and the longest is < 64.
*/
error = nfsrv_mtostr(nd, addr, i);
if (error)
goto nfsmout;
/* Find the port# at the end and extract that. */
i = strlen(addr);
k = 0;
cp = &addr[i - 1];
/* Count back two '.'s from end to get port# field. */
for (j = 0; j < i; j++) {
if (*cp == '.') {
k++;
if (k == 2)
break;
}
cp--;
}
if (k == 2) {
/*
* The NFSv4 port# is appended as .N.N, where N is
* a decimal # in the range 0-255, just like an inet4
* address. Cheat and use inet_aton(), which will
* return a Class A address and then shift the high
* order 8 bits over to convert it to the port#.
*/
*cp++ = '\0';
if (inet_aton(cp, &saddr) == 1) {
portnum = ntohl(saddr.s_addr);
portv = (uint16_t)((portnum >> 16) |
(portnum & 0xff));
} else
cantparse = 1;
} else
cantparse = 1;
if (cantparse == 0) {
if (af == AF_INET) {
if (inet_pton(af, addr, &sin->sin_addr) == 1) {
sin->sin_len = sizeof(*sin);
sin->sin_family = AF_INET;
sin->sin_port = htons(portv);
*saf = af;
return (0);
}
} else {
if (inet_pton(af, addr, &sin6->sin6_addr)
== 1) {
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = htons(portv);
*saf = af;
return (0);
}
}
}
} else {
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
error = EPERM;
nfsmout:
return (error);
}
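
The trailing ".p1.p2" of an NFSv4 universal address encodes the port as p1 * 256 + p2; the inet_aton() shuffle above computes exactly that value. A small user-space check of the arithmetic (the address below is a documentation example, not anything from this file):

#include <stdio.h>

int
main(void)
{
	const char *uaddr = "192.0.2.7.8.1";	/* 8 * 256 + 1 = port 2049 */
	unsigned int a, b, c, d, p1, p2;

	if (sscanf(uaddr, "%u.%u.%u.%u.%u.%u", &a, &b, &c, &d, &p1, &p2) == 6)
		printf("addr=%u.%u.%u.%u port=%u\n", a, b, c, d,
		    p1 * 256 + p2);
	return (0);
}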
/*
* Handle an NFSv4.1 Sequence request for the session.
* If reply != NULL, use it to return the cached reply, as required.
* The client gets a cached reply via this call for callbacks, whereas the
* server gets a cached reply via the nfsv4_seqsess_cacherep() call.
*/
int
nfsv4_seqsession(uint32_t seqid, uint32_t slotid, uint32_t highslot,
struct nfsslot *slots, struct mbuf **reply, uint16_t maxslot)
{
int error;
error = 0;
if (reply != NULL)
*reply = NULL;
if (slotid > maxslot)
return (NFSERR_BADSLOT);
if (seqid == slots[slotid].nfssl_seq) {
/* A retry. */
if (slots[slotid].nfssl_inprog != 0)
error = NFSERR_DELAY;
else if (slots[slotid].nfssl_reply != NULL) {
if (reply != NULL) {
*reply = slots[slotid].nfssl_reply;
slots[slotid].nfssl_reply = NULL;
}
slots[slotid].nfssl_inprog = 1;
error = NFSERR_REPLYFROMCACHE;
} else
/* No reply cached, so just do it. */
slots[slotid].nfssl_inprog = 1;
} else if ((slots[slotid].nfssl_seq + 1) == seqid) {
if (slots[slotid].nfssl_reply != NULL)
m_freem(slots[slotid].nfssl_reply);
slots[slotid].nfssl_reply = NULL;
slots[slotid].nfssl_inprog = 1;
slots[slotid].nfssl_seq++;
} else
error = NFSERR_SEQMISORDERED;
return (error);
}
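
The slot logic above reduces to three cases on the incoming sequence number relative to the one stored in the slot: equal means a retry served from the reply cache, one greater means a fresh request, anything else is misordered. A compact stand-alone restatement (the enum and function names are invented for the sketch):

#include <stdio.h>
#include <stdint.h>

enum seqres { NEW_REQUEST, RETRY, MISORDERED };

static enum seqres
seq_check(uint32_t slot_seq, uint32_t seqid)
{
	if (seqid == slot_seq)
		return (RETRY);		/* same seq#: replay, use cached reply */
	if (seqid == slot_seq + 1)
		return (NEW_REQUEST);	/* next seq#: execute and cache */
	return (MISORDERED);		/* anything else: NFSERR_SEQMISORDERED */
}

int
main(void)
{
	printf("%d %d %d\n", seq_check(7, 8), seq_check(7, 7), seq_check(7, 9));
	return (0);
}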
/*
* Cache this reply for the slot.
* Use the "rep" argument to return the cached reply if repstat is set to
* NFSERR_REPLYFROMCACHE. The client never sets repstat to this value.
*/
void
nfsv4_seqsess_cacherep(uint32_t slotid, struct nfsslot *slots, int repstat,
struct mbuf **rep)
{
if (repstat == NFSERR_REPLYFROMCACHE) {
*rep = slots[slotid].nfssl_reply;
slots[slotid].nfssl_reply = NULL;
} else {
if (slots[slotid].nfssl_reply != NULL)
m_freem(slots[slotid].nfssl_reply);
slots[slotid].nfssl_reply = *rep;
}
slots[slotid].nfssl_inprog = 0;
}
/*
* Generate the xdr for an NFSv4.1 Sequence Operation.
*/
APPLESTATIC void
nfsv4_setsequence(struct nfsmount *nmp, struct nfsrv_descript *nd,
struct nfsclsession *sep, int dont_replycache)
{
uint32_t *tl, slotseq = 0;
int error, maxslot, slotpos;
uint8_t sessionid[NFSX_V4SESSIONID];
error = nfsv4_sequencelookup(nmp, sep, &slotpos, &maxslot, &slotseq,
sessionid);
/* Build the Sequence arguments. */
NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED);
nd->nd_sequence = tl;
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
nd->nd_slotseq = tl;
if (error == 0) {
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl++ = txdr_unsigned(maxslot);
if (dont_replycache == 0)
*tl = newnfs_true;
else
*tl = newnfs_false;
} else {
/*
* There are two possible errors, and for both the rest of the
* Sequence arguments can just be zeros.
* NFSERR_BADSESSION: This bad session should just generate
* the same error again when the RPC is retried.
* ESTALE: A forced dismount is in progress and will cause the
* RPC to fail later.
*/
*tl++ = 0;
*tl++ = 0;
*tl++ = 0;
*tl = 0;
}
nd->nd_flag |= ND_HASSEQUENCE;
}
int
nfsv4_sequencelookup(struct nfsmount *nmp, struct nfsclsession *sep,
int *slotposp, int *maxslotp, uint32_t *slotseqp, uint8_t *sessionid)
{
int i, maxslot, slotpos;
uint64_t bitval;
/* Find an unused slot. */
slotpos = -1;
maxslot = -1;
mtx_lock(&sep->nfsess_mtx);
do {
if (nmp != NULL && sep->nfsess_defunct != 0) {
/* Just return the bad session. */
bcopy(sep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID);
mtx_unlock(&sep->nfsess_mtx);
return (NFSERR_BADSESSION);
}
bitval = 1;
for (i = 0; i < sep->nfsess_foreslots; i++) {
if ((bitval & sep->nfsess_slots) == 0) {
slotpos = i;
sep->nfsess_slots |= bitval;
sep->nfsess_slotseq[i]++;
*slotseqp = sep->nfsess_slotseq[i];
break;
}
bitval <<= 1;
}
if (slotpos == -1) {
/*
* If a forced dismount is in progress, just return.
* This RPC attempt will fail when it calls
* newnfs_request().
*/
if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
mtx_unlock(&sep->nfsess_mtx);
return (ESTALE);
}
/* Wake up once/sec, to check for a forced dismount. */
(void)mtx_sleep(&sep->nfsess_slots, &sep->nfsess_mtx,
PZERO, "nfsclseq", hz);
}
} while (slotpos == -1);
/* Now, find the highest slot in use. (nfsess_slots is 64 bits) */
bitval = 1;
for (i = 0; i < 64; i++) {
if ((bitval & sep->nfsess_slots) != 0)
maxslot = i;
bitval <<= 1;
}
bcopy(sep->nfsess_sessionid, sessionid, NFSX_V4SESSIONID);
mtx_unlock(&sep->nfsess_mtx);
*slotposp = slotpos;
*maxslotp = maxslot;
return (0);
}
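
Slot ownership is tracked as bits in a 64-bit word: the first clear bit among the fore-channel slots is claimed, and the highest set bit becomes the highest-in-use slot reported to the server. A lock-free user-space sketch of just that bookkeeping (alloc_slot is an invented name; the real routine also sleeps and handles forced dismounts):

#include <stdio.h>
#include <stdint.h>

static int
alloc_slot(uint64_t *slots, int foreslots, int *maxslotp)
{
	uint64_t bitval = 1;
	int i, slotpos = -1, maxslot = -1;

	for (i = 0; i < foreslots; i++) {
		if ((*slots & bitval) == 0) {
			slotpos = i;		/* claim the first free slot */
			*slots |= bitval;
			break;
		}
		bitval <<= 1;
	}
	bitval = 1;
	for (i = 0; i < 64; i++) {		/* highest slot in use */
		if ((*slots & bitval) != 0)
			maxslot = i;
		bitval <<= 1;
	}
	*maxslotp = maxslot;
	return (slotpos);			/* -1 means all slots busy */
}

int
main(void)
{
	uint64_t slots = 0x5;			/* slots 0 and 2 already busy */
	int maxslot, pos = alloc_slot(&slots, 16, &maxslot);

	printf("got slot %d, maxslot %d\n", pos, maxslot);	/* 1, 2 */
	return (0);
}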
/*
* Free a session slot.
*/
APPLESTATIC void
nfsv4_freeslot(struct nfsclsession *sep, int slot)
{
uint64_t bitval;
bitval = 1;
if (slot > 0)
bitval <<= slot;
mtx_lock(&sep->nfsess_mtx);
if ((bitval & sep->nfsess_slots) == 0)
printf("freeing free slot!!\n");
sep->nfsess_slots &= ~bitval;
wakeup(&sep->nfsess_slots);
mtx_unlock(&sep->nfsess_mtx);
}
Index: head/sys/fs/nfsclient/nfs_clstate.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clstate.c (revision 327172)
+++ head/sys/fs/nfsclient/nfs_clstate.c (revision 327173)
@@ -1,5358 +1,5353 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Rick Macklem, University of Guelph
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* These functions implement the client side state handling for NFSv4.
* NFSv4 state handling:
* - A lockowner is used to determine lock contention, so it
* corresponds directly to a Posix pid. (1 to 1 mapping)
* - The correct granularity of an OpenOwner is not nearly so
* obvious. An OpenOwner does the following:
* - provides a serial sequencing of Open/Close/Lock-with-new-lockowner
* - is used to check for Open/Share contention (not applicable to
* this client, since all Opens are Deny_None)
* As such, I considered both extremes.
* 1 OpenOwner per ClientID - Simple to manage, but fully serializes
* all Open, Close and Lock (with a new lockowner) Ops.
* 1 OpenOwner for each Open - This one results in an OpenConfirm for
* every Open, for most servers.
* So, I chose to use the same mapping as I did for LockOwners.
* The main concern here is that you can end up with multiple Opens
* for the same File Handle, but on different OpenOwners (opens
* inherited from parents, grandparents...) and you do not know
* which of these the vnodeop close applies to. This is handled by
* delaying the Close Op(s) until all of the Opens have been closed.
* (It is not yet obvious if this is the correct granularity.)
* - How the code handles serialization:
* - For the ClientId, it uses an exclusive lock while getting its
* SetClientId and during recovery. Otherwise, it uses a shared
* lock via a reference count.
* - For the rest of the data structures, it uses an SMP mutex
* (once the nfs client is SMP safe) and doesn't sleep while
* manipulating the linked lists.
* - The serialization of Open/Close/Lock/LockU falls out in the
* "wash", since OpenOwners and LockOwners are both mapped from
* Posix pid. In other words, there is only one Posix pid using
* any given owner, so that owner is serialized. (If you change
* the granularity of the OpenOwner, then code must be added to
* serialize Ops on the OpenOwner.)
* - When to get rid of OpenOwners and LockOwners.
* - The function nfscl_cleanup_common() is executed after a process exits.
* It goes through the client list looking for all Open and Lock Owners.
* When one is found, it is marked "defunct" or in the case of
* an OpenOwner without any Opens, freed.
* The renew thread scans for defunct Owners and gets rid of them,
* if it can. The LockOwners will also be deleted when the
* associated Open is closed.
* - If the LockU or Close Op(s) fail during close in a way
* that could be recovered upon retry, they are relinked to the
* ClientId's defunct open list and retried by the renew thread
* until they succeed or an unmount/recovery occurs.
* (Since we are done with them, they do not need to be recovered.)
*/
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>
/*
* Global variables
*/
extern struct nfsstatsv1 nfsstatsv1;
extern struct nfsreqhead nfsd_reqq;
extern u_int32_t newnfs_false, newnfs_true;
extern int nfscl_debuglevel;
extern int nfscl_enablecallb;
extern int nfs_numnfscbd;
NFSREQSPINLOCK;
NFSCLSTATEMUTEX;
int nfscl_inited = 0;
struct nfsclhead nfsclhead; /* Head of clientid list */
int nfscl_deleghighwater = NFSCLDELEGHIGHWATER;
int nfscl_layouthighwater = NFSCLLAYOUTHIGHWATER;
#endif /* !APPLEKEXT */
static int nfscl_delegcnt = 0;
static int nfscl_layoutcnt = 0;
static int nfscl_getopen(struct nfsclownerhead *, u_int8_t *, int, u_int8_t *,
u_int8_t *, u_int32_t, struct nfscllockowner **, struct nfsclopen **);
static void nfscl_clrelease(struct nfsclclient *);
static void nfscl_cleanclient(struct nfsclclient *);
static void nfscl_expireclient(struct nfsclclient *, struct nfsmount *,
struct ucred *, NFSPROC_T *);
static int nfscl_expireopen(struct nfsclclient *, struct nfsclopen *,
struct nfsmount *, struct ucred *, NFSPROC_T *);
static void nfscl_recover(struct nfsclclient *, struct ucred *, NFSPROC_T *);
static void nfscl_insertlock(struct nfscllockowner *, struct nfscllock *,
struct nfscllock *, int);
static int nfscl_updatelock(struct nfscllockowner *, struct nfscllock **,
struct nfscllock **, int);
static void nfscl_delegreturnall(struct nfsclclient *, NFSPROC_T *);
static u_int32_t nfscl_nextcbident(void);
static mount_t nfscl_getmnt(int, uint8_t *, u_int32_t, struct nfsclclient **);
static struct nfsclclient *nfscl_getclnt(u_int32_t);
static struct nfsclclient *nfscl_getclntsess(uint8_t *);
static struct nfscldeleg *nfscl_finddeleg(struct nfsclclient *, u_int8_t *,
int);
static void nfscl_retoncloselayout(vnode_t, struct nfsclclient *, uint8_t *,
int, struct nfsclrecalllayout **);
static void nfscl_reldevinfo_locked(struct nfscldevinfo *);
static struct nfscllayout *nfscl_findlayout(struct nfsclclient *, u_int8_t *,
int);
static struct nfscldevinfo *nfscl_finddevinfo(struct nfsclclient *, uint8_t *);
static int nfscl_checkconflict(struct nfscllockownerhead *, struct nfscllock *,
u_int8_t *, struct nfscllock **);
static void nfscl_freealllocks(struct nfscllockownerhead *, int);
static int nfscl_localconflict(struct nfsclclient *, u_int8_t *, int,
struct nfscllock *, u_int8_t *, struct nfscldeleg *, struct nfscllock **);
static void nfscl_newopen(struct nfsclclient *, struct nfscldeleg *,
struct nfsclowner **, struct nfsclowner **, struct nfsclopen **,
struct nfsclopen **, u_int8_t *, u_int8_t *, int, struct ucred *, int *);
static int nfscl_moveopen(vnode_t , struct nfsclclient *,
struct nfsmount *, struct nfsclopen *, struct nfsclowner *,
struct nfscldeleg *, struct ucred *, NFSPROC_T *);
static void nfscl_totalrecall(struct nfsclclient *);
static int nfscl_relock(vnode_t , struct nfsclclient *, struct nfsmount *,
struct nfscllockowner *, struct nfscllock *, struct ucred *, NFSPROC_T *);
static int nfscl_tryopen(struct nfsmount *, vnode_t , u_int8_t *, int,
u_int8_t *, int, u_int32_t, struct nfsclopen *, u_int8_t *, int,
struct nfscldeleg **, int, u_int32_t, struct ucred *, NFSPROC_T *);
static int nfscl_trylock(struct nfsmount *, vnode_t , u_int8_t *,
int, struct nfscllockowner *, int, int, u_int64_t, u_int64_t, short,
struct ucred *, NFSPROC_T *);
static int nfsrpc_reopen(struct nfsmount *, u_int8_t *, int, u_int32_t,
struct nfsclopen *, struct nfscldeleg **, struct ucred *, NFSPROC_T *);
static void nfscl_freedeleg(struct nfscldeleghead *, struct nfscldeleg *);
static int nfscl_errmap(struct nfsrv_descript *, u_int32_t);
static void nfscl_cleanup_common(struct nfsclclient *, u_int8_t *);
static int nfscl_recalldeleg(struct nfsclclient *, struct nfsmount *,
struct nfscldeleg *, vnode_t, struct ucred *, NFSPROC_T *, int);
static void nfscl_freeopenowner(struct nfsclowner *, int);
static void nfscl_cleandeleg(struct nfscldeleg *);
static int nfscl_trydelegreturn(struct nfscldeleg *, struct ucred *,
struct nfsmount *, NFSPROC_T *);
static void nfscl_emptylockowner(struct nfscllockowner *,
struct nfscllockownerfhhead *);
static void nfscl_mergeflayouts(struct nfsclflayouthead *,
struct nfsclflayouthead *);
static int nfscl_layoutrecall(int, struct nfscllayout *, uint32_t, uint64_t,
uint64_t, uint32_t, struct nfsclrecalllayout *);
static int nfscl_seq(uint32_t, uint32_t);
static void nfscl_layoutreturn(struct nfsmount *, struct nfscllayout *,
struct ucred *, NFSPROC_T *);
static void nfscl_dolayoutcommit(struct nfsmount *, struct nfscllayout *,
struct ucred *, NFSPROC_T *);
static short nfscberr_null[] = {
0,
0,
};
static short nfscberr_getattr[] = {
NFSERR_RESOURCE,
NFSERR_BADHANDLE,
NFSERR_BADXDR,
NFSERR_RESOURCE,
NFSERR_SERVERFAULT,
0,
};
static short nfscberr_recall[] = {
NFSERR_RESOURCE,
NFSERR_BADHANDLE,
NFSERR_BADSTATEID,
NFSERR_BADXDR,
NFSERR_RESOURCE,
NFSERR_SERVERFAULT,
0,
};
static short *nfscl_cberrmap[] = {
nfscberr_null,
nfscberr_null,
nfscberr_null,
nfscberr_getattr,
nfscberr_recall
};
#define NETFAMILY(clp) \
(((clp)->nfsc_flags & NFSCLFLAGS_AFINET6) ? AF_INET6 : AF_INET)
/*
* Called for an open operation.
* If the nfhp argument is NULL, just get an openowner.
*/
APPLESTATIC int
nfscl_open(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t amode, int usedeleg,
struct ucred *cred, NFSPROC_T *p, struct nfsclowner **owpp,
struct nfsclopen **opp, int *newonep, int *retp, int lockit)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *nowp;
struct nfsclopen *op = NULL, *nop = NULL;
struct nfscldeleg *dp;
struct nfsclownerhead *ohp;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int ret;
if (newonep != NULL)
*newonep = 0;
if (opp != NULL)
*opp = NULL;
if (owpp != NULL)
*owpp = NULL;
/*
* Might need one or both of these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nowp, struct nfsclowner *, sizeof (struct nfsclowner),
M_NFSCLOWNER, M_WAITOK);
if (nfhp != NULL)
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
fhlen - 1, M_NFSCLOPEN, M_WAITOK);
ret = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
if (ret != 0) {
FREE((caddr_t)nowp, M_NFSCLOWNER);
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
return (ret);
}
/*
* Get the Open iff it already exists.
* If none found, add the new one or return error, depending upon
* "create".
*/
NFSLOCKCLSTATE();
dp = NULL;
/* First check the delegation list */
if (nfhp != NULL && usedeleg) {
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, nfhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(nfhp, dp->nfsdl_fh, fhlen)) {
if (!(amode & NFSV4OPEN_ACCESSWRITE) ||
(dp->nfsdl_flags & NFSCLDL_WRITE))
break;
dp = NULL;
break;
}
}
}
if (dp != NULL) {
nfscl_filllockowner(p->td_proc, own, F_POSIX);
ohp = &dp->nfsdl_owner;
} else {
/* For NFSv4.1 and this option, use a single open_owner. */
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, own, F_POSIX);
else
nfscl_filllockowner(p->td_proc, own, F_POSIX);
ohp = &clp->nfsc_owner;
}
/* Now, search for an openowner */
LIST_FOREACH(owp, ohp, nfsow_list) {
if (!NFSBCMP(owp->nfsow_owner, own, NFSV4CL_LOCKNAMELEN))
break;
}
/*
* Create a new open, as required.
*/
nfscl_newopen(clp, dp, &owp, &nowp, &op, &nop, own, nfhp, fhlen,
cred, newonep);
/*
* Now, check the mode on the open and return the appropriate
* value.
*/
if (retp != NULL) {
if (nfhp != NULL && dp != NULL && nop == NULL)
/* new local open on delegation */
*retp = NFSCLOPEN_SETCRED;
else
*retp = NFSCLOPEN_OK;
}
if (op != NULL && (amode & ~(op->nfso_mode))) {
op->nfso_mode |= amode;
if (retp != NULL && dp == NULL)
*retp = NFSCLOPEN_DOOPEN;
}
/*
* Serialize modifications to the open owner for multiple threads
* within the same process using a read/write sleep lock.
* For NFSv4.1 and a single OpenOwner, allow concurrent open operations
* by acquiring a shared lock. The close operations still use an
* exclusive lock for this case.
*/
if (lockit != 0) {
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp)))) {
/*
* Get a shared lock on the OpenOwner, but first
* wait for any pending exclusive lock, so that the
* exclusive locker gets priority.
*/
nfsv4_lock(&owp->nfsow_rwlock, 0, NULL,
NFSCLSTATEMUTEXPTR, NULL);
nfsv4_getref(&owp->nfsow_rwlock, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} else
nfscl_lockexcl(&owp->nfsow_rwlock, NFSCLSTATEMUTEXPTR);
}
NFSUNLOCKCLSTATE();
if (nowp != NULL)
FREE((caddr_t)nowp, M_NFSCLOWNER);
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
if (owpp != NULL)
*owpp = owp;
if (opp != NULL)
*opp = op;
return (0);
}
/*
* Create a new open, as required.
*/
static void
nfscl_newopen(struct nfsclclient *clp, struct nfscldeleg *dp,
struct nfsclowner **owpp, struct nfsclowner **nowpp, struct nfsclopen **opp,
struct nfsclopen **nopp, u_int8_t *own, u_int8_t *fhp, int fhlen,
struct ucred *cred, int *newonep)
{
struct nfsclowner *owp = *owpp, *nowp;
struct nfsclopen *op, *nop;
if (nowpp != NULL)
nowp = *nowpp;
else
nowp = NULL;
if (nopp != NULL)
nop = *nopp;
else
nop = NULL;
if (owp == NULL && nowp != NULL) {
NFSBCOPY(own, nowp->nfsow_owner, NFSV4CL_LOCKNAMELEN);
LIST_INIT(&nowp->nfsow_open);
nowp->nfsow_clp = clp;
nowp->nfsow_seqid = 0;
nowp->nfsow_defunct = 0;
nfscl_lockinit(&nowp->nfsow_rwlock);
if (dp != NULL) {
nfsstatsv1.cllocalopenowners++;
LIST_INSERT_HEAD(&dp->nfsdl_owner, nowp, nfsow_list);
} else {
nfsstatsv1.clopenowners++;
LIST_INSERT_HEAD(&clp->nfsc_owner, nowp, nfsow_list);
}
owp = *owpp = nowp;
*nowpp = NULL;
if (newonep != NULL)
*newonep = 1;
}
/* If an fhp has been specified, create an Open as well. */
if (fhp != NULL) {
/* and look for the correct open, based upon FH */
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, fhp, fhlen))
break;
}
if (op == NULL && nop != NULL) {
nop->nfso_own = owp;
nop->nfso_mode = 0;
nop->nfso_opencnt = 0;
nop->nfso_posixlock = 1;
nop->nfso_fhlen = fhlen;
NFSBCOPY(fhp, nop->nfso_fh, fhlen);
LIST_INIT(&nop->nfso_lock);
nop->nfso_stateid.seqid = 0;
nop->nfso_stateid.other[0] = 0;
nop->nfso_stateid.other[1] = 0;
nop->nfso_stateid.other[2] = 0;
KASSERT(cred != NULL, ("%s: cred NULL\n", __func__));
newnfs_copyincred(cred, &nop->nfso_cred);
if (dp != NULL) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp,
nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
nfsstatsv1.cllocalopens++;
} else {
nfsstatsv1.clopens++;
}
LIST_INSERT_HEAD(&owp->nfsow_open, nop, nfso_list);
*opp = nop;
*nopp = NULL;
if (newonep != NULL)
*newonep = 1;
} else {
*opp = op;
}
}
}
/*
* Called to find/add a delegation to a client.
*/
APPLESTATIC int
nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp,
int fhlen, struct ucred *cred, NFSPROC_T *p, struct nfscldeleg **dpp)
{
struct nfscldeleg *dp = *dpp, *tdp;
/*
* First, if we have received a Read delegation for a file on a
* read/write file system, just return it, because they aren't
* useful, imho.
*/
if (mp != NULL && dp != NULL && !NFSMNT_RDONLY(mp) &&
(dp->nfsdl_flags & NFSCLDL_READ)) {
(void) nfscl_trydelegreturn(dp, cred, VFSTONFS(mp), p);
FREE((caddr_t)dp, M_NFSCLDELEG);
*dpp = NULL;
return (0);
}
/* Look for the correct deleg, based upon FH */
NFSLOCKCLSTATE();
tdp = nfscl_finddeleg(clp, nfhp, fhlen);
if (tdp == NULL) {
if (dp == NULL) {
NFSUNLOCKCLSTATE();
return (NFSERR_BADSTATEID);
}
*dpp = NULL;
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, nfsdl_list);
LIST_INSERT_HEAD(NFSCLDELEGHASH(clp, nfhp, fhlen), dp,
nfsdl_hash);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
nfsstatsv1.cldelegates++;
nfscl_delegcnt++;
} else {
/*
* Delegation already exists, what do we do if a new one??
*/
if (dp != NULL) {
printf("Deleg already exists!\n");
FREE((caddr_t)dp, M_NFSCLDELEG);
*dpp = NULL;
} else {
*dpp = tdp;
}
}
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Find a delegation for this file handle. Return NULL upon failure.
*/
static struct nfscldeleg *
nfscl_finddeleg(struct nfsclclient *clp, u_int8_t *fhp, int fhlen)
{
struct nfscldeleg *dp;
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, fhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(dp->nfsdl_fh, fhp, fhlen))
break;
}
return (dp);
}
/*
* Get a stateid for an I/O operation. First, look for an open and iff
* found, return either a lockowner stateid or the open stateid.
* If no Open is found, just return error and the special stateid of all zeros.
*/
APPLESTATIC int
nfscl_getstateid(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t mode,
int fords, struct ucred *cred, NFSPROC_T *p, nfsv4stateid_t *stateidp,
void **lckpp)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op = NULL, *top;
struct nfscllockowner *lp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error, done;
*lckpp = NULL;
/*
* Initially, just set the special stateid of all zeros.
* (Don't do this for a DS, since the special stateid can't be used.)
*/
if (fords == 0) {
stateidp->seqid = 0;
stateidp->other[0] = 0;
stateidp->other[1] = 0;
stateidp->other[2] = 0;
}
if (vnode_vtype(vp) != VREG)
return (EISDIR);
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (EACCES);
}
/*
* Wait for recovery to complete.
*/
while ((clp->nfsc_flags & NFSCLFLAGS_RECVRINPROG))
(void) nfsmsleep(&clp->nfsc_flags, NFSCLSTATEMUTEXPTR,
PZERO, "nfsrecvr", NULL);
/*
* First, look for a delegation.
*/
LIST_FOREACH(dp, NFSCLDELEGHASH(clp, nfhp, fhlen), nfsdl_hash) {
if (dp->nfsdl_fhlen == fhlen &&
!NFSBCMP(nfhp, dp->nfsdl_fh, fhlen)) {
if (!(mode & NFSV4OPEN_ACCESSWRITE) ||
(dp->nfsdl_flags & NFSCLDL_WRITE)) {
stateidp->seqid = dp->nfsdl_stateid.seqid;
stateidp->other[0] = dp->nfsdl_stateid.other[0];
stateidp->other[1] = dp->nfsdl_stateid.other[1];
stateidp->other[2] = dp->nfsdl_stateid.other[2];
if (!(np->n_flag & NDELEGRECALL)) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp,
nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp,
nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC +
120;
dp->nfsdl_rwlock.nfslock_usecnt++;
*lckpp = (void *)&dp->nfsdl_rwlock;
}
NFSUNLOCKCLSTATE();
return (0);
}
break;
}
}
if (p != NULL) {
/*
* If p != NULL, we want to search the parentage tree
* for a matching OpenOwner and use that.
*/
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, own, F_POSIX);
else
nfscl_filllockowner(p->td_proc, own, F_POSIX);
lp = NULL;
error = nfscl_getopen(&clp->nfsc_owner, nfhp, fhlen, own, own,
mode, &lp, &op);
if (error == 0 && lp != NULL && fords == 0) {
/* Don't return a lock stateid for a DS. */
stateidp->seqid =
lp->nfsl_stateid.seqid;
stateidp->other[0] =
lp->nfsl_stateid.other[0];
stateidp->other[1] =
lp->nfsl_stateid.other[1];
stateidp->other[2] =
lp->nfsl_stateid.other[2];
NFSUNLOCKCLSTATE();
return (0);
}
}
if (op == NULL) {
/* If not found, just look for any OpenOwner that will work. */
top = NULL;
done = 0;
owp = LIST_FIRST(&clp->nfsc_owner);
while (!done && owp != NULL) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, nfhp, fhlen)) {
if (top == NULL && (op->nfso_mode &
NFSV4OPEN_ACCESSWRITE) != 0 &&
(mode & NFSV4OPEN_ACCESSREAD) != 0)
top = op;
if ((mode & op->nfso_mode) == mode) {
done = 1;
break;
}
}
}
if (!done)
owp = LIST_NEXT(owp, nfsow_list);
}
if (!done) {
NFSCL_DEBUG(2, "openmode top=%p\n", top);
if (top == NULL || NFSHASOPENMODE(nmp)) {
NFSUNLOCKCLSTATE();
return (ENOENT);
} else
op = top;
}
/*
* For read aheads or write behinds, use the open cred.
* A read ahead or write behind is indicated by p == NULL.
*/
if (p == NULL)
newnfs_copycred(&op->nfso_cred, cred);
}
/*
* No lock stateid, so return the open stateid.
*/
stateidp->seqid = op->nfso_stateid.seqid;
stateidp->other[0] = op->nfso_stateid.other[0];
stateidp->other[1] = op->nfso_stateid.other[1];
stateidp->other[2] = op->nfso_stateid.other[2];
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Search for a matching file, mode and, optionally, lockowner.
*/
static int
nfscl_getopen(struct nfsclownerhead *ohp, u_int8_t *nfhp, int fhlen,
u_int8_t *openown, u_int8_t *lockown, u_int32_t mode,
struct nfscllockowner **lpp, struct nfsclopen **opp)
{
struct nfsclowner *owp;
struct nfsclopen *op, *rop, *rop2;
struct nfscllockowner *lp;
int keep_looping;
if (lpp != NULL)
*lpp = NULL;
/*
* rop will be set to the open to be returned. There are three
* variants of this, all for an open of the correct file:
* 1 - A match of lockown.
* 2 - A match of the openown, when no lockown match exists.
* 3 - A match for any open, if no openown or lockown match exists.
* Looking for #2 over #3 probably isn't necessary, but since
* RFC3530 is vague w.r.t. the relationship between openowners and
* lockowners, I think this is the safer way to go.
*/
rop = NULL;
rop2 = NULL;
keep_looping = 1;
/* Search the client list */
owp = LIST_FIRST(ohp);
while (owp != NULL && keep_looping != 0) {
/* and look for the correct open */
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL && keep_looping != 0) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, nfhp, fhlen)
&& (op->nfso_mode & mode) == mode) {
if (lpp != NULL) {
/* Now look for a matching lockowner. */
LIST_FOREACH(lp, &op->nfso_lock,
nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner,
lockown,
NFSV4CL_LOCKNAMELEN)) {
*lpp = lp;
rop = op;
keep_looping = 0;
break;
}
}
}
if (rop == NULL && !NFSBCMP(owp->nfsow_owner,
openown, NFSV4CL_LOCKNAMELEN)) {
rop = op;
if (lpp == NULL)
keep_looping = 0;
}
if (rop2 == NULL)
rop2 = op;
}
op = LIST_NEXT(op, nfso_list);
}
owp = LIST_NEXT(owp, nfsow_list);
}
if (rop == NULL)
rop = rop2;
if (rop == NULL)
return (EBADF);
*opp = rop;
return (0);
}
/*
* Release use of an open owner. Called when open operations are done
* with the open owner.
*/
APPLESTATIC void
nfscl_ownerrelease(struct nfsmount *nmp, struct nfsclowner *owp,
__unused int error, __unused int candelete, int unlocked)
{
if (owp == NULL)
return;
NFSLOCKCLSTATE();
if (unlocked == 0) {
if (NFSHASONEOPENOWN(nmp))
nfsv4_relref(&owp->nfsow_rwlock);
else
nfscl_lockunlock(&owp->nfsow_rwlock);
}
nfscl_clrelease(owp->nfsow_clp);
NFSUNLOCKCLSTATE();
}
/*
* Release use of an open structure under an open owner.
*/
APPLESTATIC void
nfscl_openrelease(struct nfsmount *nmp, struct nfsclopen *op, int error,
int candelete)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
if (op == NULL)
return;
NFSLOCKCLSTATE();
owp = op->nfso_own;
if (NFSHASONEOPENOWN(nmp))
nfsv4_relref(&owp->nfsow_rwlock);
else
nfscl_lockunlock(&owp->nfsow_rwlock);
clp = owp->nfsow_clp;
if (error && candelete && op->nfso_opencnt == 0)
nfscl_freeopen(op, 0);
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Called to get a clientid structure. It will optionally lock the
* client data structures to do the SetClientId/SetClientId_confirm,
* but will release that lock and return the clientid with a reference
* count on it.
* If the "cred" argument is NULL, a new clientid should not be created.
* If the "p" argument is NULL, a SetClientID/SetClientIDConfirm cannot
* be done.
* The start_renewthread argument tells nfscl_getcl() to start a renew
* thread if this creates a new clp.
* It always returns a clientid in *clpp with a reference count on it,
* unless returning an error.
*/
APPLESTATIC int
nfscl_getcl(struct mount *mp, struct ucred *cred, NFSPROC_T *p,
int start_renewthread, struct nfsclclient **clpp)
{
struct nfsclclient *clp;
struct nfsclclient *newclp = NULL;
struct nfsmount *nmp;
char uuid[HOSTUUIDLEN];
int igotlock = 0, error, trystalecnt, clidinusedelay, i;
u_int16_t idlen = 0;
nmp = VFSTONFS(mp);
if (cred != NULL) {
getcredhostuuid(cred, uuid, sizeof uuid);
idlen = strlen(uuid);
if (idlen > 0)
idlen += sizeof (u_int64_t);
else
idlen += sizeof (u_int64_t) + 16; /* 16 random bytes */
MALLOC(newclp, struct nfsclclient *,
sizeof (struct nfsclclient) + idlen - 1, M_NFSCLCLIENT,
M_WAITOK | M_ZERO);
}
NFSLOCKCLSTATE();
/*
* If a forced dismount is already in progress, don't
* allocate a new clientid and get out now. For the case where
* clp != NULL, this is a harmless optimization.
*/
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
if (newclp != NULL)
free(newclp, M_NFSCLCLIENT);
return (EBADF);
}
clp = nmp->nm_clp;
if (clp == NULL) {
if (newclp == NULL) {
NFSUNLOCKCLSTATE();
return (EACCES);
}
clp = newclp;
clp->nfsc_idlen = idlen;
LIST_INIT(&clp->nfsc_owner);
TAILQ_INIT(&clp->nfsc_deleg);
TAILQ_INIT(&clp->nfsc_layout);
LIST_INIT(&clp->nfsc_devinfo);
for (i = 0; i < NFSCLDELEGHASHSIZE; i++)
LIST_INIT(&clp->nfsc_deleghash[i]);
for (i = 0; i < NFSCLLAYOUTHASHSIZE; i++)
LIST_INIT(&clp->nfsc_layouthash[i]);
clp->nfsc_flags = NFSCLFLAGS_INITED;
clp->nfsc_clientidrev = 1;
clp->nfsc_cbident = nfscl_nextcbident();
nfscl_fillclid(nmp->nm_clval, uuid, clp->nfsc_id,
clp->nfsc_idlen);
LIST_INSERT_HEAD(&nfsclhead, clp, nfsc_list);
nmp->nm_clp = clp;
clp->nfsc_nmp = nmp;
NFSUNLOCKCLSTATE();
if (start_renewthread != 0)
nfscl_start_renewthread(clp);
} else {
NFSUNLOCKCLSTATE();
if (newclp != NULL)
free(newclp, M_NFSCLCLIENT);
}
NFSLOCKCLSTATE();
while ((clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID) == 0 && !igotlock &&
!NFSCL_FORCEDISM(mp))
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, mp);
if (igotlock == 0) {
/*
* Call nfsv4_lock() with "iwantlock == 0" so that it will
* wait for a pending exclusive lock request. This gives the
* exclusive lock request priority over this shared lock
* request.
* An exclusive lock on nfsc_lock is used mainly for server
* crash recoveries.
*/
nfsv4_lock(&clp->nfsc_lock, 0, NULL, NFSCLSTATEMUTEXPTR, mp);
nfsv4_getref(&clp->nfsc_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
}
if (igotlock == 0 && NFSCL_FORCEDISM(mp)) {
/*
* Both nfsv4_lock() and nfsv4_getref() know to check
* for NFSCL_FORCEDISM() and return without sleeping to
* wait for the exclusive lock to be released, since it
* might be held by nfscl_umount() and we need to get out
* now for that case and not wait until nfscl_umount()
* releases it.
*/
NFSUNLOCKCLSTATE();
return (EBADF);
}
NFSUNLOCKCLSTATE();
/*
* If it needs a clientid, do the setclientid now.
*/
if ((clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID) == 0) {
if (!igotlock)
panic("nfscl_clget");
if (p == NULL || cred == NULL) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (EACCES);
}
/*
* If RFC3530 Sec. 14.2.33 is taken literally,
* NFSERR_CLIDINUSE will be returned persistently for the
* case where a new mount of the same file system is using
* a different principal. In practice, NFSERR_CLIDINUSE is
* only returned when there is outstanding unexpired state
* on the clientid. As such, try for twice the lease
* interval, if we know what that is. Otherwise, make a
* wild ass guess.
* The case of returning NFSERR_STALECLIENTID is far less
* likely, but might occur if there is a significant delay
* between doing the SetClientID and SetClientIDConfirm Ops,
* such that the server throws away the clientid before
* receiving the SetClientIDConfirm.
*/
if (clp->nfsc_renew > 0)
clidinusedelay = NFSCL_LEASE(clp->nfsc_renew) * 2;
else
clidinusedelay = 120;
trystalecnt = 3;
do {
error = nfsrpc_setclient(nmp, clp, 0, cred, p);
if (error == NFSERR_STALECLIENTID ||
error == NFSERR_STALEDONTRECOVER ||
error == NFSERR_BADSESSION ||
error == NFSERR_CLIDINUSE) {
(void) nfs_catnap(PZERO, error, "nfs_setcl");
}
} while (((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trystalecnt > 0) ||
(error == NFSERR_CLIDINUSE && --clidinusedelay > 0));
if (error) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (error);
}
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
}
if (igotlock) {
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 1);
NFSUNLOCKCLSTATE();
}
*clpp = clp;
return (0);
}
/*
* Get a reference to a clientid and return it, if valid.
*/
APPLESTATIC struct nfsclclient *
nfscl_findcl(struct nfsmount *nmp)
{
struct nfsclclient *clp;
clp = nmp->nm_clp;
if (clp == NULL || !(clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID))
return (NULL);
return (clp);
}
/*
* Release the clientid structure. It may be locked or reference counted.
*/
static void
nfscl_clrelease(struct nfsclclient *clp)
{
if (clp->nfsc_lock.nfslock_lock & NFSV4LOCK_LOCK)
nfsv4_unlock(&clp->nfsc_lock, 0);
else
nfsv4_relref(&clp->nfsc_lock);
}
/*
* External call for nfscl_clrelease.
*/
APPLESTATIC void
nfscl_clientrelease(struct nfsclclient *clp)
{
NFSLOCKCLSTATE();
if (clp->nfsc_lock.nfslock_lock & NFSV4LOCK_LOCK)
nfsv4_unlock(&clp->nfsc_lock, 0);
else
nfsv4_relref(&clp->nfsc_lock);
NFSUNLOCKCLSTATE();
}
/*
* Called when wanting to lock a byte region.
*/
APPLESTATIC int
nfscl_getbytelock(vnode_t vp, u_int64_t off, u_int64_t len,
short type, struct ucred *cred, NFSPROC_T *p, struct nfsclclient *rclp,
int recovery, void *id, int flags, u_int8_t *rownp, u_int8_t *ropenownp,
struct nfscllockowner **lpp, int *newonep, int *donelocallyp)
{
struct nfscllockowner *lp;
struct nfsclopen *op;
struct nfsclclient *clp;
struct nfscllockowner *nlp;
struct nfscllock *nlop, *otherlop;
struct nfscldeleg *dp = NULL, *ldp = NULL;
struct nfscllockownerhead *lhp = NULL;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN], *ownp, openown[NFSV4CL_LOCKNAMELEN];
u_int8_t *openownp;
int error = 0, ret, donelocally = 0;
u_int32_t mode;
/* For Lock Ops, the open mode doesn't matter, so use 0 to match any. */
mode = 0;
np = VTONFS(vp);
*lpp = NULL;
lp = NULL;
*newonep = 0;
*donelocallyp = 0;
/*
* Might need these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nlp, struct nfscllockowner *,
sizeof (struct nfscllockowner), M_NFSCLLOCKOWNER, M_WAITOK);
MALLOC(otherlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
MALLOC(nlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
nlop->nfslo_type = type;
nlop->nfslo_first = off;
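/*
* A length of NFS64BITSSET means the lock extends to EOF; otherwise
* compute the end offset and reject zero-length or wrapped ranges.
*/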
if (len == NFS64BITSSET) {
nlop->nfslo_end = NFS64BITSSET;
} else {
nlop->nfslo_end = off + len;
if (nlop->nfslo_end <= nlop->nfslo_first)
error = NFSERR_INVAL;
}
if (!error) {
if (recovery)
clp = rclp;
else
error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
}
if (error) {
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
FREE((caddr_t)otherlop, M_NFSCLLOCK);
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (error);
}
op = NULL;
if (recovery) {
ownp = rownp;
openownp = ropenownp;
} else {
nfscl_filllockowner(id, own, flags);
ownp = own;
if (NFSHASONEOPENOWN(VFSTONFS(vnode_mount(vp))))
nfscl_filllockowner(NULL, openown, F_POSIX);
else
nfscl_filllockowner(p->td_proc, openown, F_POSIX);
openownp = openown;
}
if (!recovery) {
NFSLOCKCLSTATE();
/*
* First, search for a delegation. If one exists for this file,
* the lock can be done locally against it, so long as there
* isn't a local lock conflict.
*/
ldp = dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
/* Just sanity check for correct type of delegation */
if (dp != NULL && ((dp->nfsdl_flags &
(NFSCLDL_RECALL | NFSCLDL_DELEGRET)) != 0 ||
(type == F_WRLCK &&
(dp->nfsdl_flags & NFSCLDL_WRITE) == 0)))
dp = NULL;
}
if (dp != NULL) {
/* Now, find an open and maybe a lockowner. */
ret = nfscl_getopen(&dp->nfsdl_owner, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len, openownp, ownp, mode, NULL, &op);
if (ret)
ret = nfscl_getopen(&clp->nfsc_owner,
np->n_fhp->nfh_fh, np->n_fhp->nfh_len, openownp,
ownp, mode, NULL, &op);
if (!ret) {
lhp = &dp->nfsdl_lock;
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, nfsdl_list);
dp->nfsdl_timestamp = NFSD_MONOSEC + 120;
donelocally = 1;
} else {
dp = NULL;
}
}
if (!donelocally) {
/*
* Get the related Open and maybe lockowner.
*/
error = nfscl_getopen(&clp->nfsc_owner,
np->n_fhp->nfh_fh, np->n_fhp->nfh_len, openownp,
ownp, mode, &lp, &op);
if (!error)
lhp = &op->nfso_lock;
}
if (!error && !recovery)
error = nfscl_localconflict(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len, nlop, ownp, ldp, NULL);
if (error) {
if (!recovery) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
FREE((caddr_t)otherlop, M_NFSCLLOCK);
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (error);
}
/*
* Ok, see if a lockowner exists and create one, as required.
*/
if (lp == NULL)
LIST_FOREACH(lp, lhp, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, ownp, NFSV4CL_LOCKNAMELEN))
break;
}
if (lp == NULL) {
NFSBCOPY(ownp, nlp->nfsl_owner, NFSV4CL_LOCKNAMELEN);
if (recovery)
NFSBCOPY(ropenownp, nlp->nfsl_openowner,
NFSV4CL_LOCKNAMELEN);
else
NFSBCOPY(op->nfso_own->nfsow_owner, nlp->nfsl_openowner,
NFSV4CL_LOCKNAMELEN);
nlp->nfsl_seqid = 0;
nlp->nfsl_lockflags = flags;
nlp->nfsl_inprog = NULL;
nfscl_lockinit(&nlp->nfsl_rwlock);
LIST_INIT(&nlp->nfsl_lock);
if (donelocally) {
nlp->nfsl_open = NULL;
nfsstatsv1.cllocallockowners++;
} else {
nlp->nfsl_open = op;
nfsstatsv1.cllockowners++;
}
LIST_INSERT_HEAD(lhp, nlp, nfsl_list);
lp = nlp;
nlp = NULL;
*newonep = 1;
}
/*
* Now, update the byte ranges for locks.
*/
ret = nfscl_updatelock(lp, &nlop, &otherlop, donelocally);
if (!ret)
donelocally = 1;
if (donelocally) {
*donelocallyp = 1;
if (!recovery)
nfscl_clrelease(clp);
} else {
/*
* Serialize modifications to the lock owner across multiple threads
* of the same process using a read/write lock.
*/
if (!recovery)
nfscl_lockexcl(&lp->nfsl_rwlock, NFSCLSTATEMUTEXPTR);
}
if (!recovery)
NFSUNLOCKCLSTATE();
if (nlp)
FREE((caddr_t)nlp, M_NFSCLLOCKOWNER);
if (nlop)
FREE((caddr_t)nlop, M_NFSCLLOCK);
if (otherlop)
FREE((caddr_t)otherlop, M_NFSCLLOCK);
*lpp = lp;
return (0);
}
/*
* Called to unlock a byte range, for LockU.
*/
APPLESTATIC int
nfscl_relbytelock(vnode_t vp, u_int64_t off, u_int64_t len,
__unused struct ucred *cred, NFSPROC_T *p, int callcnt,
struct nfsclclient *clp, void *id, int flags,
struct nfscllockowner **lpp, int *dorpcp)
{
struct nfscllockowner *lp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllock *nlop, *other_lop = NULL;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int ret = 0, fnd;
np = VTONFS(vp);
*lpp = NULL;
*dorpcp = 0;
/*
* Might need these, so MALLOC them now, to
* avoid a tsleep() in MALLOC later.
*/
MALLOC(nlop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
nlop->nfslo_type = F_UNLCK;
nlop->nfslo_first = off;
if (len == NFS64BITSSET) {
nlop->nfslo_end = NFS64BITSSET;
} else {
nlop->nfslo_end = off + len;
if (nlop->nfslo_end <= nlop->nfslo_first) {
FREE((caddr_t)nlop, M_NFSCLLOCK);
return (NFSERR_INVAL);
}
}
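/*
* On the first call (callcnt == 0), also copy the lock so that any
* local byte range locks held under a delegation can be released
* without consuming nlop.
*/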
if (callcnt == 0) {
MALLOC(other_lop, struct nfscllock *,
sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK);
*other_lop = *nlop;
}
nfscl_filllockowner(id, own, flags);
dp = NULL;
NFSLOCKCLSTATE();
if (callcnt == 0)
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
/*
* First, unlock any local regions on a delegation.
*/
if (dp != NULL) {
/* Look for this lockowner. */
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL)
/* Use other_lop, so nlop is still available */
(void)nfscl_updatelock(lp, &other_lop, NULL, 1);
}
/*
* Now, find a matching open/lockowner that hasn't already been done,
* as marked by nfsl_inprog.
*/
lp = NULL;
fnd = 0;
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lp->nfsl_inprog == NULL &&
!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN)) {
fnd = 1;
break;
}
}
if (fnd)
break;
}
}
if (fnd)
break;
}
if (lp != NULL) {
ret = nfscl_updatelock(lp, &nlop, NULL, 0);
if (ret)
*dorpcp = 1;
/*
* Serialize modifications to the lock owner across multiple
* threads of the same process using a read/write lock.
*/
lp->nfsl_inprog = p;
nfscl_lockexcl(&lp->nfsl_rwlock, NFSCLSTATEMUTEXPTR);
*lpp = lp;
}
NFSUNLOCKCLSTATE();
if (nlop)
FREE((caddr_t)nlop, M_NFSCLLOCK);
if (other_lop)
FREE((caddr_t)other_lop, M_NFSCLLOCK);
return (0);
}
/*
* Release all lockowners marked in progress for this process and file.
*/
APPLESTATIC void
nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p,
void *id, int flags)
{
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllockowner *lp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
np = VTONFS(vp);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lp->nfsl_inprog == p &&
!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN)) {
lp->nfsl_inprog = NULL;
nfscl_lockunlock(&lp->nfsl_rwlock);
}
}
}
}
}
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Called to find out if any bytes within the byte range specified are
* write locked by the calling process. Used to determine if flushing
* is required before a LockU.
* If in doubt, return 1, so the flush will occur.
*/
APPLESTATIC int
nfscl_checkwritelocked(vnode_t vp, struct flock *fl,
struct ucred *cred, NFSPROC_T *p, void *id, int flags)
{
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsclopen *op;
struct nfsclclient *clp;
struct nfscllock *lop;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int64_t off, end;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error = 0;
np = VTONFS(vp);
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
off = fl->l_start;
break;
case SEEK_END:
off = np->n_size + fl->l_start;
break;
default:
return (1);
}
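/*
* A length of 0 means the lock extends to EOF; otherwise compute the
* end offset and treat overflow as "in doubt", forcing a flush.
*/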
if (fl->l_len != 0) {
end = off + fl->l_len;
if (end < off)
return (1);
} else {
end = NFS64BITSSET;
}
error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
if (error)
return (1);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
/*
* First check the delegation locks.
*/
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= end)
break;
if (lop->nfslo_end <= off)
continue;
if (lop->nfslo_type == F_WRLCK) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (1);
}
}
}
}
/*
* Now, check state against the server.
*/
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == np->n_fhp->nfh_len &&
!NFSBCMP(op->nfso_fh, np->n_fhp->nfh_fh, op->nfso_fhlen)) {
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (!NFSBCMP(lp->nfsl_owner, own,
NFSV4CL_LOCKNAMELEN))
break;
}
if (lp != NULL) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= end)
break;
if (lop->nfslo_end <= off)
continue;
if (lop->nfslo_type == F_WRLCK) {
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (1);
}
}
}
}
}
}
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
return (0);
}
/*
* Release a byte range lock owner structure.
*/
APPLESTATIC void
nfscl_lockrelease(struct nfscllockowner *lp, int error, int candelete)
{
struct nfsclclient *clp;
if (lp == NULL)
return;
NFSLOCKCLSTATE();
clp = lp->nfsl_open->nfso_own->nfsow_clp;
if (error != 0 && candelete &&
(lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED) == 0)
nfscl_freelockowner(lp, 0);
else
nfscl_lockunlock(&lp->nfsl_rwlock);
nfscl_clrelease(clp);
NFSUNLOCKCLSTATE();
}
/*
* Free up an open structure and any associated byte range lock structures.
*/
APPLESTATIC void
nfscl_freeopen(struct nfsclopen *op, int local)
{
LIST_REMOVE(op, nfso_list);
nfscl_freealllocks(&op->nfso_lock, local);
FREE((caddr_t)op, M_NFSCLOPEN);
if (local)
nfsstatsv1.cllocalopens--;
else
nfsstatsv1.clopens--;
}
/*
* Free up all lock owners and associated locks.
*/
static void
nfscl_freealllocks(struct nfscllockownerhead *lhp, int local)
{
struct nfscllockowner *lp, *nlp;
LIST_FOREACH_SAFE(lp, lhp, nfsl_list, nlp) {
if ((lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED))
panic("nfscllckw");
nfscl_freelockowner(lp, local);
}
}
/*
* Called for an Open when NFSERR_EXPIRED is received from the server.
* If there are no byte range locks and no Share Deny would be lost,
* try to do a fresh Open. Otherwise, free the open.
*/
static int
nfscl_expireopen(struct nfsclclient *clp, struct nfsclopen *op,
struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p)
{
struct nfscllockowner *lp;
struct nfscldeleg *dp;
int mustdelete = 0, error;
/*
* Look for any byte range lock(s).
*/
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
mustdelete = 1;
break;
}
}
/*
* If no byte range lock(s) nor a Share deny, try to re-open.
*/
if (!mustdelete && (op->nfso_mode & NFSLCK_DENYBITS) == 0) {
newnfs_copycred(&op->nfso_cred, cred);
dp = NULL;
error = nfsrpc_reopen(nmp, op->nfso_fh,
op->nfso_fhlen, op->nfso_mode, op, &dp, cred, p);
if (error) {
mustdelete = 1;
if (dp != NULL) {
FREE((caddr_t)dp, M_NFSCLDELEG);
dp = NULL;
}
}
if (dp != NULL)
nfscl_deleg(nmp->nm_mountp, clp, op->nfso_fh,
op->nfso_fhlen, cred, p, &dp);
}
/*
* If a byte range lock or Share deny or couldn't re-open, free it.
*/
if (mustdelete)
nfscl_freeopen(op, 0);
return (mustdelete);
}
/*
* Free up an open owner structure.
*/
static void
nfscl_freeopenowner(struct nfsclowner *owp, int local)
{
LIST_REMOVE(owp, nfsow_list);
FREE((caddr_t)owp, M_NFSCLOWNER);
if (local)
nfsstatsv1.cllocalopenowners--;
else
nfsstatsv1.clopenowners--;
}
/*
* Free up a byte range lock owner structure.
*/
APPLESTATIC void
nfscl_freelockowner(struct nfscllockowner *lp, int local)
{
struct nfscllock *lop, *nlop;
LIST_REMOVE(lp, nfsl_list);
LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) {
nfscl_freelock(lop, local);
}
FREE((caddr_t)lp, M_NFSCLLOCKOWNER);
if (local)
nfsstatsv1.cllocallockowners--;
else
nfsstatsv1.cllockowners--;
}
/*
* Free up a byte range lock structure.
*/
APPLESTATIC void
nfscl_freelock(struct nfscllock *lop, int local)
{
LIST_REMOVE(lop, nfslo_list);
FREE((caddr_t)lop, M_NFSCLLOCK);
if (local)
nfsstatsv1.cllocallocks--;
else
nfsstatsv1.cllocks--;
}
/*
* Clean out the state related to a delegation.
*/
static void
nfscl_cleandeleg(struct nfscldeleg *dp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
LIST_FOREACH_SAFE(owp, &dp->nfsdl_owner, nfsow_list, nowp) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfscleandel");
nfscl_freeopen(op, 1);
}
nfscl_freeopenowner(owp, 1);
}
nfscl_freealllocks(&dp->nfsdl_lock, 1);
}
/*
* Free a delegation.
*/
static void
nfscl_freedeleg(struct nfscldeleghead *hdp, struct nfscldeleg *dp)
{
TAILQ_REMOVE(hdp, dp, nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
FREE((caddr_t)dp, M_NFSCLDELEG);
nfsstatsv1.cldelegates--;
nfscl_delegcnt--;
}
/*
* Free up all state related to this client structure.
*/
static void
nfscl_cleanclient(struct nfsclclient *clp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op, *nop;
struct nfscllayout *lyp, *nlyp;
struct nfscldevinfo *dip, *ndip;
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp)
nfscl_freelayout(lyp);
LIST_FOREACH_SAFE(dip, &clp->nfsc_devinfo, nfsdi_list, ndip)
nfscl_freedevinfo(dip);
/* Now, all the OpenOwners, etc. */
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
LIST_FOREACH_SAFE(op, &owp->nfsow_open, nfso_list, nop) {
nfscl_freeopen(op, 0);
}
nfscl_freeopenowner(owp, 0);
}
}
/*
* Called when an NFSERR_EXPIRED is received from the server.
*/
static void
nfscl_expireclient(struct nfsclclient *clp, struct nfsmount *nmp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp, *towp;
struct nfsclopen *op, *nop, *top;
struct nfscldeleg *dp, *ndp;
int ret, printed = 0;
/*
* First, merge locally issued Opens into the list for the server.
*/
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
owp = LIST_FIRST(&dp->nfsdl_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfsclexp");
LIST_FOREACH(towp, &clp->nfsc_owner, nfsow_list) {
if (!NFSBCMP(towp->nfsow_owner, owp->nfsow_owner,
NFSV4CL_LOCKNAMELEN))
break;
}
if (towp != NULL) {
/* Merge opens in */
LIST_FOREACH(top, &towp->nfsow_open, nfso_list) {
if (top->nfso_fhlen == op->nfso_fhlen &&
!NFSBCMP(top->nfso_fh, op->nfso_fh,
op->nfso_fhlen)) {
top->nfso_mode |= op->nfso_mode;
top->nfso_opencnt += op->nfso_opencnt;
break;
}
}
if (top == NULL) {
/* Just add the open to the owner list */
LIST_REMOVE(op, nfso_list);
op->nfso_own = towp;
LIST_INSERT_HEAD(&towp->nfsow_open, op, nfso_list);
nfsstatsv1.cllocalopens--;
nfsstatsv1.clopens++;
}
} else {
/* Just add the openowner to the client list */
LIST_REMOVE(owp, nfsow_list);
owp->nfsow_clp = clp;
LIST_INSERT_HEAD(&clp->nfsc_owner, owp, nfsow_list);
nfsstatsv1.cllocalopenowners--;
nfsstatsv1.clopenowners++;
nfsstatsv1.cllocalopens--;
nfsstatsv1.clopens++;
}
}
owp = nowp;
}
if (!printed && !LIST_EMPTY(&dp->nfsdl_lock)) {
printed = 1;
printf("nfsv4 expired locks lost\n");
}
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
dp = ndp;
}
if (!TAILQ_EMPTY(&clp->nfsc_deleg))
panic("nfsclexp");
/*
* Now, try and reopen against the server.
*/
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
owp->nfsow_seqid = 0;
LIST_FOREACH_SAFE(op, &owp->nfsow_open, nfso_list, nop) {
ret = nfscl_expireopen(clp, op, nmp, cred, p);
if (ret && !printed) {
printed = 1;
printf("nfsv4 expired locks lost\n");
}
}
if (LIST_EMPTY(&owp->nfsow_open))
nfscl_freeopenowner(owp, 0);
}
}
/*
* This function must be called after the process represented by "own" has
* exited. Must be called with CLSTATE lock held.
*/
static void
nfscl_cleanup_common(struct nfsclclient *clp, u_int8_t *own)
{
struct nfsclowner *owp, *nowp;
struct nfscllockowner *lp, *nlp;
struct nfscldeleg *dp;
/* First, get rid of local locks on delegations. */
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH_SAFE(lp, &dp->nfsdl_lock, nfsl_list, nlp) {
if (!NFSBCMP(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN)) {
if ((lp->nfsl_rwlock.nfslock_lock & NFSV4LOCK_WANTED))
panic("nfscllckw");
nfscl_freelockowner(lp, 1);
}
}
}
owp = LIST_FIRST(&clp->nfsc_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
if (!NFSBCMP(owp->nfsow_owner, own,
NFSV4CL_LOCKNAMELEN)) {
/*
* If there are children that haven't closed the
* file descriptors yet, the opens will still be
* here. For that case, let the renew thread clear
* out the OpenOwner later.
*/
if (LIST_EMPTY(&owp->nfsow_open))
nfscl_freeopenowner(owp, 0);
else
owp->nfsow_defunct = 1;
}
owp = nowp;
}
}
/*
* Find open/lock owners for processes that have exited.
*/
static void
nfscl_cleanupkext(struct nfsclclient *clp, struct nfscllockownerfhhead *lhp)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscllockowner *lp, *nlp;
struct nfscldeleg *dp;
NFSPROCLISTLOCK();
NFSLOCKCLSTATE();
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp) {
if (LIST_EMPTY(&lp->nfsl_lock))
nfscl_emptylockowner(lp, lhp);
}
}
if (nfscl_procdoesntexist(owp->nfsow_owner))
nfscl_cleanup_common(clp, owp->nfsow_owner);
}
/*
* For the single open_owner case, these lock owners need to be
* checked separately to see if their processes still exist, because
* nfscl_procdoesntexist() never returns true for the single
* open_owner, so the loop above never calls nfscl_cleanup_common()
* for it.
*/
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH_SAFE(lp, &dp->nfsdl_lock, nfsl_list, nlp) {
if (nfscl_procdoesntexist(lp->nfsl_owner))
nfscl_cleanup_common(clp, lp->nfsl_owner);
}
}
NFSUNLOCKCLSTATE();
NFSPROCLISTUNLOCK();
}
/*
* Take the empty lock owner and move it to the local lhp list if the
* associated process no longer exists.
*/
static void
nfscl_emptylockowner(struct nfscllockowner *lp,
struct nfscllockownerfhhead *lhp)
{
struct nfscllockownerfh *lfhp, *mylfhp;
struct nfscllockowner *nlp;
int fnd_it;
/* If not a Posix lock owner, just return. */
if ((lp->nfsl_lockflags & F_POSIX) == 0)
return;
fnd_it = 0;
mylfhp = NULL;
/*
* First, search to see if this lock owner is already in the list.
* If it is, then the associated process no longer exists.
*/
SLIST_FOREACH(lfhp, lhp, nfslfh_list) {
if (lfhp->nfslfh_len == lp->nfsl_open->nfso_fhlen &&
!NFSBCMP(lfhp->nfslfh_fh, lp->nfsl_open->nfso_fh,
lfhp->nfslfh_len))
mylfhp = lfhp;
LIST_FOREACH(nlp, &lfhp->nfslfh_lock, nfsl_list)
if (!NFSBCMP(nlp->nfsl_owner, lp->nfsl_owner,
NFSV4CL_LOCKNAMELEN))
fnd_it = 1;
}
/* If not found, check if process still exists. */
if (fnd_it == 0 && nfscl_procdoesntexist(lp->nfsl_owner) == 0)
return;
/* Move the lock owner over to the local list. */
if (mylfhp == NULL) {
mylfhp = malloc(sizeof(struct nfscllockownerfh), M_TEMP,
M_NOWAIT);
if (mylfhp == NULL)
return;
mylfhp->nfslfh_len = lp->nfsl_open->nfso_fhlen;
NFSBCOPY(lp->nfsl_open->nfso_fh, mylfhp->nfslfh_fh,
mylfhp->nfslfh_len);
LIST_INIT(&mylfhp->nfslfh_lock);
SLIST_INSERT_HEAD(lhp, mylfhp, nfslfh_list);
}
LIST_REMOVE(lp, nfsl_list);
LIST_INSERT_HEAD(&mylfhp->nfslfh_lock, lp, nfsl_list);
}
static int fake_global; /* Used to force visibility of MNTK_UNMOUNTF */
/*
* Called from nfs umount to free up the clientid.
*/
APPLESTATIC void
nfscl_umount(struct nfsmount *nmp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct ucred *cred;
int igotlock;
/*
* For the case that matters, this is the thread that set
* MNTK_UNMOUNTF, so it will see it set. The code that follows is
* done to ensure that any thread executing nfscl_getcl() after
* this time, will see MNTK_UNMOUNTF set. nfscl_getcl() uses the
* mutex for NFSLOCKCLSTATE(), so it is the mutex "m" in the following
* explanation, courtesy of Alan Cox.
* What follows is a snippet from Alan Cox's email at:
* http://docs.FreeBSD.org/cgi/
* mid.cgi?BANLkTikR3d65zPHo9==08ZfJ2vmqZucEvw
*
* 1. Set MNTK_UNMOUNTF
* 2. Acquire a standard FreeBSD mutex "m".
* 3. Update some data structures.
* 4. Release mutex "m".
*
* Then, other threads that acquire "m" after step 4 has occurred will
* see MNTK_UNMOUNTF as set. But, other threads that beat thread X to
* step 2 may or may not see MNTK_UNMOUNTF as set.
*/
NFSLOCKCLSTATE();
if ((nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) != 0) {
fake_global++;
NFSUNLOCKCLSTATE();
NFSLOCKCLSTATE();
}
clp = nmp->nm_clp;
if (clp != NULL) {
if ((clp->nfsc_flags & NFSCLFLAGS_INITED) == 0)
panic("nfscl umount");
/*
* First, handshake with the nfscl renew thread, to terminate
* it.
*/
clp->nfsc_flags |= NFSCLFLAGS_UMOUNT;
while (clp->nfsc_flags & NFSCLFLAGS_HASTHREAD)
(void)mtx_sleep(clp, NFSCLSTATEMUTEXPTR, PWAIT,
"nfsclumnt", hz);
/*
* Now, get the exclusive lock on the client state, so
* that no uses of the state are still in progress.
*/
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKCLSTATE();
/*
* Free up all the state. It will expire on the server, but
* maybe we should do a SetClientId/SetClientIdConfirm so
* the server throws it away?
*/
LIST_REMOVE(clp, nfsc_list);
nfscl_delegreturnall(clp, p);
cred = newnfs_getcred();
if (NFSHASNFSV4N(nmp)) {
(void)nfsrpc_destroysession(nmp, clp, cred, p);
(void)nfsrpc_destroyclient(nmp, clp, cred, p);
} else
(void)nfsrpc_setclient(nmp, clp, 0, cred, p);
nfscl_cleanclient(clp);
nmp->nm_clp = NULL;
NFSFREECRED(cred);
free(clp, M_NFSCLCLIENT);
} else
NFSUNLOCKCLSTATE();
}
/*
* This function is called when a server replies with NFSERR_STALECLIENTID
* NFSERR_STALESTATEID or NFSERR_BADSESSION. It traverses the clientid lists,
* doing Opens and Locks with reclaim. If these fail, it deletes the
* corresponding state.
*/
static void
nfscl_recover(struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op, *nop;
struct nfscllockowner *lp, *nlp;
struct nfscllock *lop, *nlop;
struct nfscldeleg *dp, *ndp, *tdp;
struct nfsmount *nmp;
struct ucred *tcred;
struct nfsclopenhead extra_open;
struct nfscldeleghead extra_deleg;
struct nfsreq *rep;
u_int64_t len;
u_int32_t delegtype = NFSV4OPEN_DELEGATEWRITE, mode;
int i, igotlock = 0, error, trycnt, firstlock;
struct nfscllayout *lyp, *nlyp;
/*
* First, lock the client structure, so everyone else will
* block when trying to use state.
*/
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECVRINPROG;
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKCLSTATE();
nmp = clp->nfsc_nmp;
if (nmp == NULL)
panic("nfscl recover");
/*
* For now, just get rid of all layouts. There may be a need
* to do LayoutCommit Ops with reclaim == true later.
*/
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp)
nfscl_freelayout(lyp);
TAILQ_INIT(&clp->nfsc_layout);
for (i = 0; i < NFSCLLAYOUTHASHSIZE; i++)
LIST_INIT(&clp->nfsc_layouthash[i]);
trycnt = 5;
do {
error = nfsrpc_setclient(nmp, clp, 1, cred, p);
} while ((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trycnt > 0);
if (error) {
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~(NFSCLFLAGS_RECOVER |
NFSCLFLAGS_RECVRINPROG);
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return;
}
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
/*
* Mark requests already queued on the server, so that they don't
* initiate another recovery cycle. Any requests already in the
* queue that handle state information will have the old stale
* clientid/stateid and will get a NFSERR_STALESTATEID,
* NFSERR_STALECLIENTID or NFSERR_BADSESSION reply from the server.
* This will be translated to NFSERR_STALEDONTRECOVER when
* R_DONTRECOVER is set.
*/
NFSLOCKREQ();
TAILQ_FOREACH(rep, &nfsd_reqq, r_chain) {
if (rep->r_nmp == nmp)
rep->r_flags |= R_DONTRECOVER;
}
NFSUNLOCKREQ();
/*
* Now, mark all delegations "need reclaim".
*/
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list)
dp->nfsdl_flags |= NFSCLDL_NEEDRECLAIM;
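/*
* extra_open and extra_deleg collect any Opens and Delegations
* acquired during reclaim that turn out not to be needed; they are
* closed/returned once the reclaims are done.
*/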
TAILQ_INIT(&extra_deleg);
LIST_INIT(&extra_open);
/*
* Now traverse the state lists, doing Open and Lock Reclaims.
*/
tcred = newnfs_getcred();
owp = LIST_FIRST(&clp->nfsc_owner);
while (owp != NULL) {
nowp = LIST_NEXT(owp, nfsow_list);
owp->nfsow_seqid = 0;
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL) {
nop = LIST_NEXT(op, nfso_list);
if (error != NFSERR_NOGRACE && error != NFSERR_BADSESSION) {
/* Search for a delegation to reclaim with the open */
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if (!(dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM))
continue;
if ((dp->nfsdl_flags & NFSCLDL_WRITE)) {
mode = NFSV4OPEN_ACCESSWRITE;
delegtype = NFSV4OPEN_DELEGATEWRITE;
} else {
mode = NFSV4OPEN_ACCESSREAD;
delegtype = NFSV4OPEN_DELEGATEREAD;
}
if ((op->nfso_mode & mode) == mode &&
op->nfso_fhlen == dp->nfsdl_fhlen &&
!NFSBCMP(op->nfso_fh, dp->nfsdl_fh, op->nfso_fhlen))
break;
}
ndp = dp;
if (dp == NULL)
delegtype = NFSV4OPEN_DELEGATENONE;
newnfs_copycred(&op->nfso_cred, tcred);
error = nfscl_tryopen(nmp, NULL, op->nfso_fh,
op->nfso_fhlen, op->nfso_fh, op->nfso_fhlen,
op->nfso_mode, op, NULL, 0, &ndp, 1, delegtype,
tcred, p);
if (!error) {
/* Handle any replied delegation */
if (ndp != NULL && ((ndp->nfsdl_flags & NFSCLDL_WRITE)
|| NFSMNT_RDONLY(nmp->nm_mountp))) {
if ((ndp->nfsdl_flags & NFSCLDL_WRITE))
mode = NFSV4OPEN_ACCESSWRITE;
else
mode = NFSV4OPEN_ACCESSREAD;
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if (!(dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM))
continue;
if ((op->nfso_mode & mode) == mode &&
op->nfso_fhlen == dp->nfsdl_fhlen &&
!NFSBCMP(op->nfso_fh, dp->nfsdl_fh,
op->nfso_fhlen)) {
dp->nfsdl_stateid = ndp->nfsdl_stateid;
dp->nfsdl_sizelimit = ndp->nfsdl_sizelimit;
dp->nfsdl_ace = ndp->nfsdl_ace;
dp->nfsdl_change = ndp->nfsdl_change;
dp->nfsdl_flags &= ~NFSCLDL_NEEDRECLAIM;
if ((ndp->nfsdl_flags & NFSCLDL_RECALL))
dp->nfsdl_flags |= NFSCLDL_RECALL;
FREE((caddr_t)ndp, M_NFSCLDELEG);
ndp = NULL;
break;
}
}
}
if (ndp != NULL)
TAILQ_INSERT_HEAD(&extra_deleg, ndp, nfsdl_list);
/* and reclaim all byte range locks */
lp = LIST_FIRST(&op->nfso_lock);
while (lp != NULL) {
nlp = LIST_NEXT(lp, nfsl_list);
lp->nfsl_seqid = 0;
firstlock = 1;
lop = LIST_FIRST(&lp->nfsl_lock);
while (lop != NULL) {
nlop = LIST_NEXT(lop, nfslo_list);
if (lop->nfslo_end == NFS64BITSSET)
len = NFS64BITSSET;
else
len = lop->nfslo_end - lop->nfslo_first;
error = nfscl_trylock(nmp, NULL,
op->nfso_fh, op->nfso_fhlen, lp,
firstlock, 1, lop->nfslo_first, len,
lop->nfslo_type, tcred, p);
if (error != 0)
nfscl_freelock(lop, 0);
else
firstlock = 0;
lop = nlop;
}
/* If no locks, but a lockowner, just delete it. */
if (LIST_EMPTY(&lp->nfsl_lock))
nfscl_freelockowner(lp, 0);
lp = nlp;
}
}
}
if (error != 0 && error != NFSERR_BADSESSION)
nfscl_freeopen(op, 0);
op = nop;
}
owp = nowp;
}
/*
* Now, try to get any delegations not yet reclaimed by cobbling
* together an appropriate open.
*/
nowp = NULL;
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
if ((dp->nfsdl_flags & NFSCLDL_NEEDRECLAIM)) {
if (nowp == NULL) {
MALLOC(nowp, struct nfsclowner *,
sizeof (struct nfsclowner), M_NFSCLOWNER, M_WAITOK);
/*
* The name must be as long as the largest possible
* NFSV4CL_LOCKNAMELEN (12 for now).
*/
NFSBCOPY("RECLAIMDELEG", nowp->nfsow_owner,
NFSV4CL_LOCKNAMELEN);
LIST_INIT(&nowp->nfsow_open);
nowp->nfsow_clp = clp;
nowp->nfsow_seqid = 0;
nowp->nfsow_defunct = 0;
nfscl_lockinit(&nowp->nfsow_rwlock);
}
nop = NULL;
if (error != NFSERR_NOGRACE && error != NFSERR_BADSESSION) {
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
dp->nfsdl_fhlen - 1, M_NFSCLOPEN, M_WAITOK);
nop->nfso_own = nowp;
if ((dp->nfsdl_flags & NFSCLDL_WRITE)) {
nop->nfso_mode = NFSV4OPEN_ACCESSWRITE;
delegtype = NFSV4OPEN_DELEGATEWRITE;
} else {
nop->nfso_mode = NFSV4OPEN_ACCESSREAD;
delegtype = NFSV4OPEN_DELEGATEREAD;
}
nop->nfso_opencnt = 0;
nop->nfso_posixlock = 1;
nop->nfso_fhlen = dp->nfsdl_fhlen;
NFSBCOPY(dp->nfsdl_fh, nop->nfso_fh, dp->nfsdl_fhlen);
LIST_INIT(&nop->nfso_lock);
nop->nfso_stateid.seqid = 0;
nop->nfso_stateid.other[0] = 0;
nop->nfso_stateid.other[1] = 0;
nop->nfso_stateid.other[2] = 0;
newnfs_copycred(&dp->nfsdl_cred, tcred);
newnfs_copyincred(tcred, &nop->nfso_cred);
tdp = NULL;
error = nfscl_tryopen(nmp, NULL, nop->nfso_fh,
nop->nfso_fhlen, nop->nfso_fh, nop->nfso_fhlen,
nop->nfso_mode, nop, NULL, 0, &tdp, 1,
delegtype, tcred, p);
if (tdp != NULL) {
if ((tdp->nfsdl_flags & NFSCLDL_WRITE))
mode = NFSV4OPEN_ACCESSWRITE;
else
mode = NFSV4OPEN_ACCESSREAD;
if ((nop->nfso_mode & mode) == mode &&
nop->nfso_fhlen == tdp->nfsdl_fhlen &&
!NFSBCMP(nop->nfso_fh, tdp->nfsdl_fh,
nop->nfso_fhlen)) {
dp->nfsdl_stateid = tdp->nfsdl_stateid;
dp->nfsdl_sizelimit = tdp->nfsdl_sizelimit;
dp->nfsdl_ace = tdp->nfsdl_ace;
dp->nfsdl_change = tdp->nfsdl_change;
dp->nfsdl_flags &= ~NFSCLDL_NEEDRECLAIM;
if ((tdp->nfsdl_flags & NFSCLDL_RECALL))
dp->nfsdl_flags |= NFSCLDL_RECALL;
FREE((caddr_t)tdp, M_NFSCLDELEG);
} else {
TAILQ_INSERT_HEAD(&extra_deleg, tdp, nfsdl_list);
}
}
}
if (error) {
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
/*
* Couldn't reclaim it, so throw the state
* away. Ouch!!
*/
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
} else {
LIST_INSERT_HEAD(&extra_open, nop, nfso_list);
}
}
dp = ndp;
}
/*
* Now, get rid of extra Opens and Delegations.
*/
LIST_FOREACH_SAFE(op, &extra_open, nfso_list, nop) {
do {
newnfs_copycred(&op->nfso_cred, tcred);
error = nfscl_tryclose(op, tcred, nmp, p);
if (error == NFSERR_GRACE)
(void) nfs_catnap(PZERO, error, "nfsexcls");
} while (error == NFSERR_GRACE);
LIST_REMOVE(op, nfso_list);
FREE((caddr_t)op, M_NFSCLOPEN);
}
if (nowp != NULL)
FREE((caddr_t)nowp, M_NFSCLOWNER);
TAILQ_FOREACH_SAFE(dp, &extra_deleg, nfsdl_list, ndp) {
do {
newnfs_copycred(&dp->nfsdl_cred, tcred);
error = nfscl_trydelegreturn(dp, tcred, nmp, p);
if (error == NFSERR_GRACE)
(void) nfs_catnap(PZERO, error, "nfsexdlg");
} while (error == NFSERR_GRACE);
TAILQ_REMOVE(&extra_deleg, dp, nfsdl_list);
FREE((caddr_t)dp, M_NFSCLDELEG);
}
/* For NFSv4.1 or later, do a RECLAIM_COMPLETE. */
if (NFSHASNFSV4N(nmp))
(void)nfsrpc_reclaimcomplete(nmp, cred, p);
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECVRINPROG;
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
NFSFREECRED(tcred);
}
/*
* This function is called when a server replies with NFSERR_EXPIRED.
* It deletes all state for the client and does a fresh SetClientId/confirm.
* XXX Someday it should post a signal to the process(es) that hold the
* state, so they know that lock state has been lost.
*/
APPLESTATIC int
nfscl_hasexpired(struct nfsclclient *clp, u_int32_t clidrev, NFSPROC_T *p)
{
struct nfsmount *nmp;
struct ucred *cred;
int igotlock = 0, error, trycnt;
/*
* If the clientid has gone away or a new SetClientid has already
* been done, just return ok.
*/
if (clp == NULL || clidrev != clp->nfsc_clientidrev)
return (0);
/*
* First, lock the client structure, so everyone else will
* block when trying to use state. Also, use NFSCLFLAGS_EXPIREIT so
* that only one thread does the work.
*/
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_EXPIREIT;
do {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1, NULL,
NFSCLSTATEMUTEXPTR, NULL);
} while (!igotlock && (clp->nfsc_flags & NFSCLFLAGS_EXPIREIT));
if ((clp->nfsc_flags & NFSCLFLAGS_EXPIREIT) == 0) {
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (0);
}
clp->nfsc_flags |= NFSCLFLAGS_RECVRINPROG;
NFSUNLOCKCLSTATE();
nmp = clp->nfsc_nmp;
if (nmp == NULL)
panic("nfscl expired");
cred = newnfs_getcred();
trycnt = 5;
do {
error = nfsrpc_setclient(nmp, clp, 0, cred, p);
} while ((error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION ||
error == NFSERR_STALEDONTRECOVER) && --trycnt > 0);
if (error) {
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
} else {
/*
* Expire the state for the client.
*/
nfscl_expireclient(clp, nmp, cred, p);
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_HASCLIENTID;
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
}
clp->nfsc_flags &= ~(NFSCLFLAGS_EXPIREIT | NFSCLFLAGS_RECVRINPROG);
wakeup(&clp->nfsc_flags);
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
NFSFREECRED(cred);
return (error);
}
/*
* This function inserts a lock in the list after insert_lop.
*/
static void
nfscl_insertlock(struct nfscllockowner *lp, struct nfscllock *new_lop,
struct nfscllock *insert_lop, int local)
{
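/*
* insert_lop may be the lock owner itself (cast to a lock pointer),
* which means the new lock goes at the head of the list.
*/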
if ((struct nfscllockowner *)insert_lop == lp)
LIST_INSERT_HEAD(&lp->nfsl_lock, new_lop, nfslo_list);
else
LIST_INSERT_AFTER(insert_lop, new_lop, nfslo_list);
if (local)
nfsstatsv1.cllocallocks++;
else
nfsstatsv1.cllocks++;
}
/*
* This function updates the locking for a lock owner and given file. It
* maintains a list of lock ranges ordered on increasing file offset that
* are NFSCLLOCK_READ or NFSCLLOCK_WRITE and non-overlapping (aka POSIX style).
* It always adds new_lop to the list and sometimes uses the one pointed
* at by other_lopp.
* Returns 1 if the locks were modified, 0 otherwise.
*/
static int
nfscl_updatelock(struct nfscllockowner *lp, struct nfscllock **new_lopp,
struct nfscllock **other_lopp, int local)
{
struct nfscllock *new_lop = *new_lopp;
struct nfscllock *lop, *tlop, *ilop;
struct nfscllock *other_lop;
int unlock = 0, modified = 0;
u_int64_t tmp;
/*
* Work down the list until the lock is merged.
*/
if (new_lop->nfslo_type == F_UNLCK)
unlock = 1;
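/*
* ilop tracks the insertion point. It starts as the lock owner itself
* (cast), which nfscl_insertlock() treats as "insert at the head".
*/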
ilop = (struct nfscllock *)lp;
lop = LIST_FIRST(&lp->nfsl_lock);
while (lop != NULL) {
/*
* Only check locks for this file that aren't before the start of
* the new lock's range.
*/
if (lop->nfslo_end >= new_lop->nfslo_first) {
if (new_lop->nfslo_end < lop->nfslo_first) {
/*
* If the new lock ends before the start of the
* current lock's range, no merge, just insert
* the new lock.
*/
break;
}
if (new_lop->nfslo_type == lop->nfslo_type ||
(new_lop->nfslo_first <= lop->nfslo_first &&
new_lop->nfslo_end >= lop->nfslo_end)) {
/*
* This lock can be absorbed by the new lock/unlock.
* This happens when it covers the entire range
* of the old lock or is contiguous
* with the old lock and is of the same type or an
* unlock.
*/
if (new_lop->nfslo_type != lop->nfslo_type ||
new_lop->nfslo_first != lop->nfslo_first ||
new_lop->nfslo_end != lop->nfslo_end)
modified = 1;
if (lop->nfslo_first < new_lop->nfslo_first)
new_lop->nfslo_first = lop->nfslo_first;
if (lop->nfslo_end > new_lop->nfslo_end)
new_lop->nfslo_end = lop->nfslo_end;
tlop = lop;
lop = LIST_NEXT(lop, nfslo_list);
nfscl_freelock(tlop, local);
continue;
}
/*
* All these cases are for contiguous locks that are not the
* same type, so they can't be merged.
*/
if (new_lop->nfslo_first <= lop->nfslo_first) {
/*
* This case is where the new lock overlaps with the
* first part of the old lock. Move the start of the
* old lock to just past the end of the new lock. The
* new lock will be inserted in front of the old, since
* ilop hasn't been updated. (We are done now.)
*/
if (lop->nfslo_first != new_lop->nfslo_end) {
lop->nfslo_first = new_lop->nfslo_end;
modified = 1;
}
break;
}
if (new_lop->nfslo_end >= lop->nfslo_end) {
/*
* This case is where the new lock overlaps with the
* end of the old lock's range. Move the old lock's
* end to just before the new lock's first and insert
* the new lock after the old lock.
* Might not be done yet, since the new lock could
* overlap further locks with higher ranges.
*/
if (lop->nfslo_end != new_lop->nfslo_first) {
lop->nfslo_end = new_lop->nfslo_first;
modified = 1;
}
ilop = lop;
lop = LIST_NEXT(lop, nfslo_list);
continue;
}
/*
* The final case is where the new lock's range is in the
* middle of the current lock's and splits the current lock
* up. Use *other_lopp to handle the second part of the
* split old lock range. (We are done now.)
* For unlock, we use new_lop as other_lop and tmp, since
* other_lop and new_lop are the same for this case.
* We noted the unlock case above, so we don't need
* new_lop->nfslo_type any longer.
*/
tmp = new_lop->nfslo_first;
if (unlock) {
other_lop = new_lop;
*new_lopp = NULL;
} else {
other_lop = *other_lopp;
*other_lopp = NULL;
}
other_lop->nfslo_first = new_lop->nfslo_end;
other_lop->nfslo_end = lop->nfslo_end;
other_lop->nfslo_type = lop->nfslo_type;
lop->nfslo_end = tmp;
nfscl_insertlock(lp, other_lop, lop, local);
ilop = lop;
modified = 1;
break;
}
ilop = lop;
lop = LIST_NEXT(lop, nfslo_list);
if (lop == NULL)
break;
}
/*
* Insert the new lock in the list at the appropriate place.
*/
if (!unlock) {
nfscl_insertlock(lp, new_lop, ilop, local);
*new_lopp = NULL;
modified = 1;
}
return (modified);
}
/*
* This function must be run as a kernel thread.
* It does Renew Ops and recovery, when required.
*/
APPLESTATIC void
nfscl_renewthread(struct nfsclclient *clp, NFSPROC_T *p)
{
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscllockowner *lp, *nlp;
struct nfscldeleghead dh;
struct nfscldeleg *dp, *ndp;
struct ucred *cred;
u_int32_t clidrev;
int error, cbpathdown, islept, igotlock, ret, clearok;
uint32_t recover_done_time = 0;
time_t mytime;
static time_t prevsec = 0;
struct nfscllockownerfh *lfhp, *nlfhp;
struct nfscllockownerfhhead lfh;
struct nfscllayout *lyp, *nlyp;
struct nfscldevinfo *dip, *ndip;
struct nfscllayouthead rlh;
struct nfsclrecalllayout *recallp;
struct nfsclds *dsp;
cred = newnfs_getcred();
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_HASTHREAD;
NFSUNLOCKCLSTATE();
for(;;) {
newnfs_setroot(cred);
cbpathdown = 0;
if (clp->nfsc_flags & NFSCLFLAGS_RECOVER) {
/*
* Only allow one recovery within 1/2 of the lease
* duration (nfsc_renew).
*/
if (recover_done_time < NFSD_MONOSEC) {
recover_done_time = NFSD_MONOSEC +
clp->nfsc_renew;
NFSCL_DEBUG(1, "Doing recovery..\n");
nfscl_recover(clp, cred, p);
} else {
NFSCL_DEBUG(1, "Clear Recovery dt=%u ms=%jd\n",
recover_done_time, (intmax_t)NFSD_MONOSEC);
NFSLOCKCLSTATE();
clp->nfsc_flags &= ~NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
}
}
if (clp->nfsc_expire <= NFSD_MONOSEC &&
(clp->nfsc_flags & NFSCLFLAGS_HASCLIENTID)) {
clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew;
clidrev = clp->nfsc_clientidrev;
error = nfsrpc_renew(clp, NULL, cred, p);
if (error == NFSERR_CBPATHDOWN)
cbpathdown = 1;
else if (error == NFSERR_STALECLIENTID ||
error == NFSERR_BADSESSION) {
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
} else if (error == NFSERR_EXPIRED)
(void) nfscl_hasexpired(clp, clidrev, p);
}
checkdsrenew:
if (NFSHASNFSV4N(clp->nfsc_nmp)) {
/* Do renews for any DS sessions. */
NFSLOCKMNT(clp->nfsc_nmp);
/* Skip first entry, since the MDS is handled above. */
dsp = TAILQ_FIRST(&clp->nfsc_nmp->nm_sess);
if (dsp != NULL)
dsp = TAILQ_NEXT(dsp, nfsclds_list);
while (dsp != NULL) {
if (dsp->nfsclds_expire <= NFSD_MONOSEC &&
dsp->nfsclds_sess.nfsess_defunct == 0) {
dsp->nfsclds_expire = NFSD_MONOSEC +
clp->nfsc_renew;
NFSUNLOCKMNT(clp->nfsc_nmp);
(void)nfsrpc_renew(clp, dsp, cred, p);
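/*
* The mount lock was dropped for the RPC, so restart
* the scan of the session list.
*/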
goto checkdsrenew;
}
dsp = TAILQ_NEXT(dsp, nfsclds_list);
}
NFSUNLOCKMNT(clp->nfsc_nmp);
}
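/*
* dh collects delegations that are recalled or cleaned out below, so
* they can be DelegReturn'd after the state lock is dropped.
*/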
TAILQ_INIT(&dh);
NFSLOCKCLSTATE();
if (cbpathdown)
/* It's a Total Recall! */
nfscl_totalrecall(clp);
/*
* Now, handle defunct owners.
*/
LIST_FOREACH_SAFE(owp, &clp->nfsc_owner, nfsow_list, nowp) {
if (LIST_EMPTY(&owp->nfsow_open)) {
if (owp->nfsow_defunct != 0)
nfscl_freeopenowner(owp, 0);
}
}
/*
* Do the recall on any delegations. To avoid trouble, always
* come back up here after having slept.
*/
igotlock = 0;
tryagain:
dp = TAILQ_FIRST(&clp->nfsc_deleg);
while (dp != NULL) {
ndp = TAILQ_NEXT(dp, nfsdl_list);
if ((dp->nfsdl_flags & NFSCLDL_RECALL)) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |=
NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld",
NULL);
goto tryagain;
}
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
goto tryagain;
}
NFSUNLOCKCLSTATE();
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_recalldeleg(clp, clp->nfsc_nmp, dp,
NULL, cred, p, 1);
if (!ret) {
nfscl_cleandeleg(dp);
TAILQ_REMOVE(&clp->nfsc_deleg, dp,
nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list);
nfscl_delegcnt--;
nfsstatsv1.cldelegates--;
}
NFSLOCKCLSTATE();
}
dp = ndp;
}
/*
* Clear out old delegations, if we are above the high water
* mark. Only clear out ones with no state related to them.
* The tailq list is in LRU order.
*/
dp = TAILQ_LAST(&clp->nfsc_deleg, nfscldeleghead);
while (nfscl_delegcnt > nfscl_deleghighwater && dp != NULL) {
ndp = TAILQ_PREV(dp, nfscldeleghead, nfsdl_list);
if (dp->nfsdl_rwlock.nfslock_usecnt == 0 &&
dp->nfsdl_rwlock.nfslock_lock == 0 &&
dp->nfsdl_timestamp < NFSD_MONOSEC &&
(dp->nfsdl_flags & (NFSCLDL_RECALL | NFSCLDL_ZAPPED |
NFSCLDL_NEEDRECLAIM | NFSCLDL_DELEGRET)) == 0) {
clearok = 1;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
clearok = 0;
break;
}
}
if (clearok) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
clearok = 0;
break;
}
}
}
if (clearok) {
TAILQ_REMOVE(&clp->nfsc_deleg, dp, nfsdl_list);
LIST_REMOVE(dp, nfsdl_hash);
TAILQ_INSERT_HEAD(&dh, dp, nfsdl_list);
nfscl_delegcnt--;
nfsstatsv1.cldelegates--;
}
}
dp = ndp;
}
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
/*
* Do the recall on any layouts. To avoid trouble, always
* come back up here after having slept.
*/
TAILQ_INIT(&rlh);
tryagain2:
TAILQ_FOREACH_SAFE(lyp, &clp->nfsc_layout, nfsly_list, nlyp) {
if ((lyp->nfsly_flags & NFSLY_RECALL) != 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (lyp->nfsly_lock.nfslock_usecnt > 0 ||
(lyp->nfsly_lock.nfslock_lock &
NFSV4LOCK_LOCK) != 0) {
lyp->nfsly_lock.nfslock_lock |=
NFSV4LOCK_WANTED;
(void)nfsmsleep(&lyp->nfsly_lock,
NFSCLSTATEMUTEXPTR, PZERO, "nfslyp",
NULL);
goto tryagain2;
}
/* Move the layout to the recall list. */
TAILQ_REMOVE(&clp->nfsc_layout, lyp,
nfsly_list);
LIST_REMOVE(lyp, nfsly_hash);
TAILQ_INSERT_HEAD(&rlh, lyp, nfsly_list);
/* Handle any layout commits. */
if (!NFSHASNOLAYOUTCOMMIT(clp->nfsc_nmp) &&
(lyp->nfsly_flags & NFSLY_WRITTEN) != 0) {
lyp->nfsly_flags &= ~NFSLY_WRITTEN;
NFSUNLOCKCLSTATE();
NFSCL_DEBUG(3, "do layoutcommit\n");
nfscl_dolayoutcommit(clp->nfsc_nmp, lyp,
cred, p);
NFSLOCKCLSTATE();
goto tryagain2;
}
}
}
/* Now, look for stale layouts. */
lyp = TAILQ_LAST(&clp->nfsc_layout, nfscllayouthead);
while (lyp != NULL) {
nlyp = TAILQ_PREV(lyp, nfscllayouthead, nfsly_list);
if (lyp->nfsly_timestamp < NFSD_MONOSEC &&
(lyp->nfsly_flags & NFSLY_RECALL) == 0 &&
lyp->nfsly_lock.nfslock_usecnt == 0 &&
lyp->nfsly_lock.nfslock_lock == 0) {
NFSCL_DEBUG(4, "ret stale lay=%d\n",
nfscl_layoutcnt);
recallp = malloc(sizeof(*recallp),
M_NFSLAYRECALL, M_NOWAIT);
if (recallp == NULL)
break;
(void)nfscl_layoutrecall(NFSLAYOUTRETURN_FILE,
lyp, NFSLAYOUTIOMODE_ANY, 0, UINT64_MAX,
lyp->nfsly_stateid.seqid, recallp);
}
lyp = nlyp;
}
/*
* Free up any unreferenced device info structures.
*/
LIST_FOREACH_SAFE(dip, &clp->nfsc_devinfo, nfsdi_list, ndip) {
if (dip->nfsdi_layoutrefs == 0 &&
dip->nfsdi_refcnt == 0) {
NFSCL_DEBUG(4, "freeing devinfo\n");
LIST_REMOVE(dip, nfsdi_list);
nfscl_freedevinfo(dip);
}
}
NFSUNLOCKCLSTATE();
/* Do layout return(s), as required. */
TAILQ_FOREACH_SAFE(lyp, &rlh, nfsly_list, nlyp) {
TAILQ_REMOVE(&rlh, lyp, nfsly_list);
NFSCL_DEBUG(4, "ret layout\n");
nfscl_layoutreturn(clp->nfsc_nmp, lyp, cred, p);
nfscl_freelayout(lyp);
}
/*
* Delegreturn any delegations cleaned out or recalled.
*/
TAILQ_FOREACH_SAFE(dp, &dh, nfsdl_list, ndp) {
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p);
TAILQ_REMOVE(&dh, dp, nfsdl_list);
FREE((caddr_t)dp, M_NFSCLDELEG);
}
SLIST_INIT(&lfh);
/*
* Call nfscl_cleanupkext() once per second to check for
* open/lock owners where the process has exited.
*/
mytime = NFSD_MONOSEC;
if (prevsec != mytime) {
prevsec = mytime;
nfscl_cleanupkext(clp, &lfh);
}
/*
* Do a ReleaseLockOwner for all lock owners where the
* associated process no longer exists, as found by
* nfscl_cleanupkext().
*/
newnfs_setroot(cred);
SLIST_FOREACH_SAFE(lfhp, &lfh, nfslfh_list, nlfhp) {
LIST_FOREACH_SAFE(lp, &lfhp->nfslfh_lock, nfsl_list,
nlp) {
(void)nfsrpc_rellockown(clp->nfsc_nmp, lp,
lfhp->nfslfh_fh, lfhp->nfslfh_len, cred,
p);
nfscl_freelockowner(lp, 0);
}
free(lfhp, M_TEMP);
}
SLIST_INIT(&lfh);
NFSLOCKCLSTATE();
if ((clp->nfsc_flags & NFSCLFLAGS_RECOVER) == 0)
(void)mtx_sleep(clp, NFSCLSTATEMUTEXPTR, PWAIT, "nfscl",
hz);
if (clp->nfsc_flags & NFSCLFLAGS_UMOUNT) {
clp->nfsc_flags &= ~NFSCLFLAGS_HASTHREAD;
NFSUNLOCKCLSTATE();
NFSFREECRED(cred);
wakeup((caddr_t)clp);
return;
}
NFSUNLOCKCLSTATE();
}
}
/*
* Initiate state recovery. Called when NFSERR_STALECLIENTID,
* NFSERR_STALESTATEID or NFSERR_BADSESSION is received.
*/
APPLESTATIC void
nfscl_initiate_recovery(struct nfsclclient *clp)
{
if (clp == NULL)
return;
NFSLOCKCLSTATE();
clp->nfsc_flags |= NFSCLFLAGS_RECOVER;
NFSUNLOCKCLSTATE();
wakeup((caddr_t)clp);
}
/*
* Dump out the state stuff for debugging.
*/
APPLESTATIC void
nfscl_dumpstate(struct nfsmount *nmp, int openowner, int opens,
int lockowner, int locks)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscllockowner *lp;
struct nfscllock *lop;
struct nfscldeleg *dp;
clp = nmp->nm_clp;
if (clp == NULL) {
printf("nfscl dumpstate NULL clp\n");
return;
}
NFSLOCKCLSTATE();
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (openowner && !LIST_EMPTY(&owp->nfsow_open))
printf("owner=0x%x 0x%x 0x%x 0x%x seqid=%d\n",
owp->nfsow_owner[0], owp->nfsow_owner[1],
owp->nfsow_owner[2], owp->nfsow_owner[3],
owp->nfsow_seqid);
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (opens)
printf("open st=0x%x 0x%x 0x%x cnt=%d fh12=0x%x\n",
op->nfso_stateid.other[0], op->nfso_stateid.other[1],
op->nfso_stateid.other[2], op->nfso_opencnt,
op->nfso_fh[12]);
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lockowner)
printf("lckown=0x%x 0x%x 0x%x 0x%x seqid=%d st=0x%x 0x%x 0x%x\n",
lp->nfsl_owner[0], lp->nfsl_owner[1],
lp->nfsl_owner[2], lp->nfsl_owner[3],
lp->nfsl_seqid,
lp->nfsl_stateid.other[0], lp->nfsl_stateid.other[1],
lp->nfsl_stateid.other[2]);
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (locks)
#ifdef __FreeBSD__
printf("lck typ=%d fst=%ju end=%ju\n",
lop->nfslo_type, (intmax_t)lop->nfslo_first,
(intmax_t)lop->nfslo_end);
#else
printf("lck typ=%d fst=%qd end=%qd\n",
lop->nfslo_type, lop->nfslo_first,
lop->nfslo_end);
#endif
}
}
}
}
}
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (openowner && !LIST_EMPTY(&owp->nfsow_open))
printf("owner=0x%x 0x%x 0x%x 0x%x seqid=%d\n",
owp->nfsow_owner[0], owp->nfsow_owner[1],
owp->nfsow_owner[2], owp->nfsow_owner[3],
owp->nfsow_seqid);
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (opens)
printf("open st=0x%x 0x%x 0x%x cnt=%d fh12=0x%x\n",
op->nfso_stateid.other[0], op->nfso_stateid.other[1],
op->nfso_stateid.other[2], op->nfso_opencnt,
op->nfso_fh[12]);
LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
if (lockowner)
printf("lckown=0x%x 0x%x 0x%x 0x%x seqid=%d st=0x%x 0x%x 0x%x\n",
lp->nfsl_owner[0], lp->nfsl_owner[1],
lp->nfsl_owner[2], lp->nfsl_owner[3],
lp->nfsl_seqid,
lp->nfsl_stateid.other[0], lp->nfsl_stateid.other[1],
lp->nfsl_stateid.other[2]);
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (locks)
#ifdef __FreeBSD__
printf("lck typ=%d fst=%ju end=%ju\n",
lop->nfslo_type, (intmax_t)lop->nfslo_first,
(intmax_t)lop->nfslo_end);
#else
printf("lck typ=%d fst=%qd end=%qd\n",
lop->nfslo_type, lop->nfslo_first,
lop->nfslo_end);
#endif
}
}
}
}
NFSUNLOCKCLSTATE();
}
/*
* Check for duplicate open owners and opens.
* (Only used as a diagnostic aid.)
*/
APPLESTATIC void
nfscl_dupopen(vnode_t vp, int dupopens)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *owp2;
struct nfsclopen *op, *op2;
struct nfsfh *nfhp;
clp = VFSTONFS(vnode_mount(vp))->nm_clp;
if (clp == NULL) {
printf("nfscl dupopen NULL clp\n");
return;
}
nfhp = VTONFS(vp)->n_fhp;
NFSLOCKCLSTATE();
/*
* First, search for duplicate owners.
* These should never happen!
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (owp != owp2 &&
!NFSBCMP(owp->nfsow_owner, owp2->nfsow_owner,
NFSV4CL_LOCKNAMELEN)) {
NFSUNLOCKCLSTATE();
printf("DUP OWNER\n");
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1, 0, 0);
return;
}
}
}
/*
* Now, search for duplicate stateids.
* These shouldn't happen, either.
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op2, &owp2->nfsow_open, nfso_list) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op != op2 &&
(op->nfso_stateid.other[0] != 0 ||
op->nfso_stateid.other[1] != 0 ||
op->nfso_stateid.other[2] != 0) &&
op->nfso_stateid.other[0] == op2->nfso_stateid.other[0] &&
op->nfso_stateid.other[1] == op2->nfso_stateid.other[1] &&
op->nfso_stateid.other[2] == op2->nfso_stateid.other[2]) {
NFSUNLOCKCLSTATE();
printf("DUP STATEID\n");
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1, 0,
0);
return;
}
}
}
}
}
/*
* Now search for duplicate opens.
* Duplicate opens for the same owner
* should never occur. Other duplicates are
* possible and are checked for if "dupopens"
* is true.
*/
LIST_FOREACH(owp2, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op2, &owp2->nfsow_open, nfso_list) {
if (nfhp->nfh_len == op2->nfso_fhlen &&
!NFSBCMP(nfhp->nfh_fh, op2->nfso_fh, nfhp->nfh_len)) {
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op != op2 && nfhp->nfh_len == op->nfso_fhlen &&
!NFSBCMP(nfhp->nfh_fh, op->nfso_fh, nfhp->nfh_len) &&
(!NFSBCMP(op->nfso_own->nfsow_owner,
op2->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN) ||
dupopens)) {
if (!NFSBCMP(op->nfso_own->nfsow_owner,
op2->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN)) {
NFSUNLOCKCLSTATE();
printf("BADDUP OPEN\n");
} else {
NFSUNLOCKCLSTATE();
printf("DUP OPEN\n");
}
nfscl_dumpstate(VFSTONFS(vnode_mount(vp)), 1, 1,
0, 0);
return;
}
}
}
}
}
}
NFSUNLOCKCLSTATE();
}
/*
* During close, find an open that needs to be dereferenced and
* dereference it. If there are no more opens for this file,
* log a message to that effect.
* Opens aren't actually Close'd until VOP_INACTIVE() is performed
* on the file's vnode.
* This is the safe way, since it is difficult to identify
* which open the close is for and I/O can be performed after the
* close(2) system call when a file is mmap'd.
* If it returns 0 for success, there will be a referenced
* clp returned via clpp.
*/
APPLESTATIC int
nfscl_getclose(vnode_t vp, struct nfsclclient **clpp)
{
struct nfsclclient *clp;
struct nfsclowner *owp;
struct nfsclopen *op;
struct nfscldeleg *dp;
struct nfsfh *nfhp;
int error, notdecr;
error = nfscl_getcl(vnode_mount(vp), NULL, NULL, 1, &clp);
if (error)
return (error);
*clpp = clp;
nfhp = VTONFS(vp)->n_fhp;
notdecr = 1;
NFSLOCKCLSTATE();
/*
* First, look for one under a delegation that was locally issued
* and just decrement the opencnt for it. Since all my Opens against
* the server are DENY_NONE, I don't see a problem with hanging
* onto them. (It is much easier to use one of the extant Opens
* that I already have on the server when a Delegation is recalled
* than to do fresh Opens.) Someday, I might need to rethink this.
*/
dp = nfscl_finddeleg(clp, nfhp->nfh_fh, nfhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
/*
* Since a delegation is for a file, there
* should never be more than one open for
* each openowner.
*/
if (LIST_NEXT(op, nfso_list) != NULL)
panic("nfscdeleg opens");
if (notdecr && op->nfso_opencnt > 0) {
notdecr = 0;
op->nfso_opencnt--;
break;
}
}
}
}
/* Now process the opens against the server. */
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == nfhp->nfh_len &&
!NFSBCMP(op->nfso_fh, nfhp->nfh_fh,
nfhp->nfh_len)) {
/* Found an open, decrement cnt if possible */
if (notdecr && op->nfso_opencnt > 0) {
notdecr = 0;
op->nfso_opencnt--;
}
/*
* There are more opens, so just return.
*/
if (op->nfso_opencnt > 0) {
NFSUNLOCKCLSTATE();
return (0);
}
}
}
}
NFSUNLOCKCLSTATE();
if (notdecr)
printf("nfscl: never fnd open\n");
return (0);
}
APPLESTATIC int
nfscl_doclose(vnode_t vp, struct nfsclclient **clpp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfsclowner *owp, *nowp;
struct nfsclopen *op;
struct nfscldeleg *dp;
struct nfsfh *nfhp;
struct nfsclrecalllayout *recallp;
int error;
error = nfscl_getcl(vnode_mount(vp), NULL, NULL, 1, &clp);
if (error)
return (error);
*clpp = clp;
nfhp = VTONFS(vp)->n_fhp;
recallp = malloc(sizeof(*recallp), M_NFSLAYRECALL, M_WAITOK);
NFSLOCKCLSTATE();
/*
* First get rid of the local Open structures, which should be no
* longer in use.
*/
dp = nfscl_finddeleg(clp, nfhp->nfh_fh, nfhp->nfh_len);
if (dp != NULL) {
LIST_FOREACH_SAFE(owp, &dp->nfsdl_owner, nfsow_list, nowp) {
op = LIST_FIRST(&owp->nfsow_open);
if (op != NULL) {
KASSERT((op->nfso_opencnt == 0),
("nfscl: bad open cnt on deleg"));
nfscl_freeopen(op, 1);
}
nfscl_freeopenowner(owp, 1);
}
}
/* Return any layouts marked return on close. */
nfscl_retoncloselayout(vp, clp, nfhp->nfh_fh, nfhp->nfh_len, &recallp);
/* Now process the opens against the server. */
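/*
* The scan restarts from the beginning after each close, since the
* state lock is dropped for the nfsrpc_doclose() RPC.
*/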
lookformore:
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
op = LIST_FIRST(&owp->nfsow_open);
while (op != NULL) {
if (op->nfso_fhlen == nfhp->nfh_len &&
!NFSBCMP(op->nfso_fh, nfhp->nfh_fh,
nfhp->nfh_len)) {
/* Found an open, close it. */
KASSERT((op->nfso_opencnt == 0),
("nfscl: bad open cnt on server"));
NFSUNLOCKCLSTATE();
nfsrpc_doclose(VFSTONFS(vnode_mount(vp)), op,
p);
NFSLOCKCLSTATE();
goto lookformore;
}
op = LIST_NEXT(op, nfso_list);
}
}
NFSUNLOCKCLSTATE();
/*
* recallp has been set NULL by nfscl_retoncloselayout() if it was
* used by the function, but calling free() with a NULL pointer is ok.
*/
free(recallp, M_NFSLAYRECALL);
return (0);
}
/*
* Return all delegations on this client.
* (Must be called with client sleep lock.)
*/
static void
nfscl_delegreturnall(struct nfsclclient *clp, NFSPROC_T *p)
{
struct nfscldeleg *dp, *ndp;
struct ucred *cred;
cred = newnfs_getcred();
TAILQ_FOREACH_SAFE(dp, &clp->nfsc_deleg, nfsdl_list, ndp) {
nfscl_cleandeleg(dp);
(void) nfscl_trydelegreturn(dp, cred, clp->nfsc_nmp, p);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
NFSFREECRED(cred);
}
/*
* Do a callback RPC.
*/
APPLESTATIC void
nfscl_docb(struct nfsrv_descript *nd, NFSPROC_T *p)
{
int clist, gotseq_ok, i, j, k, op, rcalls;
u_int32_t *tl;
struct nfsclclient *clp;
struct nfscldeleg *dp = NULL;
int numops, taglen = -1, error = 0, trunc;
u_int32_t minorvers = 0, retops = 0, *retopsp = NULL, *repp, cbident;
u_char tag[NFSV4_SMALLSTR + 1], *tagstr;
vnode_t vp = NULL;
struct nfsnode *np;
struct vattr va;
struct nfsfh *nfhp;
mount_t mp;
nfsattrbit_t attrbits, rattrbits;
nfsv4stateid_t stateid;
uint32_t seqid, slotid = 0, highslot, cachethis;
uint8_t sessionid[NFSX_V4SESSIONID];
struct mbuf *rep;
struct nfscllayout *lyp;
uint64_t filesid[2], len, off;
int changed, gotone, laytype, recalltype;
uint32_t iomode;
struct nfsclrecalllayout *recallp = NULL;
struct nfsclsession *tsep;
gotseq_ok = 0;
nfsrvd_rephead(nd);
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
taglen = fxdr_unsigned(int, *tl);
if (taglen < 0) {
error = EBADRPC;
goto nfsmout;
}
if (taglen <= NFSV4_SMALLSTR)
tagstr = tag;
else
tagstr = malloc(taglen + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, tagstr, taglen);
if (error) {
if (taglen > NFSV4_SMALLSTR)
free(tagstr, M_TEMP);
taglen = -1;
goto nfsmout;
}
(void) nfsm_strtom(nd, tag, taglen);
if (taglen > NFSV4_SMALLSTR) {
free(tagstr, M_TEMP);
}
NFSM_BUILD(retopsp, u_int32_t *, NFSX_UNSIGNED);
NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
minorvers = fxdr_unsigned(u_int32_t, *tl++);
if (minorvers != NFSV4_MINORVERSION && minorvers != NFSV41_MINORVERSION)
nd->nd_repstat = NFSERR_MINORVERMISMATCH;
cbident = fxdr_unsigned(u_int32_t, *tl++);
if (nd->nd_repstat)
numops = 0;
else
numops = fxdr_unsigned(int, *tl);
/*
* Loop around doing the sub ops.
*/
for (i = 0; i < numops; i++) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
NFSM_BUILD(repp, u_int32_t *, 2 * NFSX_UNSIGNED);
*repp++ = *tl;
op = fxdr_unsigned(int, *tl);
if (op < NFSV4OP_CBGETATTR ||
(op > NFSV4OP_CBRECALL && minorvers == NFSV4_MINORVERSION) ||
(op > NFSV4OP_CBNOTIFYDEVID &&
minorvers == NFSV41_MINORVERSION)) {
nd->nd_repstat = NFSERR_OPILLEGAL;
*repp = nfscl_errmap(nd, minorvers);
retops++;
break;
}
nd->nd_procnum = op;
if (op < NFSV41_CBNOPS)
nfsstatsv1.cbrpccnt[nd->nd_procnum]++;
switch (op) {
case NFSV4OP_CBGETATTR:
NFSCL_DEBUG(4, "cbgetattr\n");
mp = NULL;
vp = NULL;
error = nfsm_getfh(nd, &nfhp);
if (!error)
error = nfsrv_getattrbits(nd, &attrbits,
NULL, NULL);
if (error == 0 && i == 0 &&
minorvers != NFSV4_MINORVERSION)
error = NFSERR_OPNOTINSESS;
if (!error) {
mp = nfscl_getmnt(minorvers, sessionid, cbident,
&clp);
if (mp == NULL)
error = NFSERR_SERVERFAULT;
}
if (!error) {
error = nfscl_ngetreopen(mp, nfhp->nfh_fh,
nfhp->nfh_len, p, &np);
if (!error)
vp = NFSTOV(np);
}
if (!error) {
NFSZERO_ATTRBIT(&rattrbits);
NFSLOCKCLSTATE();
dp = nfscl_finddeleg(clp, nfhp->nfh_fh,
nfhp->nfh_len);
if (dp != NULL) {
if (NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_SIZE)) {
if (vp != NULL)
va.va_size = np->n_size;
else
va.va_size =
dp->nfsdl_size;
NFSSETBIT_ATTRBIT(&rattrbits,
NFSATTRBIT_SIZE);
}
if (NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_CHANGE)) {
va.va_filerev =
dp->nfsdl_change;
if (vp == NULL ||
(np->n_flag & NDELEGMOD))
va.va_filerev++;
NFSSETBIT_ATTRBIT(&rattrbits,
NFSATTRBIT_CHANGE);
}
} else
error = NFSERR_SERVERFAULT;
NFSUNLOCKCLSTATE();
}
if (vp != NULL)
vrele(vp);
if (mp != NULL)
vfs_unbusy(mp);
if (nfhp != NULL)
FREE((caddr_t)nfhp, M_NFSFH);
if (!error)
(void) nfsv4_fillattr(nd, NULL, NULL, NULL, &va,
NULL, 0, &rattrbits, NULL, p, 0, 0, 0, 0,
(uint64_t)0);
break;
case NFSV4OP_CBRECALL:
NFSCL_DEBUG(4, "cbrecall\n");
NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
NFSX_UNSIGNED);
stateid.seqid = *tl++;
NFSBCOPY((caddr_t)tl, (caddr_t)stateid.other,
NFSX_STATEIDOTHER);
tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
trunc = fxdr_unsigned(int, *tl);
error = nfsm_getfh(nd, &nfhp);
if (error == 0 && i == 0 &&
minorvers != NFSV4_MINORVERSION)
error = NFSERR_OPNOTINSESS;
if (!error) {
NFSLOCKCLSTATE();
if (minorvers == NFSV4_MINORVERSION)
clp = nfscl_getclnt(cbident);
else
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
dp = nfscl_finddeleg(clp, nfhp->nfh_fh,
nfhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags &
NFSCLDL_DELEGRET) == 0) {
dp->nfsdl_flags |=
NFSCLDL_RECALL;
wakeup((caddr_t)clp);
}
} else {
error = NFSERR_SERVERFAULT;
}
NFSUNLOCKCLSTATE();
}
if (nfhp != NULL)
FREE((caddr_t)nfhp, M_NFSFH);
break;
case NFSV4OP_CBLAYOUTRECALL:
NFSCL_DEBUG(4, "cblayrec\n");
nfhp = NULL;
NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED);
laytype = fxdr_unsigned(int, *tl++);
iomode = fxdr_unsigned(uint32_t, *tl++);
if (newnfs_true == *tl++)
changed = 1;
else
changed = 0;
recalltype = fxdr_unsigned(int, *tl);
recallp = malloc(sizeof(*recallp), M_NFSLAYRECALL,
M_WAITOK);
if (laytype != NFSLAYOUT_NFSV4_1_FILES)
error = NFSERR_NOMATCHLAYOUT;
else if (recalltype == NFSLAYOUTRETURN_FILE) {
error = nfsm_getfh(nd, &nfhp);
NFSCL_DEBUG(4, "retfile getfh=%d\n", error);
if (error != 0)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_HYPER +
NFSX_STATEID);
off = fxdr_hyper(tl); tl += 2;
len = fxdr_hyper(tl); tl += 2;
stateid.seqid = fxdr_unsigned(uint32_t, *tl++);
NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER);
if (minorvers == NFSV4_MINORVERSION)
error = NFSERR_NOTSUPP;
else if (i == 0)
error = NFSERR_OPNOTINSESS;
if (error == 0) {
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
NFSCL_DEBUG(4, "cbly clp=%p\n", clp);
if (clp != NULL) {
lyp = nfscl_findlayout(clp,
nfhp->nfh_fh,
nfhp->nfh_len);
NFSCL_DEBUG(4, "cblyp=%p\n",
lyp);
if (lyp != NULL &&
(lyp->nfsly_flags &
NFSLY_FILES) != 0 &&
!NFSBCMP(stateid.other,
lyp->nfsly_stateid.other,
NFSX_STATEIDOTHER)) {
error =
nfscl_layoutrecall(
recalltype,
lyp, iomode, off,
len, stateid.seqid,
recallp);
recallp = NULL;
wakeup(clp);
NFSCL_DEBUG(4,
"aft layrcal=%d\n",
error);
} else
error =
NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
}
free(nfhp, M_NFSFH);
} else if (recalltype == NFSLAYOUTRETURN_FSID) {
NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER);
filesid[0] = fxdr_hyper(tl); tl += 2;
filesid[1] = fxdr_hyper(tl); tl += 2;
gotone = 0;
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
TAILQ_FOREACH(lyp, &clp->nfsc_layout,
nfsly_list) {
if (lyp->nfsly_filesid[0] ==
filesid[0] &&
lyp->nfsly_filesid[1] ==
filesid[1]) {
error =
nfscl_layoutrecall(
recalltype,
lyp, iomode, 0,
UINT64_MAX,
lyp->nfsly_stateid.seqid,
recallp);
recallp = NULL;
gotone = 1;
}
}
if (gotone != 0)
wakeup(clp);
else
error = NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
} else if (recalltype == NFSLAYOUTRETURN_ALL) {
gotone = 0;
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
TAILQ_FOREACH(lyp, &clp->nfsc_layout,
nfsly_list) {
error = nfscl_layoutrecall(
recalltype, lyp, iomode, 0,
UINT64_MAX,
lyp->nfsly_stateid.seqid,
recallp);
recallp = NULL;
gotone = 1;
}
if (gotone != 0)
wakeup(clp);
else
error = NFSERR_NOMATCHLAYOUT;
} else
error = NFSERR_NOMATCHLAYOUT;
NFSUNLOCKCLSTATE();
} else
error = NFSERR_NOMATCHLAYOUT;
if (recallp != NULL) {
free(recallp, M_NFSLAYRECALL);
recallp = NULL;
}
break;
case NFSV4OP_CBSEQUENCE:
NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
5 * NFSX_UNSIGNED);
bcopy(tl, sessionid, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
seqid = fxdr_unsigned(uint32_t, *tl++);
slotid = fxdr_unsigned(uint32_t, *tl++);
highslot = fxdr_unsigned(uint32_t, *tl++);
cachethis = *tl++;
/* Throw away the referring call stuff. */
clist = fxdr_unsigned(int, *tl);
for (j = 0; j < clist; j++) {
NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
NFSX_UNSIGNED);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
rcalls = fxdr_unsigned(int, *tl);
for (k = 0; k < rcalls; k++) {
NFSM_DISSECT(tl, uint32_t *,
2 * NFSX_UNSIGNED);
}
}
NFSLOCKCLSTATE();
if (i == 0) {
clp = nfscl_getclntsess(sessionid);
if (clp == NULL)
error = NFSERR_SERVERFAULT;
} else
error = NFSERR_SEQUENCEPOS;
if (error == 0) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
error = nfsv4_seqsession(seqid, slotid,
highslot, tsep->nfsess_cbslots, &rep,
tsep->nfsess_backslots);
}
NFSUNLOCKCLSTATE();
if (error == 0 || error == NFSERR_REPLYFROMCACHE) {
gotseq_ok = 1;
if (rep != NULL) {
/*
* Handle a reply for a retried
* callback. The reply will be
* re-inserted in the session cache
* by the nfsv4_seqsess_cacherep() call
* after out:
*/
KASSERT(error == NFSERR_REPLYFROMCACHE,
("cbsequence: non-NULL rep"));
NFSCL_DEBUG(4, "Got cbretry\n");
m_freem(nd->nd_mreq);
nd->nd_mreq = rep;
rep = NULL;
goto out;
}
NFSM_BUILD(tl, uint32_t *,
NFSX_V4SESSIONID + 4 * NFSX_UNSIGNED);
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
*tl++ = txdr_unsigned(seqid);
*tl++ = txdr_unsigned(slotid);
*tl++ = txdr_unsigned(NFSV4_CBSLOTS - 1);
*tl = txdr_unsigned(NFSV4_CBSLOTS - 1);
}
break;
default:
if (i == 0 && minorvers == NFSV41_MINORVERSION)
error = NFSERR_OPNOTINSESS;
else {
NFSCL_DEBUG(1, "unsupp callback %d\n", op);
error = NFSERR_NOTSUPP;
}
break;
}
if (error) {
if (error == EBADRPC || error == NFSERR_BADXDR) {
nd->nd_repstat = NFSERR_BADXDR;
} else {
nd->nd_repstat = error;
}
error = 0;
}
retops++;
if (nd->nd_repstat) {
*repp = nfscl_errmap(nd, minorvers);
break;
} else
*repp = 0; /* NFS4_OK */
}
nfsmout:
if (recallp != NULL)
free(recallp, M_NFSLAYRECALL);
if (error) {
if (error == EBADRPC || error == NFSERR_BADXDR)
nd->nd_repstat = NFSERR_BADXDR;
else
printf("nfsv4 comperr1=%d\n", error);
}
if (taglen == -1) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = 0;
} else {
*retopsp = txdr_unsigned(retops);
}
*nd->nd_errp = nfscl_errmap(nd, minorvers);
out:
if (gotseq_ok != 0) {
rep = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
NFSLOCKCLSTATE();
clp = nfscl_getclntsess(sessionid);
if (clp != NULL) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
nfsv4_seqsess_cacherep(slotid, tsep->nfsess_cbslots,
NFSERR_OK, &rep);
NFSUNLOCKCLSTATE();
} else {
NFSUNLOCKCLSTATE();
m_freem(rep);
}
}
}
/*
* Generate the next cbident value. Basically just increment a static value
* and then check that it isn't already in the list, if it has wrapped around.
*/
static u_int32_t
nfscl_nextcbident(void)
{
struct nfsclclient *clp;
int matched;
static u_int32_t nextcbident = 0;
static int haswrapped = 0;
nextcbident++;
if (nextcbident == 0)
haswrapped = 1;
if (haswrapped) {
/*
* Search the clientid list for one already using this cbident.
*/
do {
matched = 0;
NFSLOCKCLSTATE();
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
if (clp->nfsc_cbident == nextcbident) {
matched = 1;
break;
}
}
NFSUNLOCKCLSTATE();
if (matched == 1)
nextcbident++;
} while (matched);
}
return (nextcbident);
}
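/*
 * A small standalone sketch of the wrap-and-skip idea used by
 * nfscl_nextcbident() above, checking candidate values against a plain
 * array instead of walking the client list.  The names (next_ident,
 * id_in_use) and the array are invented for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
id_in_use(uint32_t id, const uint32_t *used, int nused)
{
	int i;

	for (i = 0; i < nused; i++)
		if (used[i] == id)
			return (true);
	return (false);
}

static uint32_t
next_ident(const uint32_t *used, int nused)
{
	static uint32_t next;
	static bool wrapped;

	if (++next == 0)
		wrapped = true;
	if (wrapped) {
		/* Once wrapped, skip any value that is still in use. */
		while (id_in_use(next, used, nused))
			next++;
	}
	return (next);
}

int
main(void)
{
	uint32_t used[] = { 2, 3 };

	printf("%u\n", next_ident(used, 2));	/* prints 1 (no wrap yet) */
	return (0);
}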
/*
* Get the mount point related to a given cbident or session and busy it.
*/
static mount_t
nfscl_getmnt(int minorvers, uint8_t *sessionid, u_int32_t cbident,
struct nfsclclient **clpp)
{
struct nfsclclient *clp;
mount_t mp;
int error;
struct nfsclsession *tsep;
*clpp = NULL;
NFSLOCKCLSTATE();
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
if (minorvers == NFSV4_MINORVERSION) {
if (clp->nfsc_cbident == cbident)
break;
} else if (!NFSBCMP(tsep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID))
break;
}
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (NULL);
}
mp = clp->nfsc_nmp->nm_mountp;
vfs_ref(mp);
NFSUNLOCKCLSTATE();
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error != 0)
return (NULL);
*clpp = clp;
return (mp);
}
/*
* Get the clientid pointer related to a given cbident.
*/
static struct nfsclclient *
nfscl_getclnt(u_int32_t cbident)
{
struct nfsclclient *clp;
LIST_FOREACH(clp, &nfsclhead, nfsc_list)
if (clp->nfsc_cbident == cbident)
break;
return (clp);
}
/*
* Get the clientid pointer related to a given sessionid.
*/
static struct nfsclclient *
nfscl_getclntsess(uint8_t *sessionid)
{
struct nfsclclient *clp;
struct nfsclsession *tsep;
LIST_FOREACH(clp, &nfsclhead, nfsc_list) {
tsep = nfsmnt_mdssession(clp->nfsc_nmp);
if (!NFSBCMP(tsep->nfsess_sessionid, sessionid,
NFSX_V4SESSIONID))
break;
}
return (clp);
}
/*
* Search for a lock conflict locally on the client. A conflict occurs if
* the owners differ, the byte ranges overlap, and at least one of the locks
* is a write lock (or the request is an unlock).
*/
static int
nfscl_localconflict(struct nfsclclient *clp, u_int8_t *fhp, int fhlen,
struct nfscllock *nlop, u_int8_t *own, struct nfscldeleg *dp,
struct nfscllock **lopp)
{
struct nfsclowner *owp;
struct nfsclopen *op;
int ret;
if (dp != NULL) {
ret = nfscl_checkconflict(&dp->nfsdl_lock, nlop, own, lopp);
if (ret)
return (ret);
}
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if (op->nfso_fhlen == fhlen &&
!NFSBCMP(op->nfso_fh, fhp, fhlen)) {
ret = nfscl_checkconflict(&op->nfso_lock, nlop,
own, lopp);
if (ret)
return (ret);
}
}
}
return (0);
}
static int
nfscl_checkconflict(struct nfscllockownerhead *lhp, struct nfscllock *nlop,
u_int8_t *own, struct nfscllock **lopp)
{
struct nfscllockowner *lp;
struct nfscllock *lop;
LIST_FOREACH(lp, lhp, nfsl_list) {
if (NFSBCMP(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN)) {
LIST_FOREACH(lop, &lp->nfsl_lock, nfslo_list) {
if (lop->nfslo_first >= nlop->nfslo_end)
break;
if (lop->nfslo_end <= nlop->nfslo_first)
continue;
if (lop->nfslo_type == F_WRLCK ||
nlop->nfslo_type == F_WRLCK ||
nlop->nfslo_type == F_UNLCK) {
if (lopp != NULL)
*lopp = lop;
return (NFSERR_DENIED);
}
}
}
}
return (0);
}
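/*
 * A minimal userland sketch of the conflict rule implemented by
 * nfscl_localconflict()/nfscl_checkconflict() above: two byte-range locks
 * conflict when the owners differ, the ranges overlap and at least one side
 * is a write lock, or the request is an unlock probe.  The struct and
 * helper names here are invented for illustration.
 */
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

struct simple_lock {
	unsigned long long first;	/* first byte covered */
	unsigned long long end;		/* first byte past the range */
	short type;			/* F_RDLCK, F_WRLCK or F_UNLCK */
	int owner;			/* simplified lock-owner id */
};

static bool
simple_conflict(const struct simple_lock *held, const struct simple_lock *req)
{
	if (held->owner == req->owner)
		return (false);		/* same owner never conflicts */
	if (held->first >= req->end || held->end <= req->first)
		return (false);		/* byte ranges do not overlap */
	return (held->type == F_WRLCK || req->type == F_WRLCK ||
	    req->type == F_UNLCK);
}

int
main(void)
{
	struct simple_lock held = { 0, 100, F_WRLCK, 1 };
	struct simple_lock req = { 50, 60, F_RDLCK, 2 };

	printf("conflict=%d\n", simple_conflict(&held, &req));	/* prints 1 */
	return (0);
}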
/*
* Check for a local conflicting lock.
*/
APPLESTATIC int
nfscl_lockt(vnode_t vp, struct nfsclclient *clp, u_int64_t off,
u_int64_t len, struct flock *fl, NFSPROC_T *p, void *id, int flags)
{
struct nfscllock *lop, nlck;
struct nfscldeleg *dp;
struct nfsnode *np;
u_int8_t own[NFSV4CL_LOCKNAMELEN];
int error;
nlck.nfslo_type = fl->l_type;
nlck.nfslo_first = off;
if (len == NFS64BITSSET) {
nlck.nfslo_end = NFS64BITSSET;
} else {
nlck.nfslo_end = off + len;
if (nlck.nfslo_end <= nlck.nfslo_first)
return (NFSERR_INVAL);
}
np = VTONFS(vp);
nfscl_filllockowner(id, own, flags);
NFSLOCKCLSTATE();
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
error = nfscl_localconflict(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
&nlck, own, dp, &lop);
if (error != 0) {
fl->l_whence = SEEK_SET;
fl->l_start = lop->nfslo_first;
if (lop->nfslo_end == NFS64BITSSET)
fl->l_len = 0;
else
fl->l_len = lop->nfslo_end - lop->nfslo_first;
fl->l_pid = (pid_t)0;
fl->l_type = lop->nfslo_type;
error = -1; /* no RPC required */
} else if (dp != NULL && ((dp->nfsdl_flags & NFSCLDL_WRITE) ||
fl->l_type == F_RDLCK)) {
/*
* The delegation ensures that there isn't a conflicting
* lock on the server, so return -1 to indicate an RPC
* isn't required.
*/
fl->l_type = F_UNLCK;
error = -1;
}
NFSUNLOCKCLSTATE();
return (error);
}
/*
* Handle Recall of a delegation.
* The clp must be exclusively locked when this is called.
*/
static int
nfscl_recalldeleg(struct nfsclclient *clp, struct nfsmount *nmp,
struct nfscldeleg *dp, vnode_t vp, struct ucred *cred, NFSPROC_T *p,
int called_from_renewthread)
{
struct nfsclowner *owp, *lowp, *nowp;
struct nfsclopen *op, *lop;
struct nfscllockowner *lp;
struct nfscllock *lckp;
struct nfsnode *np;
int error = 0, ret, gotvp = 0;
if (vp == NULL) {
/*
* First, get a vnode for the file. This is needed to do RPCs.
*/
ret = nfscl_ngetreopen(nmp->nm_mountp, dp->nfsdl_fh,
dp->nfsdl_fhlen, p, &np);
if (ret) {
/*
* File isn't open, so nothing to move over to the
* server.
*/
return (0);
}
vp = NFSTOV(np);
gotvp = 1;
} else {
np = VTONFS(vp);
}
dp->nfsdl_flags &= ~NFSCLDL_MODTIMESET;
/*
* Ok, if it's a write delegation, flush data to the server, so
* that close/open consistency is retained.
*/
ret = 0;
NFSLOCKNODE(np);
if ((dp->nfsdl_flags & NFSCLDL_WRITE) && (np->n_flag & NMODIFIED)) {
np->n_flag |= NDELEGRECALL;
NFSUNLOCKNODE(np);
ret = ncl_flush(vp, MNT_WAIT, p, 1, called_from_renewthread);
NFSLOCKNODE(np);
np->n_flag &= ~NDELEGRECALL;
}
NFSINVALATTRCACHE(np);
NFSUNLOCKNODE(np);
if (ret == EIO && called_from_renewthread != 0) {
/*
* If the flush failed with EIO for the renew thread,
* return now, so that the dirty buffer will be flushed
* later.
*/
if (gotvp != 0)
vrele(vp);
return (ret);
}
/*
* Now, for each openowner with opens issued locally, move them
* over to state against the server.
*/
LIST_FOREACH(lowp, &dp->nfsdl_owner, nfsow_list) {
lop = LIST_FIRST(&lowp->nfsow_open);
if (lop != NULL) {
if (LIST_NEXT(lop, nfso_list) != NULL)
panic("nfsdlg mult opens");
/*
* Look for the same openowner against the server.
*/
LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) {
if (!NFSBCMP(lowp->nfsow_owner,
owp->nfsow_owner, NFSV4CL_LOCKNAMELEN)) {
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_moveopen(vp, clp, nmp, lop,
owp, dp, cred, p);
if (ret == NFSERR_STALECLIENTID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret) {
nfscl_freeopen(lop, 1);
if (!error)
error = ret;
}
break;
}
}
/*
* If no openowner found, create one and get an open
* for it.
*/
if (owp == NULL) {
MALLOC(nowp, struct nfsclowner *,
sizeof (struct nfsclowner), M_NFSCLOWNER,
M_WAITOK);
nfscl_newopen(clp, NULL, &owp, &nowp, &op,
NULL, lowp->nfsow_owner, dp->nfsdl_fh,
dp->nfsdl_fhlen, NULL, NULL);
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_moveopen(vp, clp, nmp, lop,
owp, dp, cred, p);
if (ret) {
nfscl_freeopenowner(owp, 0);
if (ret == NFSERR_STALECLIENTID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret) {
nfscl_freeopen(lop, 1);
if (!error)
error = ret;
}
}
}
}
}
/*
* Now, get byte range locks for any locks done locally.
*/
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
LIST_FOREACH(lckp, &lp->nfsl_lock, nfslo_list) {
newnfs_copycred(&dp->nfsdl_cred, cred);
ret = nfscl_relock(vp, clp, nmp, lp, lckp, cred, p);
if (ret == NFSERR_STALESTATEID ||
ret == NFSERR_STALEDONTRECOVER ||
ret == NFSERR_STALECLIENTID ||
ret == NFSERR_BADSESSION) {
if (gotvp)
vrele(vp);
return (ret);
}
if (ret && !error)
error = ret;
}
}
if (gotvp)
vrele(vp);
return (error);
}
/*
* Move a locally issued open over to an owner on the state list.
* SIDE EFFECT: If it needs to sleep (do an rpc), it unlocks clstate and
* returns with it unlocked.
*/
static int
nfscl_moveopen(vnode_t vp, struct nfsclclient *clp, struct nfsmount *nmp,
struct nfsclopen *lop, struct nfsclowner *owp, struct nfscldeleg *dp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclopen *op, *nop;
struct nfscldeleg *ndp;
struct nfsnode *np;
int error = 0, newone;
/*
* First, look for an appropriate open. If found, just increment the
* opencnt in it.
*/
LIST_FOREACH(op, &owp->nfsow_open, nfso_list) {
if ((op->nfso_mode & lop->nfso_mode) == lop->nfso_mode &&
op->nfso_fhlen == lop->nfso_fhlen &&
!NFSBCMP(op->nfso_fh, lop->nfso_fh, op->nfso_fhlen)) {
op->nfso_opencnt += lop->nfso_opencnt;
nfscl_freeopen(lop, 1);
return (0);
}
}
/* No appropriate open, so we have to do one against the server. */
np = VTONFS(vp);
MALLOC(nop, struct nfsclopen *, sizeof (struct nfsclopen) +
lop->nfso_fhlen - 1, M_NFSCLOPEN, M_WAITOK);
newone = 0;
nfscl_newopen(clp, NULL, &owp, NULL, &op, &nop, owp->nfsow_owner,
lop->nfso_fh, lop->nfso_fhlen, cred, &newone);
ndp = dp;
error = nfscl_tryopen(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen,
lop->nfso_fh, lop->nfso_fhlen, lop->nfso_mode, op,
NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &ndp, 0, 0, cred, p);
if (error) {
if (newone)
nfscl_freeopen(op, 0);
} else {
op->nfso_mode |= lop->nfso_mode;
op->nfso_opencnt += lop->nfso_opencnt;
nfscl_freeopen(lop, 1);
}
if (nop != NULL)
FREE((caddr_t)nop, M_NFSCLOPEN);
if (ndp != NULL) {
/*
* What should I do with the returned delegation, since the
* delegation is being recalled? For now, just printf a message and
* throw it away.
*/
printf("Moveopen returned deleg\n");
FREE((caddr_t)ndp, M_NFSCLDELEG);
}
return (error);
}
/*
* Recall all delegations on this client.
*/
static void
nfscl_totalrecall(struct nfsclclient *clp)
{
struct nfscldeleg *dp;
TAILQ_FOREACH(dp, &clp->nfsc_deleg, nfsdl_list) {
if ((dp->nfsdl_flags & NFSCLDL_DELEGRET) == 0)
dp->nfsdl_flags |= NFSCLDL_RECALL;
}
}
/*
* Relock byte ranges. Called for delegation recall and state expiry.
*/
static int
nfscl_relock(vnode_t vp, struct nfsclclient *clp, struct nfsmount *nmp,
struct nfscllockowner *lp, struct nfscllock *lop, struct ucred *cred,
NFSPROC_T *p)
{
struct nfscllockowner *nlp;
struct nfsfh *nfhp;
u_int64_t off, len;
- u_int32_t clidrev = 0;
int error, newone, donelocally;
off = lop->nfslo_first;
len = lop->nfslo_end - lop->nfslo_first;
error = nfscl_getbytelock(vp, off, len, lop->nfslo_type, cred, p,
clp, 1, NULL, lp->nfsl_lockflags, lp->nfsl_owner,
lp->nfsl_openowner, &nlp, &newone, &donelocally);
if (error || donelocally)
return (error);
- if (nmp->nm_clp != NULL)
- clidrev = nmp->nm_clp->nfsc_clientidrev;
- else
- clidrev = 0;
nfhp = VTONFS(vp)->n_fhp;
error = nfscl_trylock(nmp, vp, nfhp->nfh_fh,
nfhp->nfh_len, nlp, newone, 0, off,
len, lop->nfslo_type, cred, p);
if (error)
nfscl_freelockowner(nlp, 0);
return (error);
}
/*
* Called to re-open a file. Basically get a vnode for the file handle
* and then call nfsrpc_openrpc() to do the rest.
*/
static int
nfsrpc_reopen(struct nfsmount *nmp, u_int8_t *fhp, int fhlen,
u_int32_t mode, struct nfsclopen *op, struct nfscldeleg **dpp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsnode *np;
vnode_t vp;
int error;
error = nfscl_ngetreopen(nmp->nm_mountp, fhp, fhlen, p, &np);
if (error)
return (error);
vp = NFSTOV(np);
if (np->n_v4 != NULL) {
error = nfscl_tryopen(nmp, vp, np->n_v4->n4_data,
np->n_v4->n4_fhlen, fhp, fhlen, mode, op,
NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, dpp, 0, 0,
cred, p);
} else {
error = EINVAL;
}
vrele(vp);
return (error);
}
/*
* Try an open against the server. Just call nfsrpc_openrpc(), retrying while
* it returns NFSERR_DELAY. Also, try system credentials if the passed-in
* credentials fail.
*/
static int
nfscl_tryopen(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp, int fhlen,
u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op,
u_int8_t *name, int namelen, struct nfscldeleg **ndpp,
int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p)
{
int error;
do {
error = nfsrpc_openrpc(nmp, vp, fhp, fhlen, newfhp, newfhlen,
mode, op, name, namelen, ndpp, reclaim, delegtype, cred, p,
0, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstryop");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_openrpc(nmp, vp, fhp, fhlen, newfhp,
newfhlen, mode, op, name, namelen, ndpp, reclaim,
delegtype, cred, p, 1, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstryop");
} while (error == NFSERR_DELAY);
}
return (error);
}
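/*
 * The nfscl_try*() helpers (nfscl_tryopen() above and nfscl_trylock(),
 * nfscl_trydelegreturn() and nfscl_tryclose() below) share one control
 * flow: loop while the server answers NFSERR_DELAY, napping between
 * attempts, and make one more pass with system credentials if the caller's
 * credentials are rejected.  A compact userland sketch of just that flow,
 * with invented names (retry_rpc, do_rpc, demo_rpc, SKETCH_* constants):
 */
#include <stdio.h>
#include <unistd.h>

#define SKETCH_OK	0
#define SKETCH_DELAY	1	/* stands in for NFSERR_DELAY */
#define SKETCH_AUTHFAIL	2	/* stands in for EAUTH / EACCES */

static int
retry_rpc(int (*do_rpc)(int use_root), unsigned nap_ms)
{
	int error, use_root = 0;

	for (;;) {
		do {
			error = do_rpc(use_root);
			if (error == SKETCH_DELAY)
				usleep(nap_ms * 1000);	/* like nfs_catnap() */
		} while (error == SKETCH_DELAY);
		if (error == SKETCH_AUTHFAIL && use_root == 0) {
			use_root = 1;	/* like newnfs_setroot(cred) */
			continue;
		}
		return (error);
	}
}

static int
demo_rpc(int use_root)
{
	static int calls;

	calls++;
	if (calls < 3)
		return (SKETCH_DELAY);
	return (use_root ? SKETCH_OK : SKETCH_AUTHFAIL);
}

int
main(void)
{
	printf("final error: %d\n", retry_rpc(demo_rpc, 10));	/* prints 0 */
	return (0);
}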
/*
* Try a byte range lock. Just loop on nfsrpc_lock() while it returns
* NFSERR_DELAY. Also, retry with system credentials if the provided
* credentials don't work.
*/
static int
nfscl_trylock(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp,
int fhlen, struct nfscllockowner *nlp, int newone, int reclaim,
u_int64_t off, u_int64_t len, short type, struct ucred *cred, NFSPROC_T *p)
{
struct nfsrv_descript nfsd, *nd = &nfsd;
int error;
do {
error = nfsrpc_lock(nd, nmp, vp, fhp, fhlen, nlp, newone,
reclaim, off, len, type, cred, p, 0);
if (!error && nd->nd_repstat == NFSERR_DELAY)
(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
"nfstrylck");
} while (!error && nd->nd_repstat == NFSERR_DELAY);
if (!error)
error = nd->nd_repstat;
if (error == EAUTH || error == EACCES) {
/* Try again using root credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_lock(nd, nmp, vp, fhp, fhlen, nlp,
newone, reclaim, off, len, type, cred, p, 1);
if (!error && nd->nd_repstat == NFSERR_DELAY)
(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
"nfstrylck");
} while (!error && nd->nd_repstat == NFSERR_DELAY);
if (!error)
error = nd->nd_repstat;
}
return (error);
}
/*
* Try a delegreturn against the server. Just call nfsrpc_delegreturn(),
* retrying while it returns NFSERR_DELAY. Also, try system credentials if the
* passed-in credentials fail.
*/
static int
nfscl_trydelegreturn(struct nfscldeleg *dp, struct ucred *cred,
struct nfsmount *nmp, NFSPROC_T *p)
{
int error;
do {
error = nfsrpc_delegreturn(dp, cred, nmp, p, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrydp");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_delegreturn(dp, cred, nmp, p, 1);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrydp");
} while (error == NFSERR_DELAY);
}
return (error);
}
/*
* Try a close against the server. Just call nfsrpc_closerpc(),
* retrying while it returns NFSERR_DELAY. Also, try system credentials if the
* passed-in credentials fail.
*/
APPLESTATIC int
nfscl_tryclose(struct nfsclopen *op, struct ucred *cred,
struct nfsmount *nmp, NFSPROC_T *p)
{
struct nfsrv_descript nfsd, *nd = &nfsd;
int error;
do {
error = nfsrpc_closerpc(nd, nmp, op, cred, p, 0);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrycl");
} while (error == NFSERR_DELAY);
if (error == EAUTH || error == EACCES) {
/* Try again using system credentials */
newnfs_setroot(cred);
do {
error = nfsrpc_closerpc(nd, nmp, op, cred, p, 1);
if (error == NFSERR_DELAY)
(void) nfs_catnap(PZERO, error, "nfstrycl");
} while (error == NFSERR_DELAY);
}
return (error);
}
/*
* Decide if a delegation on a file permits close without flushing writes
* to the server. This might be a big performance win in some environments.
* (Not useful until the client does caching on local stable storage.)
*/
APPLESTATIC int
nfscl_mustflush(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return (1);
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (1);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags &
(NFSCLDL_WRITE | NFSCLDL_RECALL | NFSCLDL_DELEGRET)) ==
NFSCLDL_WRITE &&
(dp->nfsdl_sizelimit >= np->n_size ||
!NFSHASSTRICT3530(nmp))) {
NFSUNLOCKCLSTATE();
return (0);
}
NFSUNLOCKCLSTATE();
return (1);
}
/*
* See if a (write) delegation exists for this file.
*/
APPLESTATIC int
nfscl_nodeleg(vnode_t vp, int writedeleg)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np;
struct nfsmount *nmp;
np = VTONFS(vp);
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return (1);
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (1);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL &&
(dp->nfsdl_flags & (NFSCLDL_RECALL | NFSCLDL_DELEGRET)) == 0 &&
(writedeleg == 0 || (dp->nfsdl_flags & NFSCLDL_WRITE) ==
NFSCLDL_WRITE)) {
NFSUNLOCKCLSTATE();
return (0);
}
NFSUNLOCKCLSTATE();
return (1);
}
/*
* Look for an associated delegation that should be DelegReturned.
*/
APPLESTATIC int
nfscl_removedeleg(vnode_t vp, NFSPROC_T *p, nfsv4stateid_t *stp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsmount *nmp;
struct ucred *cred;
struct nfsnode *np;
int igotlock = 0, triedrecall = 0, needsrecall, retcnt = 0, islept;
nmp = VFSTONFS(vnode_mount(vp));
np = VTONFS(vp);
NFSLOCKCLSTATE();
/*
* Loop around waiting for:
* - outstanding I/O operations on delegations to complete
* - for a delegation on vp that has state, lock the client and
* do a recall
* - return delegation with no state
*/
while (1) {
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
needsrecall = 0;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
needsrecall = 1;
break;
}
}
if (!needsrecall) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
needsrecall = 1;
break;
}
}
}
if (needsrecall && !triedrecall) {
dp->nfsdl_flags |= NFSCLDL_DELEGRET;
islept = 0;
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
break;
}
if (islept)
continue;
NFSUNLOCKCLSTATE();
cred = newnfs_getcred();
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_recalldeleg(clp, nmp, dp, vp, cred, p, 0);
NFSFREECRED(cred);
triedrecall = 1;
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
continue;
}
*stp = dp->nfsdl_stateid;
retcnt = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
if (igotlock)
nfsv4_unlock(&clp->nfsc_lock, 0);
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
/*
* Look for associated delegation(s) that should be DelegReturned.
*/
APPLESTATIC int
nfscl_renamedeleg(vnode_t fvp, nfsv4stateid_t *fstp, int *gotfdp, vnode_t tvp,
nfsv4stateid_t *tstp, int *gottdp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsclowner *owp;
struct nfscllockowner *lp;
struct nfsmount *nmp;
struct ucred *cred;
struct nfsnode *np;
int igotlock = 0, triedrecall = 0, needsrecall, retcnt = 0, islept;
nmp = VFSTONFS(vnode_mount(fvp));
*gotfdp = 0;
*gottdp = 0;
NFSLOCKCLSTATE();
/*
* Loop around waiting for:
* - outstanding I/O operations on delegations to complete
* - for a delegation on fvp that has state, lock the client and
* do a recall
* - return delegation(s) with no state.
*/
while (1) {
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
np = VTONFS(fvp);
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL && *gotfdp == 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
needsrecall = 0;
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
needsrecall = 1;
break;
}
}
if (!needsrecall) {
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
needsrecall = 1;
break;
}
}
}
if (needsrecall && !triedrecall) {
dp->nfsdl_flags |= NFSCLDL_DELEGRET;
islept = 0;
while (!igotlock) {
igotlock = nfsv4_lock(&clp->nfsc_lock, 1,
&islept, NFSCLSTATEMUTEXPTR, NULL);
if (islept)
break;
}
if (islept)
continue;
NFSUNLOCKCLSTATE();
cred = newnfs_getcred();
newnfs_copycred(&dp->nfsdl_cred, cred);
(void) nfscl_recalldeleg(clp, nmp, dp, fvp, cred, p, 0);
NFSFREECRED(cred);
triedrecall = 1;
NFSLOCKCLSTATE();
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
continue;
}
*fstp = dp->nfsdl_stateid;
retcnt++;
*gotfdp = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
if (igotlock) {
nfsv4_unlock(&clp->nfsc_lock, 0);
igotlock = 0;
}
if (tvp != NULL) {
np = VTONFS(tvp);
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh,
np->n_fhp->nfh_len);
if (dp != NULL && *gottdp == 0) {
/*
* Wait for outstanding I/O ops to be done.
*/
if (dp->nfsdl_rwlock.nfslock_usecnt > 0) {
dp->nfsdl_rwlock.nfslock_lock |= NFSV4LOCK_WANTED;
(void) nfsmsleep(&dp->nfsdl_rwlock,
NFSCLSTATEMUTEXPTR, PZERO, "nfscld", NULL);
continue;
}
LIST_FOREACH(owp, &dp->nfsdl_owner, nfsow_list) {
if (!LIST_EMPTY(&owp->nfsow_open)) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
LIST_FOREACH(lp, &dp->nfsdl_lock, nfsl_list) {
if (!LIST_EMPTY(&lp->nfsl_lock)) {
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
*tstp = dp->nfsdl_stateid;
retcnt++;
*gottdp = 1;
nfscl_cleandeleg(dp);
nfscl_freedeleg(&clp->nfsc_deleg, dp);
}
}
NFSUNLOCKCLSTATE();
return (retcnt);
}
}
/*
* Get a reference on the clientid associated with the mount point.
* Return 1 on success, 0 otherwise.
*/
APPLESTATIC int
nfscl_getref(struct nfsmount *nmp)
{
struct nfsclclient *clp;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (0);
}
nfsv4_getref(&clp->nfsc_lock, NULL, NFSCLSTATEMUTEXPTR, NULL);
NFSUNLOCKCLSTATE();
return (1);
}
/*
* Release a reference on a clientid acquired with the above call.
*/
APPLESTATIC void
nfscl_relref(struct nfsmount *nmp)
{
struct nfsclclient *clp;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
nfsv4_relref(&clp->nfsc_lock);
NFSUNLOCKCLSTATE();
}
/*
* Save the size attribute in the delegation, since the nfsnode
* is going away.
*/
APPLESTATIC void
nfscl_reclaimnode(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE))
dp->nfsdl_size = np->n_size;
NFSUNLOCKCLSTATE();
}
/*
* Get the saved size attribute from the delegation, since this is a
* newly allocated nfsnode.
*/
APPLESTATIC void
nfscl_newnode(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE))
np->n_size = dp->nfsdl_size;
NFSUNLOCKCLSTATE();
}
/*
* If there is a valid write delegation for this file, set the modtime
* to the local clock time.
*/
APPLESTATIC void
nfscl_delegmodtime(vnode_t vp)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL && (dp->nfsdl_flags & NFSCLDL_WRITE)) {
nanotime(&dp->nfsdl_modtime);
dp->nfsdl_flags |= NFSCLDL_MODTIMESET;
}
NFSUNLOCKCLSTATE();
}
/*
* If there is a valid write delegation for this file with a modtime set,
* put that modtime in mtime.
*/
APPLESTATIC void
nfscl_deleggetmodtime(vnode_t vp, struct timespec *mtime)
{
struct nfsclclient *clp;
struct nfscldeleg *dp;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp;
nmp = VFSTONFS(vnode_mount(vp));
if (!NFSHASNFSV4(nmp))
return;
NFSLOCKCLSTATE();
clp = nfscl_findcl(nmp);
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return;
}
dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (dp != NULL &&
(dp->nfsdl_flags & (NFSCLDL_WRITE | NFSCLDL_MODTIMESET)) ==
(NFSCLDL_WRITE | NFSCLDL_MODTIMESET))
*mtime = dp->nfsdl_modtime;
NFSUNLOCKCLSTATE();
}
static int
nfscl_errmap(struct nfsrv_descript *nd, u_int32_t minorvers)
{
short *defaulterrp, *errp;
if (!nd->nd_repstat)
return (0);
if (nd->nd_procnum == NFSPROC_NOOP)
return (txdr_unsigned(nd->nd_repstat & 0xffff));
if (nd->nd_repstat == EBADRPC)
return (txdr_unsigned(NFSERR_BADXDR));
if (nd->nd_repstat == NFSERR_MINORVERMISMATCH ||
nd->nd_repstat == NFSERR_OPILLEGAL)
return (txdr_unsigned(nd->nd_repstat));
if (nd->nd_repstat >= NFSERR_BADIOMODE && nd->nd_repstat < 20000 &&
minorvers > NFSV4_MINORVERSION) {
/* NFSv4.n error. */
return (txdr_unsigned(nd->nd_repstat));
}
if (nd->nd_procnum < NFSV4OP_CBNOPS)
errp = defaulterrp = nfscl_cberrmap[nd->nd_procnum];
else
return (txdr_unsigned(nd->nd_repstat));
while (*++errp)
if (*errp == (short)nd->nd_repstat)
return (txdr_unsigned(nd->nd_repstat));
return (txdr_unsigned(*defaulterrp));
}
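/*
 * nfscl_errmap() above clamps a callback error to the set the protocol
 * permits for that operation by scanning a zero-terminated table whose
 * first entry is the default reply.  The same lookup in isolation, with a
 * made-up table and invented names (map_error, allowed_errs):
 */
#include <stdio.h>

static short allowed_errs[] = { 70 /* default reply */, 2, 13, 22, 0 };

static short
map_error(short err, const short *tbl)
{
	const short *ep = tbl;

	while (*++ep != 0)
		if (*ep == err)
			return (err);	/* error is permitted as-is */
	return (tbl[0]);	/* not permitted: report the default instead */
}

int
main(void)
{
	printf("%d %d\n", map_error(13, allowed_errs),
	    map_error(99, allowed_errs));	/* prints "13 70" */
	return (0);
}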
/*
* Called to find/add a layout to a client.
* This function returns the layout with a refcnt (shared lock) upon
* success (returns 0) or with no lock/refcnt on the layout when an
* error is returned.
* If a layout is passed in via lypp, it is locked (exclusively locked).
*/
APPLESTATIC int
nfscl_layout(struct nfsmount *nmp, vnode_t vp, u_int8_t *fhp, int fhlen,
nfsv4stateid_t *stateidp, int layouttype, int retonclose,
struct nfsclflayouthead *fhlp, struct nfscllayout **lypp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscllayout *lyp, *tlyp;
struct nfsclflayout *flp;
struct nfsnode *np = VTONFS(vp);
mount_t mp;
int layout_passed_in;
mp = nmp->nm_mountp;
layout_passed_in = 1;
tlyp = NULL;
lyp = *lypp;
if (lyp == NULL) {
layout_passed_in = 0;
tlyp = malloc(sizeof(*tlyp) + fhlen - 1, M_NFSLAYOUT,
M_WAITOK | M_ZERO);
}
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
if (layout_passed_in != 0)
nfsv4_unlock(&lyp->nfsly_lock, 0);
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (EPERM);
}
if (lyp == NULL) {
/*
* Although no lyp was passed in, another thread might have
* allocated one. If one is found, just increment its ref
* count and return it.
*/
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp == NULL) {
lyp = tlyp;
tlyp = NULL;
lyp->nfsly_stateid.seqid = stateidp->seqid;
lyp->nfsly_stateid.other[0] = stateidp->other[0];
lyp->nfsly_stateid.other[1] = stateidp->other[1];
lyp->nfsly_stateid.other[2] = stateidp->other[2];
lyp->nfsly_lastbyte = 0;
LIST_INIT(&lyp->nfsly_flayread);
LIST_INIT(&lyp->nfsly_flayrw);
LIST_INIT(&lyp->nfsly_recall);
lyp->nfsly_filesid[0] = np->n_vattr.na_filesid[0];
lyp->nfsly_filesid[1] = np->n_vattr.na_filesid[1];
lyp->nfsly_clp = clp;
if (layouttype == NFSLAYOUT_FLEXFILE)
lyp->nfsly_flags = NFSLY_FLEXFILE;
else
lyp->nfsly_flags = NFSLY_FILES;
if (retonclose != 0)
lyp->nfsly_flags |= NFSLY_RETONCLOSE;
lyp->nfsly_fhlen = fhlen;
NFSBCOPY(fhp, lyp->nfsly_fh, fhlen);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
LIST_INSERT_HEAD(NFSCLLAYOUTHASH(clp, fhp, fhlen), lyp,
nfsly_hash);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
nfscl_layoutcnt++;
} else {
if (retonclose != 0)
lyp->nfsly_flags |= NFSLY_RETONCLOSE;
TAILQ_REMOVE(&clp->nfsc_layout, lyp, nfsly_list);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
}
nfsv4_getref(&lyp->nfsly_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (EPERM);
}
*lypp = lyp;
} else
lyp->nfsly_stateid.seqid = stateidp->seqid;
/* Merge the new list of File Layouts into the list. */
flp = LIST_FIRST(fhlp);
if (flp != NULL) {
if (flp->nfsfl_iomode == NFSLAYOUTIOMODE_READ)
nfscl_mergeflayouts(&lyp->nfsly_flayread, fhlp);
else
nfscl_mergeflayouts(&lyp->nfsly_flayrw, fhlp);
}
if (layout_passed_in != 0)
nfsv4_unlock(&lyp->nfsly_lock, 1);
NFSUNLOCKCLSTATE();
if (tlyp != NULL)
free(tlyp, M_NFSLAYOUT);
return (0);
}
/*
* Search for a layout by MDS file handle.
* If one is found, it is returned with a refcnt (shared lock) when retflpp is
* returned non-NULL, or exclusively locked when retflpp is returned NULL.
*/
struct nfscllayout *
nfscl_getlayout(struct nfsclclient *clp, uint8_t *fhp, int fhlen,
uint64_t off, struct nfsclflayout **retflpp, int *recalledp)
{
struct nfscllayout *lyp;
mount_t mp;
int error, igotlock;
mp = clp->nfsc_nmp->nm_mountp;
*recalledp = 0;
*retflpp = NULL;
NFSLOCKCLSTATE();
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp != NULL) {
if ((lyp->nfsly_flags & NFSLY_RECALL) == 0) {
TAILQ_REMOVE(&clp->nfsc_layout, lyp, nfsly_list);
TAILQ_INSERT_HEAD(&clp->nfsc_layout, lyp, nfsly_list);
lyp->nfsly_timestamp = NFSD_MONOSEC + 120;
error = nfscl_findlayoutforio(lyp, off,
NFSV4OPEN_ACCESSREAD, retflpp);
if (error == 0)
nfsv4_getref(&lyp->nfsly_lock, NULL,
NFSCLSTATEMUTEXPTR, mp);
else {
do {
igotlock = nfsv4_lock(&lyp->nfsly_lock,
1, NULL, NFSCLSTATEMUTEXPTR, mp);
} while (igotlock == 0 && !NFSCL_FORCEDISM(mp));
*retflpp = NULL;
}
if (NFSCL_FORCEDISM(mp)) {
lyp = NULL;
*recalledp = 1;
}
} else {
lyp = NULL;
*recalledp = 1;
}
}
NFSUNLOCKCLSTATE();
return (lyp);
}
/*
* Search for a layout by MDS file handle. If one is found, mark it to be
* recalled if it is already marked "return on close".
*/
static void
nfscl_retoncloselayout(vnode_t vp, struct nfsclclient *clp, uint8_t *fhp,
int fhlen, struct nfsclrecalllayout **recallpp)
{
struct nfscllayout *lyp;
uint32_t iomode;
if (vp->v_type != VREG || !NFSHASPNFS(VFSTONFS(vnode_mount(vp))) ||
nfscl_enablecallb == 0 || nfs_numnfscbd == 0 ||
(VTONFS(vp)->n_flag & NNOLAYOUT) != 0)
return;
lyp = nfscl_findlayout(clp, fhp, fhlen);
if (lyp != NULL && (lyp->nfsly_flags & (NFSLY_RETONCLOSE |
NFSLY_RECALL)) == NFSLY_RETONCLOSE) {
iomode = 0;
if (!LIST_EMPTY(&lyp->nfsly_flayread))
iomode |= NFSLAYOUTIOMODE_READ;
if (!LIST_EMPTY(&lyp->nfsly_flayrw))
iomode |= NFSLAYOUTIOMODE_RW;
(void)nfscl_layoutrecall(NFSLAYOUTRETURN_FILE, lyp, iomode,
0, UINT64_MAX, lyp->nfsly_stateid.seqid, *recallpp);
NFSCL_DEBUG(4, "retoncls recall iomode=%d\n", iomode);
*recallpp = NULL;
}
}
/*
* Dereference a layout.
*/
void
nfscl_rellayout(struct nfscllayout *lyp, int exclocked)
{
NFSLOCKCLSTATE();
if (exclocked != 0)
nfsv4_unlock(&lyp->nfsly_lock, 0);
else
nfsv4_relref(&lyp->nfsly_lock);
NFSUNLOCKCLSTATE();
}
/*
* Search for a devinfo by deviceid. If one is found, return it after
* acquiring a reference count on it.
*/
struct nfscldevinfo *
nfscl_getdevinfo(struct nfsclclient *clp, uint8_t *deviceid,
struct nfscldevinfo *dip)
{
NFSLOCKCLSTATE();
if (dip == NULL)
dip = nfscl_finddevinfo(clp, deviceid);
if (dip != NULL)
dip->nfsdi_refcnt++;
NFSUNLOCKCLSTATE();
return (dip);
}
/*
* Dereference a devinfo structure.
*/
static void
nfscl_reldevinfo_locked(struct nfscldevinfo *dip)
{
dip->nfsdi_refcnt--;
if (dip->nfsdi_refcnt == 0)
wakeup(&dip->nfsdi_refcnt);
}
/*
* Dereference a devinfo structure.
*/
void
nfscl_reldevinfo(struct nfscldevinfo *dip)
{
NFSLOCKCLSTATE();
nfscl_reldevinfo_locked(dip);
NFSUNLOCKCLSTATE();
}
/*
* Find a layout for this file handle. Return NULL upon failure.
*/
static struct nfscllayout *
nfscl_findlayout(struct nfsclclient *clp, u_int8_t *fhp, int fhlen)
{
struct nfscllayout *lyp;
LIST_FOREACH(lyp, NFSCLLAYOUTHASH(clp, fhp, fhlen), nfsly_hash)
if (lyp->nfsly_fhlen == fhlen &&
!NFSBCMP(lyp->nfsly_fh, fhp, fhlen))
break;
return (lyp);
}
/*
* Find a devinfo for this deviceid. Return NULL upon failure.
*/
static struct nfscldevinfo *
nfscl_finddevinfo(struct nfsclclient *clp, uint8_t *deviceid)
{
struct nfscldevinfo *dip;
LIST_FOREACH(dip, &clp->nfsc_devinfo, nfsdi_list)
if (NFSBCMP(dip->nfsdi_deviceid, deviceid, NFSX_V4DEVICEID)
== 0)
break;
return (dip);
}
/*
* Merge the new file layout list into the main one, maintaining it in
* increasing offset order.
*/
static void
nfscl_mergeflayouts(struct nfsclflayouthead *fhlp,
struct nfsclflayouthead *newfhlp)
{
struct nfsclflayout *flp, *nflp, *prevflp, *tflp;
flp = LIST_FIRST(fhlp);
prevflp = NULL;
LIST_FOREACH_SAFE(nflp, newfhlp, nfsfl_list, tflp) {
while (flp != NULL && flp->nfsfl_off < nflp->nfsfl_off) {
prevflp = flp;
flp = LIST_NEXT(flp, nfsfl_list);
}
if (prevflp == NULL)
LIST_INSERT_HEAD(fhlp, nflp, nfsfl_list);
else
LIST_INSERT_AFTER(prevflp, nflp, nfsfl_list);
prevflp = nflp;
}
}
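/*
 * nfscl_mergeflayouts() above walks the existing list to find the insertion
 * point that keeps file layouts in increasing offset order.  The same
 * sorted-insert idea on a plain singly linked list, with invented types and
 * names (struct seg, insert_sorted):
 */
#include <stdio.h>

struct seg {
	unsigned long long off;
	struct seg *next;
};

/* Insert one segment so that the list stays sorted by offset. */
static void
insert_sorted(struct seg **headp, struct seg *nsp)
{
	struct seg **pp = headp;

	while (*pp != NULL && (*pp)->off < nsp->off)
		pp = &(*pp)->next;
	nsp->next = *pp;
	*pp = nsp;
}

int
main(void)
{
	struct seg a = { 200, NULL }, b = { 100, NULL }, c = { 300, NULL };
	struct seg *head = NULL, *sp;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (sp = head; sp != NULL; sp = sp->next)
		printf("%llu\n", sp->off);	/* prints 100 200 300 */
	return (0);
}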
/*
* Add this nfscldevinfo to the client, if it doesn't already exist.
* This function consumes the structure pointed at by dip, if not NULL.
*/
APPLESTATIC int
nfscl_adddevinfo(struct nfsmount *nmp, struct nfscldevinfo *dip,
struct nfsclflayout *flp)
{
struct nfsclclient *clp;
struct nfscldevinfo *tdip;
uint8_t *dev;
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
NFSUNLOCKCLSTATE();
if (dip != NULL)
free(dip, M_NFSDEVINFO);
return (ENODEV);
}
if ((flp->nfsfl_flags & NFSFL_FILE) != 0)
dev = flp->nfsfl_dev;
else
dev = flp->nfsfl_ffm[0].dev;
tdip = nfscl_finddevinfo(clp, dev);
if (tdip != NULL) {
tdip->nfsdi_layoutrefs++;
flp->nfsfl_devp = tdip;
nfscl_reldevinfo_locked(tdip);
NFSUNLOCKCLSTATE();
if (dip != NULL)
free(dip, M_NFSDEVINFO);
return (0);
}
if (dip != NULL) {
LIST_INSERT_HEAD(&clp->nfsc_devinfo, dip, nfsdi_list);
dip->nfsdi_layoutrefs = 1;
flp->nfsfl_devp = dip;
}
NFSUNLOCKCLSTATE();
if (dip == NULL)
return (ENODEV);
return (0);
}
/*
* Free up a layout structure and associated file layout structure(s).
*/
APPLESTATIC void
nfscl_freelayout(struct nfscllayout *layp)
{
struct nfsclflayout *flp, *nflp;
struct nfsclrecalllayout *rp, *nrp;
LIST_FOREACH_SAFE(flp, &layp->nfsly_flayread, nfsfl_list, nflp) {
LIST_REMOVE(flp, nfsfl_list);
nfscl_freeflayout(flp);
}
LIST_FOREACH_SAFE(flp, &layp->nfsly_flayrw, nfsfl_list, nflp) {
LIST_REMOVE(flp, nfsfl_list);
nfscl_freeflayout(flp);
}
LIST_FOREACH_SAFE(rp, &layp->nfsly_recall, nfsrecly_list, nrp) {
LIST_REMOVE(rp, nfsrecly_list);
free(rp, M_NFSLAYRECALL);
}
nfscl_layoutcnt--;
free(layp, M_NFSLAYOUT);
}
/*
* Free up a file layout structure.
*/
APPLESTATIC void
nfscl_freeflayout(struct nfsclflayout *flp)
{
int i, j;
if ((flp->nfsfl_flags & NFSFL_FILE) != 0)
for (i = 0; i < flp->nfsfl_fhcnt; i++)
free(flp->nfsfl_fh[i], M_NFSFH);
if ((flp->nfsfl_flags & NFSFL_FLEXFILE) != 0)
for (i = 0; i < flp->nfsfl_mirrorcnt; i++)
for (j = 0; j < flp->nfsfl_ffm[i].fhcnt; j++)
free(flp->nfsfl_ffm[i].fh[j], M_NFSFH);
if (flp->nfsfl_devp != NULL)
flp->nfsfl_devp->nfsdi_layoutrefs--;
free(flp, M_NFSFLAYOUT);
}
/*
* Free up a file layout devinfo structure.
*/
APPLESTATIC void
nfscl_freedevinfo(struct nfscldevinfo *dip)
{
free(dip, M_NFSDEVINFO);
}
/*
* Mark any layouts that match as recalled.
*/
static int
nfscl_layoutrecall(int recalltype, struct nfscllayout *lyp, uint32_t iomode,
uint64_t off, uint64_t len, uint32_t stateseqid,
struct nfsclrecalllayout *recallp)
{
struct nfsclrecalllayout *rp, *orp;
recallp->nfsrecly_recalltype = recalltype;
recallp->nfsrecly_iomode = iomode;
recallp->nfsrecly_stateseqid = stateseqid;
recallp->nfsrecly_off = off;
recallp->nfsrecly_len = len;
/*
* Order the list as file returns first, followed by fsid and "all"
* returns, both in increasing stateseqid order.
* Note that the seqids wrap around, so 1 is after 0xffffffff.
* (I'm not sure this is correct because I find RFC5661 confusing
* on this, but hopefully it will work ok.)
*/
orp = NULL;
LIST_FOREACH(rp, &lyp->nfsly_recall, nfsrecly_list) {
orp = rp;
if ((recalltype == NFSLAYOUTRETURN_FILE &&
(rp->nfsrecly_recalltype != NFSLAYOUTRETURN_FILE ||
nfscl_seq(stateseqid, rp->nfsrecly_stateseqid) != 0)) ||
(recalltype != NFSLAYOUTRETURN_FILE &&
rp->nfsrecly_recalltype != NFSLAYOUTRETURN_FILE &&
nfscl_seq(stateseqid, rp->nfsrecly_stateseqid) != 0)) {
LIST_INSERT_BEFORE(rp, recallp, nfsrecly_list);
break;
}
}
if (rp == NULL) {
if (orp == NULL)
LIST_INSERT_HEAD(&lyp->nfsly_recall, recallp,
nfsrecly_list);
else
LIST_INSERT_AFTER(orp, recallp, nfsrecly_list);
}
lyp->nfsly_flags |= NFSLY_RECALL;
return (0);
}
/*
* Compare the two seqids for ordering. The trick is that the seqids can
* wrap around from 0xffffffff->0, so check for the cases where one
* has wrapped around.
* Return 1 if seqid1 comes before seqid2, 0 otherwise.
*/
static int
nfscl_seq(uint32_t seqid1, uint32_t seqid2)
{
if (seqid2 > seqid1 && (seqid2 - seqid1) >= 0x7fffffff)
/* seqid2 has wrapped around. */
return (0);
if (seqid1 > seqid2 && (seqid1 - seqid2) >= 0x7fffffff)
/* seqid1 has wrapped around. */
return (1);
if (seqid1 <= seqid2)
return (1);
return (0);
}
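/*
 * A standalone sketch of the wraparound-aware ordering that nfscl_seq()
 * above implements: seqids live on a 32 bit ring, so a value more than half
 * the ring "ahead" of the other is treated as having wrapped.  The name
 * seq_before is invented for illustration.
 */
#include <assert.h>
#include <stdint.h>

static int
seq_before(uint32_t a, uint32_t b)
{
	if (b > a && (b - a) >= 0x7fffffff)
		return (0);	/* b wrapped around, so a is actually newer */
	if (a > b && (a - b) >= 0x7fffffff)
		return (1);	/* a wrapped around, so a comes first */
	return (a <= b);
}

int
main(void)
{
	assert(seq_before(1, 2));
	assert(!seq_before(2, 1));
	assert(seq_before(0xfffffffeU, 1));	/* 1 is after the wrap */
	return (0);
}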
/*
* Do a layout return for each of the recalls.
*/
static void
nfscl_layoutreturn(struct nfsmount *nmp, struct nfscllayout *lyp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclrecalllayout *rp;
nfsv4stateid_t stateid;
int layouttype;
NFSBCOPY(lyp->nfsly_stateid.other, stateid.other, NFSX_STATEIDOTHER);
stateid.seqid = lyp->nfsly_stateid.seqid;
if ((lyp->nfsly_flags & NFSLY_FILES) != 0)
layouttype = NFSLAYOUT_NFSV4_1_FILES;
else
layouttype = NFSLAYOUT_FLEXFILE;
LIST_FOREACH(rp, &lyp->nfsly_recall, nfsrecly_list) {
(void)nfsrpc_layoutreturn(nmp, lyp->nfsly_fh,
lyp->nfsly_fhlen, 0, layouttype,
rp->nfsrecly_iomode, rp->nfsrecly_recalltype,
rp->nfsrecly_off, rp->nfsrecly_len,
&stateid, cred, p, NULL);
}
}
/*
* Do the layout commit for a file layout.
*/
static void
nfscl_dolayoutcommit(struct nfsmount *nmp, struct nfscllayout *lyp,
struct ucred *cred, NFSPROC_T *p)
{
struct nfsclflayout *flp;
uint64_t len;
int error, layouttype;
if ((lyp->nfsly_flags & NFSLY_FILES) != 0)
layouttype = NFSLAYOUT_NFSV4_1_FILES;
else
layouttype = NFSLAYOUT_FLEXFILE;
LIST_FOREACH(flp, &lyp->nfsly_flayrw, nfsfl_list) {
if (layouttype == NFSLAYOUT_FLEXFILE &&
(flp->nfsfl_fflags & NFSFLEXFLAG_NO_LAYOUTCOMMIT) != 0) {
NFSCL_DEBUG(4, "Flex file: no layoutcommit\n");
/* If not supported, don't bother doing it. */
NFSLOCKMNT(nmp);
nmp->nm_state |= NFSSTA_NOLAYOUTCOMMIT;
NFSUNLOCKMNT(nmp);
break;
} else if (flp->nfsfl_off <= lyp->nfsly_lastbyte) {
len = flp->nfsfl_end - flp->nfsfl_off;
error = nfsrpc_layoutcommit(nmp, lyp->nfsly_fh,
lyp->nfsly_fhlen, 0, flp->nfsfl_off, len,
lyp->nfsly_lastbyte, &lyp->nfsly_stateid,
layouttype, cred, p, NULL);
NFSCL_DEBUG(4, "layoutcommit err=%d\n", error);
if (error == NFSERR_NOTSUPP) {
/* If not supported, don't bother doing it. */
NFSLOCKMNT(nmp);
nmp->nm_state |= NFSSTA_NOLAYOUTCOMMIT;
NFSUNLOCKMNT(nmp);
break;
}
}
}
}
/*
* Commit all layouts for a file (vnode).
*/
int
nfscl_layoutcommit(vnode_t vp, NFSPROC_T *p)
{
struct nfsclclient *clp;
struct nfscllayout *lyp;
struct nfsnode *np = VTONFS(vp);
mount_t mp;
struct nfsmount *nmp;
mp = vnode_mount(vp);
nmp = VFSTONFS(mp);
if (NFSHASNOLAYOUTCOMMIT(nmp))
return (0);
NFSLOCKCLSTATE();
clp = nmp->nm_clp;
if (clp == NULL) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
lyp = nfscl_findlayout(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len);
if (lyp == NULL) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
nfsv4_getref(&lyp->nfsly_lock, NULL, NFSCLSTATEMUTEXPTR, mp);
if (NFSCL_FORCEDISM(mp)) {
NFSUNLOCKCLSTATE();
return (EPERM);
}
tryagain:
if ((lyp->nfsly_flags & NFSLY_WRITTEN) != 0) {
lyp->nfsly_flags &= ~NFSLY_WRITTEN;
NFSUNLOCKCLSTATE();
NFSCL_DEBUG(4, "do layoutcommit2\n");
nfscl_dolayoutcommit(clp->nfsc_nmp, lyp, NFSPROCCRED(p), p);
NFSLOCKCLSTATE();
goto tryagain;
}
nfsv4_relref(&lyp->nfsly_lock);
NFSUNLOCKCLSTATE();
return (0);
}
Index: head/sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c (revision 327172)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c (revision 327173)
@@ -1,3443 +1,3441 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/capsicum.h>
/*
* Functions that perform the vfs operations required by the routines in
* nfsd_serv.c. It is hoped that this change will make the server more
* portable.
*/
#include <fs/nfs/nfsport.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <nlm/nlm_prot.h>
#include <nlm/nlm.h>
FEATURE(nfsd, "NFSv4 server");
extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
extern int nfsrv_useacl;
extern int newnfs_numnfsd;
extern struct mount nfsv4root_mnt;
extern struct nfsrv_stablefirst nfsrv_stablefirst;
extern void (*nfsd_call_servertimer)(void);
extern SVCPOOL *nfsrvd_pool;
extern struct nfsv4lock nfsd_suspend_lock;
extern struct nfsclienthashhead *nfsclienthash;
extern struct nfslockhashhead *nfslockhash;
extern struct nfssessionhash *nfssessionhash;
extern int nfsrv_sessionhashsize;
extern struct nfsstatsv1 nfsstatsv1;
struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
NFSDLOCKMUTEX;
struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
struct mtx nfsrc_udpmtx;
struct mtx nfs_v4root_mutex;
struct nfsrvfh nfs_rootfh, nfs_pubfh;
int nfs_pubfhset = 0, nfs_rootfhset = 0;
struct proc *nfsd_master_proc = NULL;
int nfsd_debuglevel = 0;
static pid_t nfsd_master_pid = (pid_t)-1;
static char nfsd_master_comm[MAXCOMLEN + 1];
static struct timeval nfsd_master_start;
static uint32_t nfsv4_sysid = 0;
static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
struct ucred *);
int nfsrv_enable_crossmntpt = 1;
static int nfs_commit_blks;
static int nfs_commit_miss;
extern int nfsrv_issuedelegs;
extern int nfsrv_dolocallocks;
extern int nfsd_enable_stringtouid;
SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
&nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
&nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
&nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel,
0, "Debug level for NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW,
&nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names");
#define MAX_REORDERED_RPC 16
#define NUM_HEURISTIC 1031
#define NHUSE_INIT 64
#define NHUSE_INC 16
#define NHUSE_MAX 2048
static struct nfsheur {
struct vnode *nh_vp; /* vp to match (unreferenced pointer) */
off_t nh_nextoff; /* next offset for sequential detection */
int nh_use; /* use count for selection */
int nh_seqcount; /* heuristic */
} nfsheur[NUM_HEURISTIC];
/*
* Heuristic to detect sequential operation.
*/
static struct nfsheur *
nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp)
{
struct nfsheur *nh;
int hi, try;
/* Locate best candidate. */
try = 32;
hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
nh = &nfsheur[hi];
while (try--) {
if (nfsheur[hi].nh_vp == vp) {
nh = &nfsheur[hi];
break;
}
if (nfsheur[hi].nh_use > 0)
--nfsheur[hi].nh_use;
hi = (hi + 1) % NUM_HEURISTIC;
if (nfsheur[hi].nh_use < nh->nh_use)
nh = &nfsheur[hi];
}
/* Initialize hint if this is a new file. */
if (nh->nh_vp != vp) {
nh->nh_vp = vp;
nh->nh_nextoff = uio->uio_offset;
nh->nh_use = NHUSE_INIT;
if (uio->uio_offset == 0)
nh->nh_seqcount = 4;
else
nh->nh_seqcount = 1;
}
/* Calculate heuristic. */
if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
uio->uio_offset == nh->nh_nextoff) {
/* See comments in vfs_vnops.c:sequential_heuristic(). */
nh->nh_seqcount += howmany(uio->uio_resid, 16384);
if (nh->nh_seqcount > IO_SEQMAX)
nh->nh_seqcount = IO_SEQMAX;
} else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC *
imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) {
/* Probably a reordered RPC, leave seqcount alone. */
} else if (nh->nh_seqcount > 1) {
nh->nh_seqcount /= 2;
} else {
nh->nh_seqcount = 0;
}
nh->nh_use += NHUSE_INC;
if (nh->nh_use > NHUSE_MAX)
nh->nh_use = NHUSE_MAX;
return (nh);
}
/*
* Get attributes into nfsvattr structure.
*/
int
nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, int vpislocked)
{
int error, lockedit = 0;
if (vpislocked == 0) {
/*
* When vpislocked == 0, the vnode is either exclusively
* locked by this thread or not locked by this thread at all.
* As such, take a shared lock on it unless it is already
* exclusively locked.
*/
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
lockedit = 1;
NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
}
}
error = VOP_GETATTR(vp, &nvap->na_vattr, cred);
if (lockedit != 0)
NFSVOPUNLOCK(vp, 0);
NFSEXITCODE(error);
return (error);
}
/*
* Get a file handle for a vnode.
*/
int
nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
{
int error;
NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fhp->fh_fid);
NFSEXITCODE(error);
return (error);
}
/*
* Perform access checking for vnodes obtained from file handles that would
* refer to files already opened by a Unix client. You cannot just use
* vn_writechk() and VOP_ACCESSX() for two reasons.
* 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
* case.
* 2 - The owner is to be given access irrespective of mode bits for some
* operations, so that processes that chmod after opening a file don't
* break.
*/
int
nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
u_int32_t *supportedtypep)
{
struct vattr vattr;
int error = 0, getret = 0;
if (vpislocked == 0) {
if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
error = EPERM;
goto out;
}
}
if (accmode & VWRITE) {
/* Just vn_writechk() changed to check rdonly */
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket or a block or character
* device resident on the file system.
*/
if (NFSVNO_EXRDONLY(exp) ||
(vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
error = EROFS;
default:
break;
}
}
/*
* If there's shared text associated with
* the inode, try to free it up once. If
* we fail, we can't allow writing.
*/
if (VOP_IS_TEXT(vp) && error == 0)
error = ETXTBSY;
}
if (error != 0) {
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
goto out;
}
/*
* Should the override still be applied when ACLs are enabled?
*/
error = VOP_ACCESSX(vp, accmode, cred, p);
if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
/*
* Try again with VEXPLICIT_DENY, to see if the test for
* deletion is supported.
*/
error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
if (error == 0) {
if (vp->v_type == VDIR) {
accmode &= ~(VDELETE | VDELETE_CHILD);
accmode |= VWRITE;
error = VOP_ACCESSX(vp, accmode, cred, p);
} else if (supportedtypep != NULL) {
*supportedtypep &= ~NFSACCESS_DELETE;
}
}
}
/*
* Allow certain operations for the owner (reads and writes
* on files that are already open).
*/
if (override != NFSACCCHK_NOOVERRIDE &&
(error == EPERM || error == EACCES)) {
if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
error = 0;
else if (override & NFSACCCHK_ALLOWOWNER) {
getret = VOP_GETATTR(vp, &vattr, cred);
if (getret == 0 && cred->cr_uid == vattr.va_uid)
error = 0;
}
}
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Set attribute(s) vnop.
*/
int
nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
int error;
error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
NFSEXITCODE(error);
return (error);
}
/*
* Set up nameidata for a lookup() call and do it.
*/
int
nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
struct vnode **retdirp)
{
struct componentname *cnp = &ndp->ni_cnd;
int i;
struct iovec aiov;
struct uio auio;
int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
- int error = 0, crossmnt;
+ int error = 0;
char *cp;
*retdirp = NULL;
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_lcf = 0;
/*
* Extract and set starting directory.
*/
if (dp->v_type != VDIR) {
if (islocked)
vput(dp);
else
vrele(dp);
nfsvno_relpathbuf(ndp);
error = ENOTDIR;
goto out1;
}
if (islocked)
NFSVOPUNLOCK(dp, 0);
VREF(dp);
*retdirp = dp;
if (NFSVNO_EXRDONLY(exp))
cnp->cn_flags |= RDONLY;
ndp->ni_segflg = UIO_SYSSPACE;
- crossmnt = 1;
if (nd->nd_flag & ND_PUBLOOKUP) {
ndp->ni_loopcnt = 0;
if (cnp->cn_pnbuf[0] == '/') {
vrele(dp);
/*
* Check for degenerate pathnames here, since lookup()
* panics on them.
*/
for (i = 1; i < ndp->ni_pathlen; i++)
if (cnp->cn_pnbuf[i] != '/')
break;
if (i == ndp->ni_pathlen) {
error = NFSERR_ACCES;
goto out;
}
dp = rootvnode;
VREF(dp);
}
} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
(nd->nd_flag & ND_NFSV4) == 0) {
/*
* Only cross mount points for NFSv4 when doing a
* mount while traversing the file system above
* the mount point, unless nfsrv_enable_crossmntpt is set.
*/
cnp->cn_flags |= NOCROSSMOUNT;
- crossmnt = 0;
}
/*
* Initialize for scan, set ni_startdir and bump ref on dp again
* because lookup() will dereference ni_startdir.
*/
cnp->cn_thread = p;
ndp->ni_startdir = dp;
ndp->ni_rootdir = rootvnode;
ndp->ni_topdir = NULL;
if (!lockleaf)
cnp->cn_flags |= LOCKLEAF;
for (;;) {
cnp->cn_nameptr = cnp->cn_pnbuf;
/*
* Call lookup() to do the real work. If an error occurs,
* ndp->ni_vp and ni_dvp are left uninitialized or NULL and
* we do not have to dereference anything before returning.
* In either case ni_startdir will be dereferenced and NULLed
* out.
*/
error = lookup(ndp);
if (error)
break;
/*
* Check for encountering a symbolic link. Trivial
* termination occurs if no symlink encountered.
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
nfsvno_relpathbuf(ndp);
if (ndp->ni_vp && !lockleaf)
NFSVOPUNLOCK(ndp->ni_vp, 0);
break;
}
/*
* Validate symlink
*/
if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
NFSVOPUNLOCK(ndp->ni_dvp, 0);
if (!(nd->nd_flag & ND_PUBLOOKUP)) {
error = EINVAL;
goto badlink2;
}
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
error = ELOOP;
goto badlink2;
}
if (ndp->ni_pathlen > 1)
cp = uma_zalloc(namei_zone, M_WAITOK);
else
cp = cnp->cn_pnbuf;
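/*
 * Read the link text into cp with a single-segment SYSSPACE uio,
 * then splice it into the remaining path below.
 */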
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = NULL;
auio.uio_resid = MAXPATHLEN;
error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
if (error) {
badlink1:
if (ndp->ni_pathlen > 1)
uma_zfree(namei_zone, cp);
badlink2:
vrele(ndp->ni_dvp);
vput(ndp->ni_vp);
break;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
error = ENOENT;
goto badlink1;
}
if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto badlink1;
}
/*
* Adjust or replace path
*/
if (ndp->ni_pathlen > 1) {
NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
uma_zfree(namei_zone, cnp->cn_pnbuf);
cnp->cn_pnbuf = cp;
} else
cnp->cn_pnbuf[linklen] = '\0';
ndp->ni_pathlen += linklen;
/*
* Cleanup refs for next loop and check if root directory
* should replace current directory. Normally ni_dvp
* becomes the new base directory and is cleaned up when
* we loop. Explicitly null pointers after invalidation
* to clarify operation.
*/
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
if (cnp->cn_pnbuf[0] == '/') {
vrele(ndp->ni_dvp);
ndp->ni_dvp = ndp->ni_rootdir;
VREF(ndp->ni_dvp);
}
ndp->ni_startdir = ndp->ni_dvp;
ndp->ni_dvp = NULL;
}
if (!lockleaf)
cnp->cn_flags &= ~LOCKLEAF;
out:
if (error) {
nfsvno_relpathbuf(ndp);
ndp->ni_vp = NULL;
ndp->ni_dvp = NULL;
ndp->ni_startdir = NULL;
} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
ndp->ni_dvp = NULL;
}
out1:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Set up a pathname buffer and return a pointer to it and, optionally
* set a hash pointer.
*/
void
nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
{
struct componentname *cnp = &ndp->ni_cnd;
cnp->cn_flags |= (NOMACCHECK | HASBUF);
cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
if (hashpp != NULL)
*hashpp = NULL;
*bufpp = cnp->cn_pnbuf;
}
/*
* Release the above path buffer, if not released by nfsvno_namei().
*/
void
nfsvno_relpathbuf(struct nameidata *ndp)
{
if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
panic("nfsrelpath");
uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
ndp->ni_cnd.cn_flags &= ~HASBUF;
}
/*
* Readlink vnode op into an mbuf list.
*/
int
nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
{
struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
struct iovec *ivp = iv;
struct uio io, *uiop = &io;
struct mbuf *mp, *mp2 = NULL, *mp3 = NULL;
int i, len, tlen, error = 0;
len = 0;
i = 0;
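/*
 * Build an mbuf cluster chain, with a matching iovec entry per mbuf,
 * large enough to hold NFS_MAXPATHLEN bytes of link text.
 */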
while (len < NFS_MAXPATHLEN) {
NFSMGET(mp);
MCLGET(mp, M_WAITOK);
mp->m_len = M_SIZE(mp);
if (len == 0) {
mp3 = mp2 = mp;
} else {
mp2->m_next = mp;
mp2 = mp;
}
if ((len + mp->m_len) > NFS_MAXPATHLEN) {
mp->m_len = NFS_MAXPATHLEN - len;
len = NFS_MAXPATHLEN;
} else {
len += mp->m_len;
}
ivp->iov_base = mtod(mp, caddr_t);
ivp->iov_len = mp->m_len;
i++;
ivp++;
}
uiop->uio_iov = iv;
uiop->uio_iovcnt = i;
uiop->uio_offset = 0;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
error = VOP_READLINK(vp, uiop, cred);
if (error) {
m_freem(mp3);
*lenp = 0;
goto out;
}
if (uiop->uio_resid > 0) {
len -= uiop->uio_resid;
tlen = NFSM_RNDUP(len);
nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
}
*lenp = len;
*mpp = mp3;
*mpendp = mp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Read vnode op call into mbuf list.
*/
int
nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
{
struct mbuf *m;
int i;
struct iovec *iv;
struct iovec *iv2;
int error = 0, len, left, siz, tlen, ioflag = 0;
struct mbuf *m2 = NULL, *m3;
struct uio io, *uiop = &io;
struct nfsheur *nh;
len = left = NFSM_RNDUP(cnt);
m3 = NULL;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
i = 0;
while (left > 0) {
NFSMGET(m);
MCLGET(m, M_WAITOK);
m->m_len = 0;
siz = min(M_TRAILINGSPACE(m), left);
left -= siz;
i++;
if (m3)
m2->m_next = m;
else
m3 = m;
m2 = m;
}
MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
M_TEMP, M_WAITOK);
uiop->uio_iov = iv2 = iv;
m = m3;
left = len;
i = 0;
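/*
 * Point one iovec at the trailing cluster space of each mbuf and
 * bump m_len to account for the data VOP_READ() will place there.
 */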
while (left > 0) {
if (m == NULL)
panic("nfsvno_read iov");
siz = min(M_TRAILINGSPACE(m), left);
if (siz > 0) {
iv->iov_base = mtod(m, caddr_t) + m->m_len;
iv->iov_len = siz;
m->m_len += siz;
left -= siz;
iv++;
i++;
}
m = m->m_next;
}
uiop->uio_iovcnt = i;
uiop->uio_offset = off;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
nh = nfsrv_sequential_heuristic(uiop, vp);
ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
/* XXX KDM make this more systematic? */
nfsstatsv1.srvbytes[NFSV4OP_READ] += uiop->uio_resid;
error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
FREE((caddr_t)iv2, M_TEMP);
if (error) {
m_freem(m3);
*mpp = NULL;
goto out;
}
nh->nh_nextoff = uiop->uio_offset;
tlen = len - uiop->uio_resid;
cnt = cnt < tlen ? cnt : tlen;
tlen = NFSM_RNDUP(cnt);
if (tlen == 0) {
m_freem(m3);
m3 = NULL;
} else if (len != tlen || tlen != cnt)
nfsrv_adj(m3, len - tlen, tlen - cnt);
*mpp = m3;
*mpendp = m2;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Write vnode op from an mbuf list.
*/
int
nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable,
struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
{
struct iovec *ivp;
int i, len;
struct iovec *iv;
int ioflags, error;
struct uio io, *uiop = &io;
struct nfsheur *nh;
MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
M_WAITOK);
uiop->uio_iov = iv = ivp;
uiop->uio_iovcnt = cnt;
i = mtod(mp, caddr_t) + mp->m_len - cp;
len = retlen;
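/*
 * Walk the request mbuf chain, starting at cp within the first
 * mbuf, building one iovec per mbuf until retlen bytes are covered.
 */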
while (len > 0) {
if (mp == NULL)
panic("nfsvno_write");
if (i > 0) {
i = min(i, len);
ivp->iov_base = cp;
ivp->iov_len = i;
ivp++;
len -= i;
}
mp = mp->m_next;
if (mp) {
i = mp->m_len;
cp = mtod(mp, caddr_t);
}
}
if (stable == NFSWRITE_UNSTABLE)
ioflags = IO_NODELOCKED;
else
ioflags = (IO_SYNC | IO_NODELOCKED);
uiop->uio_resid = retlen;
uiop->uio_rw = UIO_WRITE;
uiop->uio_segflg = UIO_SYSSPACE;
NFSUIOPROC(uiop, p);
uiop->uio_offset = off;
nh = nfsrv_sequential_heuristic(uiop, vp);
ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
/* XXX KDM make this more systematic? */
nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid;
error = VOP_WRITE(vp, uiop, ioflags, cred);
if (error == 0)
nh->nh_nextoff = uiop->uio_offset;
FREE((caddr_t)iv, M_TEMP);
NFSEXITCODE(error);
return (error);
}
/*
* Common code for creating a regular file (plus special files for V2).
*/
int
nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp)
{
u_quad_t tempsize;
int error;
error = nd->nd_repstat;
if (!error && ndp->ni_vp == NULL) {
if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!error) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
error = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, nd->nd_cred);
if (error != 0) {
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
error = NFSERR_NOTSUPP;
}
}
}
/*
* NFS V2 Only. nfsrvd_mknod() does this for V3.
* (This implies we can just get out on an error.)
*/
} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
nvap->na_type == VFIFO) {
if (nvap->na_type == VCHR && rdev == 0xffffffff)
nvap->na_type = VFIFO;
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(nd->nd_cred,
PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
nvap->na_rdev = rdev;
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
if (error)
goto out;
} else {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = ENXIO;
goto out;
}
*vpp = ndp->ni_vp;
} else {
/*
* Handle cases where error is already set and/or
* the file exists.
* 1 - clean up the lookup
* 2 - iff !error and na_size set, truncate it
*/
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
*vpp = ndp->ni_vp;
if (ndp->ni_dvp == *vpp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (!error && nvap->na_size != VNOVAL) {
error = nfsvno_accchk(*vpp, VWRITE,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (!error) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
error = VOP_SETATTR(*vpp,
&nvap->na_vattr, nd->nd_cred);
}
}
if (error)
vput(*vpp);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Do a mknod vnode op.
*/
int
nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p)
{
int error = 0;
enum vtype vtyp;
vtyp = nvap->na_type;
/*
* Iff doesn't exist, create it.
*/
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = NFSERR_BADTYPE;
goto out;
}
if (vtyp == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
} else {
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
/*
* Since VOP_MKNOD returns the ni_vp, I can't
* see any reason to do the lookup.
*/
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Mkdir vnode op.
*/
int
nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp != NULL) {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
nfsvno_relpathbuf(ndp);
error = EEXIST;
goto out;
}
error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* symlink vnode op.
*/
int
nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr, pathcp);
vput(ndp->ni_dvp);
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
/*
* Although FreeBSD still had the lookup code in
* it for 7/current, there doesn't seem to be any
* point, since VOP_SYMLINK() returns the ni_vp.
* Just vput it for v2.
*/
if (!not_v2 && !error)
vput(ndp->ni_vp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Parse symbolic link arguments.
* This function has an ugly side effect. It will MALLOC() an area for
* the symlink and set iov_base to point to it, only if it succeeds.
* So, if it returns with uiop->uio_iov->iov_base != NULL, that must
* be FREE'd later.
*/
int
nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
struct thread *p, char **pathcpp, int *lenp)
{
u_int32_t *tl;
char *pathcp = NULL;
int error = 0, len;
struct nfsv2_sattr *sp;
*pathcpp = NULL;
*lenp = 0;
if ((nd->nd_flag & ND_NFSV3) &&
(error = nfsrv_sattr(nd, NULL, nvap, NULL, NULL, p)))
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len > NFS_MAXPATHLEN || len <= 0) {
error = EBADRPC;
goto nfsmout;
}
MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, pathcp, len);
if (error)
goto nfsmout;
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
}
*pathcpp = pathcp;
*lenp = len;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (pathcp)
free(pathcp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Remove a non-directory object.
*/
int
nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type == VDIR)
error = NFSERR_ISDIR;
else if (is_v4)
error = nfsrv_checkremove(vp, 1, p);
if (!error)
error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Remove a directory.
*/
int
nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (ndp->ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
out:
if (!error)
error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Rename vnode op.
*/
int
nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
{
struct vnode *fvp, *tvp, *tdvp;
int error = 0;
fvp = fromndp->ni_vp;
if (ndstat) {
vrele(fromndp->ni_dvp);
vrele(fvp);
error = ndstat;
goto out1;
}
tdvp = tondp->ni_dvp;
tvp = tondp->ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
goto out;
}
if (tvp->v_type == VDIR && tvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
/*
* A rename to '.' or '..' results in a prematurely
* unlocked vnode on FreeBSD5, so I'm just going to fail that
* here.
*/
if ((tondp->ni_cnd.cn_namelen == 1 &&
tondp->ni_cnd.cn_nameptr[0] == '.') ||
(tondp->ni_cnd.cn_namelen == 2 &&
tondp->ni_cnd.cn_nameptr[0] == '.' &&
tondp->ni_cnd.cn_nameptr[1] == '.')) {
error = EINVAL;
goto out;
}
}
if (fvp->v_type == VDIR && fvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp->v_mount != tdvp->v_mount) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp == tdvp) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
goto out;
}
if (fvp == tvp) {
/*
* If source and destination are the same, there is nothing to
* do. Set error to -1 to indicate this.
*/
error = -1;
goto out;
}
if (ndflag & ND_NFSV4) {
if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(fvp, 0, p);
NFSVOPUNLOCK(fvp, 0);
} else
error = EPERM;
if (tvp && !error)
error = nfsrv_checkremove(tvp, 1, p);
} else {
/*
* For NFSv2 and NFSv3, try to get rid of the delegation, so
* that the NFSv4 client won't be confused by the rename.
* Since nfsd_recalldelegation() can only be called on an
* unlocked vnode at this point and fvp is the file that will
* still exist after the rename, just do fvp.
*/
nfsd_recalldelegation(fvp, p);
}
out:
if (!error) {
error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
&fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
&tondp->ni_cnd);
} else {
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
vrele(fromndp->ni_dvp);
vrele(fvp);
if (error == -1)
error = 0;
}
vrele(tondp->ni_startdir);
nfsvno_relpathbuf(tondp);
out1:
vrele(fromndp->ni_startdir);
nfsvno_relpathbuf(fromndp);
NFSEXITCODE(error);
return (error);
}
/*
* Link vnode op.
*/
int
nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *xp;
int error = 0;
xp = ndp->ni_vp;
if (xp != NULL) {
error = EEXIST;
} else {
xp = ndp->ni_dvp;
if (vp->v_mount != xp->v_mount)
error = EXDEV;
}
if (!error) {
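/*
 * Relock vp exclusively for VOP_LINK() and make sure it was not
 * doomed (forcibly unmounted or recycled) while it was unlocked.
 */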
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) == 0)
error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
else
error = EPERM;
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
NFSVOPUNLOCK(vp, 0);
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vrele(ndp->ni_vp);
}
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Do the fsync() appropriate for the commit.
*/
int
nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
struct thread *td)
{
int error = 0;
/*
* RFC 1813 3.3.21: if count is 0, a flush from offset to the end of
* the file is done. At this time VOP_FSYNC does not accept offset and
* byte count parameters, so call VOP_FSYNC() on the whole file for now.
* The same is true for NFSv4: RFC 3530 Sec. 14.2.3.
* File systems that do not use the buffer cache (as indicated
* by MNTK_USES_BCACHE not being set) must use VOP_FSYNC().
*/
if (cnt == 0 || cnt > MAX_COMMIT_COUNT ||
(vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) {
/*
* Give up and do the whole thing
*/
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
} else {
/*
* Locate and synchronously write any buffers that fall
* into the requested range. Note: we are assuming that
* f_iosize is a power of 2.
*/
int iosize = vp->v_mount->mnt_stat.f_iosize;
int iomask = iosize - 1;
struct bufobj *bo;
daddr_t lblkno;
/*
* Align to iosize boundary, super-align to page boundary.
*/
if (off & iomask) {
cnt += off & iomask;
off &= ~(u_quad_t)iomask;
}
if (off & PAGE_MASK) {
cnt += off & PAGE_MASK;
off &= ~(u_quad_t)PAGE_MASK;
}
lblkno = off / iosize;
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, off, off + cnt,
OBJPC_SYNC);
VM_OBJECT_WUNLOCK(vp->v_object);
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
while (cnt > 0) {
struct buf *bp;
/*
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*
* gbincore() can return invalid buffers now so we
* have to check that bit as well (though B_DELWRI
* should not be set if B_INVAL is set there could be
* a race here since we haven't locked the buffer).
*/
if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) {
BO_LOCK(bo);
continue; /* retry */
}
if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
B_DELWRI) {
bremfree(bp);
bp->b_flags &= ~B_ASYNC;
bwrite(bp);
++nfs_commit_miss;
} else
BUF_UNLOCK(bp);
BO_LOCK(bo);
}
++nfs_commit_blks;
if (cnt < iosize)
break;
cnt -= iosize;
++lblkno;
}
BO_UNLOCK(bo);
}
NFSEXITCODE(error);
return (error);
}
/*
* Statfs vnode op.
*/
int
nfsvno_statfs(struct vnode *vp, struct statfs *sf)
{
int error;
error = VFS_STATFS(vp->v_mount, sf);
if (error == 0) {
/*
* Since NFS handles these values as unsigned on the
* wire, there is no way to represent negative values,
* so set them to 0. Without this, they will appear
* to be very large positive values for clients like
* Solaris10.
*/
if (sf->f_bavail < 0)
sf->f_bavail = 0;
if (sf->f_ffree < 0)
sf->f_ffree = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
* must handle nfsrv_opencheck() calls after any other access checks.
*/
void
nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp, struct vnode **vpp)
{
struct vnode *vp = NULL;
u_quad_t tempsize;
struct nfsexstuff nes;
if (ndp->ni_vp == NULL)
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, NULL, nd, p, nd->nd_repstat);
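/*
 * If the open state checks passed, either create the file (it does
 * not exist yet) or apply the size/attribute handling and a second
 * nfsrv_opencheck() against the existing vnode.
 */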
if (!nd->nd_repstat) {
if (ndp->ni_vp == NULL) {
vrele(ndp->ni_startdir);
nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!nd->nd_repstat) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, cred);
if (nd->nd_repstat != 0) {
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
nd->nd_repstat = NFSERR_NOTSUPP;
} else
NFSSETBIT_ATTRBIT(attrbitp,
NFSATTRBIT_TIMEACCESS);
} else {
nfsrv_fixattr(nd, ndp->ni_vp, nvap,
aclp, p, attrbitp, exp);
}
}
vp = ndp->ni_vp;
} else {
if (ndp->ni_startdir)
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vp = ndp->ni_vp;
if (create == NFSV4OPEN_CREATE) {
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
}
if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
if (ndp->ni_cnd.cn_flags & RDONLY)
NFSVNO_SETEXRDONLY(&nes);
else
NFSVNO_EXINIT(&nes);
nd->nd_repstat = nfsvno_accchk(vp,
VWRITE, cred, &nes, p,
NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
nd->nd_repstat = VOP_SETATTR(vp,
&nvap->na_vattr, cred);
}
} else if (vp->v_type == VREG) {
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
}
}
} else {
if (ndp->ni_cnd.cn_flags & HASBUF)
nfsvno_relpathbuf(ndp);
if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
vrele(ndp->ni_startdir);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vput(ndp->ni_vp);
}
}
*vpp = vp;
NFSEXITCODE2(0, nd);
}
/*
* Updates the file rev and sets the mtime and ctime
* to the current clock time, returning the va_filerev and va_Xtime
* values.
* Return ESTALE to indicate the vnode is VI_DOOMED.
*/
int
nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
struct ucred *cred, struct thread *p)
{
struct vattr va;
VATTR_NULL(&va);
vfs_timestamp(&va.va_mtime);
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0)
return (ESTALE);
}
(void) VOP_SETATTR(vp, &va, cred);
(void) nfsvno_getattr(vp, nvap, cred, p, 1);
return (0);
}
/*
* Glue routine to nfsv4_fillattr().
*/
int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int error;
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
mounted_on_fileno);
NFSEXITCODE2(0, nd);
return (error);
}
/* Since the Readdir vnode ops vary, put the entire functions in here. */
/*
* nfs readdir service
* - mallocs what it thinks is enough to read
* count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
* - calls VOP_READDIR()
* - loops around building the reply
* if the output generated exceeds count break out of loop
* The NFSM_CLGET macro is used here so that the reply will be packed
* tightly in mbuf clusters.
* - it trims out records with d_fileno == 0
* this doesn't matter for Unix clients, but they might confuse clients
* for other OSes.
* - it trims out records with d_type == DT_WHT
* these cannot be seen through NFS (unless we extend the protocol)
* The alternate call nfsrvd_readdirplus() does lookups as well.
* PS: The NFS protocol spec does not clarify what the "count" byte
* argument is a count of: just name strings and file ids, the
* entire reply RPC, or something else.
* I tried just file name and id sizes and it confused the Sun client,
* so I am using the full RPC size now. The "paranoia.." comment refers
* to including the status longwords that are not a part of the dir.
* "entry" structures, but are in the RPC.
*/
int
nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct nfsvattr at;
int nlen, error = 0, getret = 1;
int siz, cnt, fullsiz, eofflag, ncookies;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
struct uio io;
struct iovec iv;
int is_ufs;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
off = fxdr_unsigned(u_quad_t, *tl++);
} else {
NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
}
toff = off;
cnt = fxdr_unsigned(int, *tl);
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
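/* Round the VOP_READDIR() request size up to a multiple of DIRBLKSIZ. */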
siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
fullsiz = siz;
if (nd->nd_flag & ND_NFSV3) {
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred,
p, 1);
#if 0
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
if (!nd->nd_repstat && toff && verf != at.na_filerev)
nd->nd_repstat = NFSERR_BAD_COOKIE;
#endif
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (nd->nd_repstat == 0 && cnt == 0) {
if (nd->nd_flag & ND_NFSV2)
/* NFSv2 does not have NFSERR_TOOSMALL */
nd->nd_repstat = EPERM;
else
nd->nd_repstat = NFSERR_TOOSMALL;
}
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (nd->nd_flag & ND_NFSV3) {
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat)
nd->nd_repstat = getret;
}
/*
* Handles the failed cases. nd->nd_repstat == 0 past here.
*/
if (nd->nd_repstat) {
vput(vp);
free((caddr_t)rbuf, M_TEMP);
if (cookies)
free((caddr_t)cookies, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV2) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
} else {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
}
*tl++ = newnfs_false;
*tl = newnfs_true;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
vput(vp);
/*
* dirlen is the size of the reply, including all XDR and must
* not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
* if the XDR should be included in "count", but to be safe, we do.
* (Include the two booleans at the end of the reply in dirlen now.)
*/
if (nd->nd_flag & ND_NFSV3) {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
} else {
dirlen = 2 * NFSX_UNSIGNED;
}
/* Loop through the records and build reply */
while (cpos < cend && ncookies > 0) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN) {
if (nd->nd_flag & ND_NFSV3)
dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
else
dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
if (dirlen > cnt) {
eofflag = 0;
break;
}
/*
* Build the directory record xdr from
* the dirent entry.
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
} else {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
}
*tl = txdr_unsigned(dp->d_fileno);
(void) nfsm_strtom(nd, dp->d_name, nlen);
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
} else
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(*cookiep);
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos < cend)
eofflag = 0;
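/* Terminate the entry list (value-follows == FALSE) and add the eof flag. */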
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Readdirplus for V3 and Readdir for V4.
*/
int
nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct vnode *nvp;
fhandle_t nfh;
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
int nlen, r, error = 0, getret = 1, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
nfsattrbit_t attrbits, rderrbits, savbits;
struct uio io;
struct iovec iv;
struct componentname cn;
int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls;
struct mount *mp, *new_mp;
uint64_t mounted_on_fileno;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
toff = off;
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
siz = fxdr_unsigned(int, *tl++);
cnt = fxdr_unsigned(int, *tl);
/*
* Use the server's maximum data transfer size as the upper bound
* on reply datalen.
*/
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
/*
* siz is a "hint" of how much directory information (name, fileid,
* cookie) should be in the reply. At least one client "hints" 0,
* so I set it to cnt for that case. I also round it up to the
* next multiple of DIRBLKSIZ.
*/
if (siz <= 0)
siz = cnt;
siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
if (nd->nd_flag & ND_NFSV4) {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
if (error)
goto nfsmout;
NFSSET_ATTRBIT(&savbits, &attrbits);
NFSCLRNOTFILLABLE_ATTRBIT(&attrbits);
NFSZERO_ATTRBIT(&rderrbits);
NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
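/*
 * savbits keeps the client's original request, attrbits is trimmed
 * to the attributes that can actually be filled in, and rderrbits
 * carries only rdattr_error for entries whose attributes cannot be
 * retrieved.
 */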
} else {
NFSZERO_ATTRBIT(&attrbits);
}
fullsiz = siz;
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat) {
if (off && verf != at.na_filerev) {
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
#if 0
if (nd->nd_flag & ND_NFSV4) {
nd->nd_repstat = NFSERR_NOTSAME;
} else {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
#endif
} else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (!nd->nd_repstat && cnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0;
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (!nd->nd_repstat)
nd->nd_repstat = getret;
if (nd->nd_repstat) {
vput(vp);
if (cookies)
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
*tl++ = newnfs_false;
*tl = newnfs_true;
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) ||
((nd->nd_flag & ND_NFSV4) &&
((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
/*
* Busy the file system so that the mount point won't go away
* and, as such, VFS_VGET() can be used safely.
*/
mp = vp->v_mount;
vfs_ref(mp);
NFSVOPUNLOCK(vp, 0);
nd->nd_repstat = vfs_busy(mp, 0);
vfs_rel(mp);
if (nd->nd_repstat != 0) {
vrele(vp);
free(cookies, M_TEMP);
free(rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* Check to see if entries in this directory can be safely acquired
* via VFS_VGET() or if a switch to VOP_LOOKUP() is required.
* ZFS snapshot directories need VOP_LOOKUP(), so that any
* automount of the snapshot directory that is required will
* be done.
* This needs to be done here for NFSv4, since NFSv4 never does
* a VFS_VGET() for "." or "..".
*/
if (is_zfs == 1) {
r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp);
if (r == EOPNOTSUPP) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags = LK_SHARED | LK_RETRY;
cn.cn_cred = nd->nd_cred;
cn.cn_thread = p;
} else if (r == 0)
vput(nvp);
}
/*
* Save this position, in case there is an error before one entry
* is created.
*/
mb0 = nd->nd_mb;
bpos0 = nd->nd_bpos;
/*
* Fill in the first part of the reply.
* dirlen is the reply length in bytes and cannot exceed cnt.
* (Include the two booleans at the end of the reply in dirlen now,
* so we recognize when we have exceeded cnt.)
*/
if (nd->nd_flag & ND_NFSV3) {
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
nfsrv_postopattr(nd, getret, &at);
} else {
dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
txdr_hyper(at.na_filerev, tl);
/*
* Save this position, in case there is an empty reply needed.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/* Loop through the records and build reply */
entrycnt = 0;
while (cpos < cend && ncookies > 0 && dirlen < cnt) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN &&
((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
(nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
|| (nlen == 1 && dp->d_name[0] != '.'))) {
/*
* Save the current position in the reply, in case
* this entry exceeds cnt.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/*
* For readdir_and_lookup get the vnode using
* the file number.
*/
nvp = NULL;
refp = NULL;
r = 0;
at_root = 0;
needs_unbusy = 0;
new_mp = mp;
mounted_on_fileno = (uint64_t)dp->d_fileno;
if ((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&savbits)) {
if (nd->nd_flag & ND_NFSV4)
refp = nfsv4root_getreferral(NULL,
vp, dp->d_fileno);
if (refp == NULL) {
if (usevget)
r = VFS_VGET(mp, dp->d_fileno,
LK_SHARED, &nvp);
else
r = EOPNOTSUPP;
if (r == EOPNOTSUPP) {
if (usevget) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags =
LK_SHARED |
LK_RETRY;
cn.cn_cred =
nd->nd_cred;
cn.cn_thread = p;
}
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = nlen;
cn.cn_flags = ISLASTCN |
NOFOLLOW | LOCKLEAF;
if (nlen == 2 &&
dp->d_name[0] == '.' &&
dp->d_name[1] == '.')
cn.cn_flags |=
ISDOTDOT;
if (NFSVOPLOCK(vp, LK_SHARED)
!= 0) {
nd->nd_repstat = EPERM;
break;
}
if ((vp->v_vflag & VV_ROOT) != 0
&& (cn.cn_flags & ISDOTDOT)
!= 0) {
vref(vp);
nvp = vp;
r = 0;
} else {
r = VOP_LOOKUP(vp, &nvp,
&cn);
if (vp != nvp)
NFSVOPUNLOCK(vp,
0);
}
}
/*
* For NFSv4, check to see if nvp is
* a mount point and get the mount
* point vnode, as required.
*/
if (r == 0 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
nvp->v_mountedhere != NULL) {
new_mp = nvp->v_mountedhere;
r = vfs_busy(new_mp, 0);
vput(nvp);
nvp = NULL;
if (r == 0) {
r = VFS_ROOT(new_mp,
LK_SHARED, &nvp);
needs_unbusy = 1;
if (r == 0)
at_root = 1;
}
}
}
if (!r) {
if (refp == NULL &&
((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&attrbits))) {
r = nfsvno_getfh(nvp, &nfh, p);
if (!r)
r = nfsvno_getattr(nvp, nvap,
nd->nd_cred, p, 1);
if (r == 0 && is_zfs == 1 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
vp->v_mount != nvp->v_mount) {
/*
* For a ZFS snapshot, there is a
* pseudo mount that does not set
* v_mountedhere, so it needs to
* be detected via a different
* mount structure.
*/
at_root = 1;
if (new_mp == mp)
new_mp = nvp->v_mount;
}
}
} else {
nvp = NULL;
}
if (r) {
if (!NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_RDATTRERROR)) {
if (nvp != NULL)
vput(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
nd->nd_repstat = r;
break;
}
}
}
/*
* Build the directory record xdr
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(dp->d_fileno);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
nfsrv_postopattr(nd, 0, nvap);
dirlen += nfsm_fhtom(nd, (u_int8_t *)&nfh, 0, 1);
dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
if (nvp != NULL)
vput(nvp);
} else {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
NFSVOPUNLOCK(nvp, 0);
} else
supports_nfsv4acls = 0;
if (refp != NULL) {
dirlen += nfsrv_putreferralattr(nd,
&savbits, refp, 0,
&nd->nd_repstat);
if (nd->nd_repstat) {
if (nvp != NULL)
vrele(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
break;
}
} else if (r) {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
}
if (nvp != NULL)
vrele(nvp);
dirlen += (3 * NFSX_UNSIGNED);
}
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
if (dirlen <= cnt)
entrycnt++;
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
vrele(vp);
vfs_unbusy(mp);
/*
* If dirlen > cnt, we must strip off the last entry. If that
* results in an empty reply, report NFSERR_TOOSMALL.
*/
if (dirlen > cnt || nd->nd_repstat) {
if (!nd->nd_repstat && entrycnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (nd->nd_repstat) {
newnfs_trimtrailing(nd, mb0, bpos0);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
} else
newnfs_trimtrailing(nd, mb1, bpos1);
eofflag = 0;
} else if (cpos < cend)
eofflag = 0;
if (!nd->nd_repstat) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
}
FREE((caddr_t)cookies, M_TEMP);
FREE((caddr_t)rbuf, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Get the settable attributes out of the mbuf list.
* (Return 0 or EBADRPC)
*/
int
nfsrv_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
struct nfsv2_sattr *sp;
int error = 0, toclient = 0;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
/*
* Some old clients didn't fill in the high order 16 bits.
* --> check the low order 2 bytes for 0xffff
*/
if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
nvap->na_mode = nfstov_mode(sp->sa_mode);
if (sp->sa_uid != newnfs_xdrneg1)
nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
if (sp->sa_gid != newnfs_xdrneg1)
nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
if (sp->sa_size != newnfs_xdrneg1)
nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
#ifdef notyet
fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
#else
nvap->na_atime.tv_sec =
fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
nvap->na_atime.tv_nsec = 0;
#endif
}
if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
break;
case ND_NFSV3:
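/*
 * Each NFSv3 sattr3 field is preceded by a boolean (or a time
 * discriminant) saying whether the client supplied a value.
 */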
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_uid = fxdr_unsigned(uid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_gid = fxdr_unsigned(gid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
nvap->na_size = fxdr_hyper(tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_atime);
toclient = 1;
break;
case NFSV3SATTRTIME_TOSERVER:
vfs_timestamp(&nvap->na_atime);
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
break;
case NFSV3SATTRTIME_TOSERVER:
vfs_timestamp(&nvap->na_mtime);
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
}
break;
case ND_NFSV4:
error = nfsv4_sattr(nd, vp, nvap, attrbitp, aclp, p);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Handle the settable attributes for V4.
* Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
*/
int
nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
int attrsum = 0;
int i, j;
int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
int toclient = 0;
u_char *cp, namestr[NFSV4_SMALLSTR + 1];
uid_t uid;
gid_t gid;
error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
/*
* Loop around getting the settable attributes. If an unsupported
* one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return.
*/
if (retnotsup) {
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
bitpos = NFSATTRBIT_MAX;
} else {
bitpos = 0;
}
for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(attrbitp, bitpos))
switch (bitpos) {
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
if (vp != NULL && vp->v_type != VREG) {
error = (vp->v_type == VDIR) ? NFSERR_ISDIR :
NFSERR_INVAL;
goto nfsmout;
}
nvap->na_size = fxdr_hyper(tl);
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_ACL:
error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
p);
if (error)
goto nfsmout;
if (aceerr && !nd->nd_repstat)
nd->nd_repstat = aceerr;
attrsum += aclsize;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtouid(nd, cp, j, &uid,
p);
if (!nd->nd_repstat)
nvap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtogid(nd, cp, j, &gid,
p);
if (!nd->nd_repstat)
nvap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_atime);
toclient = 1;
attrsum += NFSX_V4TIME;
} else {
vfs_timestamp(&nvap->na_atime);
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
attrsum += NFSX_V4TIME;
} else {
vfs_timestamp(&nvap->na_mtime);
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
default:
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
/*
* set bitpos so we drop out of the loop.
*/
bitpos = NFSATTRBIT_MAX;
break;
}
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check/setup export credentials.
*/
int
nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
struct ucred *credanon)
{
int error = 0;
/*
* Check/setup credentials.
*/
if (nd->nd_flag & ND_GSS)
exp->nes_exflag &= ~MNT_EXPORTANON;
/*
* Check to see if the operation is allowed for this security flavor.
* RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to
* AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
* Also, allow Secinfo, so that it can acquire the correct flavor(s).
*/
if (nfsvno_testexp(nd, exp) &&
nd->nd_procnum != NFSV4OP_SECINFO &&
nd->nd_procnum != NFSPROC_FSINFO) {
if (nd->nd_flag & ND_NFSV4)
error = NFSERR_WRONGSEC;
else
error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
goto out;
}
/*
* Check to see if the file system is exported V4 only.
*/
if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
error = NFSERR_PROGNOTV4;
goto out;
}
/*
* Now, map the user credentials.
* (Note that ND_AUTHNONE will only be set for an NFSv3
* Fsinfo RPC. If set for anything else, this code might need
* to change.)
*/
if (NFSVNO_EXPORTED(exp)) {
if (((nd->nd_flag & ND_GSS) == 0 && nd->nd_cred->cr_uid == 0) ||
NFSVNO_EXPORTANON(exp) ||
(nd->nd_flag & ND_AUTHNONE) != 0) {
nd->nd_cred->cr_uid = credanon->cr_uid;
nd->nd_cred->cr_gid = credanon->cr_gid;
crsetgroups(nd->nd_cred, credanon->cr_ngroups,
credanon->cr_groups);
} else if ((nd->nd_flag & ND_GSS) == 0) {
/*
* If using AUTH_SYS, call nfsrv_getgrpscred() to see
* if there is a replacement credential with a group
* list set up by "nfsuserd -manage-gids".
* If there is no replacement, nfsrv_getgrpscred()
* simply returns its argument.
*/
nd->nd_cred = nfsrv_getgrpscred(nd->nd_cred);
}
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
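/*
 * Illustrative user-space sketch (not part of the server sources): the
 * credential mapping in nfsd_excred() above squashes three kinds of request
 * to the export's anonymous credential: AUTH_SYS requests from uid 0,
 * requests to exports that map everyone to the anonymous user, and the
 * AUTH_NONE case. The structs and field names below are invented for the
 * example and only mirror that decision.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdbool.h>
#include <stdio.h>

struct fake_req {
	unsigned int uid;
	bool uses_gss;		/* authenticated with RPCSEC_GSS */
	bool auth_none;		/* AUTH_NONE (NFSv3 Fsinfo only) */
};

struct fake_export {
	bool map_all_to_anon;	/* export maps everyone to the anon user */
};

/* Should this request run as the anonymous credential? */
static bool
use_anon_cred(const struct fake_req *rq, const struct fake_export *ex)
{

	return ((!rq->uses_gss && rq->uid == 0) ||
	    ex->map_all_to_anon || rq->auth_none);
}

int
main(void)
{
	struct fake_req rq = { .uid = 0, .uses_gss = false,
	    .auth_none = false };
	struct fake_export ex = { .map_all_to_anon = false };

	printf("root over AUTH_SYS squashed: %d\n", use_anon_cred(&rq, &ex));
	return (0);
}
#endif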
/*
* Check exports.
*/
int
nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
NFSEXITCODE(error);
return (error);
}
/*
* Get a vnode for a file handle and export stuff.
*/
int
nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
int lktype, struct vnode **vpp, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
*credp = NULL;
exp->nes_numsecflavor = 0;
error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp);
if (error != 0)
/* Make sure the server replies ESTALE to the client. */
error = ESTALE;
if (nam && !error) {
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
} else {
vput(*vpp);
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
}
NFSEXITCODE(error);
return (error);
}
/*
* nfsd_fhtovp() - convert a fh to a vnode ptr
* - look up fsid in mount list (if not found ret error)
* - get vp and export rights by calling nfsvno_fhtovp()
* - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
* for AUTH_SYS
* - if mpp != NULL, return the mount point so that it can
* be used for vn_finished_write() by the caller
*/
void
nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
struct vnode **vpp, struct nfsexstuff *exp,
struct mount **mpp, int startwrite, struct thread *p)
{
struct mount *mp;
struct ucred *credanon;
fhandle_t *fhp;
fhp = (fhandle_t *)nfp->nfsrvfh_data;
/*
* Check for the special case of the nfsv4root_fh.
*/
mp = vfs_busyfs(&fhp->fh_fsid);
if (mpp != NULL)
*mpp = mp;
if (mp == NULL) {
*vpp = NULL;
nd->nd_repstat = ESTALE;
goto out;
}
if (startwrite) {
vn_start_write(NULL, mpp, V_WAIT);
if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp)))
lktype = LK_EXCLUSIVE;
}
nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
&credanon);
vfs_unbusy(mp);
/*
* For NFSv4 without a pseudo root fs, unexported file handles
* can be returned, so that Lookup works everywhere.
*/
if (!nd->nd_repstat && exp->nes_exflag == 0 &&
!(nd->nd_flag & ND_NFSV4)) {
vput(*vpp);
nd->nd_repstat = EACCES;
}
/*
* Personally, I've never seen any point in requiring a
* reserved port#, since only in the rare case where the
* clients are all boxes with secure system privileges,
* does it provide any enhanced security, but... some people
* believe it to be useful and keep putting this code back in.
* (There is also some "security checker" out there that
* complains if the nfs server doesn't enforce this.)
* However, note the following:
* RFC3530 (NFSv4) specifies that a reserved port# not be
* required.
* RFC2623 recommends that, if a reserved port# is checked for,
* that there be a way to turn that off--> ifdef'd.
*/
#ifdef NFS_REQRSVPORT
if (!nd->nd_repstat) {
struct sockaddr_in *saddr;
struct sockaddr_in6 *saddr6;
saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
if (!(nd->nd_flag & ND_NFSV4) &&
((saddr->sin_family == AF_INET &&
ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
(saddr6->sin6_family == AF_INET6 &&
ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
vput(*vpp);
nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
}
}
#endif /* NFS_REQRSVPORT */
/*
* Check/setup credentials.
*/
if (!nd->nd_repstat) {
nd->nd_saveduid = nd->nd_cred->cr_uid;
nd->nd_repstat = nfsd_excred(nd, exp, credanon);
if (nd->nd_repstat)
vput(*vpp);
}
if (credanon != NULL)
crfree(credanon);
if (nd->nd_repstat) {
if (startwrite)
vn_finished_write(mp);
*vpp = NULL;
if (mpp != NULL)
*mpp = NULL;
}
out:
NFSEXITCODE2(0, nd);
}
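/*
 * Illustrative user-space sketch (not part of the server sources): the
 * NFS_REQRSVPORT block above rejects non-NFSv4 requests whose source port
 * is not reserved (>= IPPORT_RESERVED, i.e. >= 1024). port_too_weak() is a
 * made-up name; IPPORT_RESERVED and the sockaddr handling are standard.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

/* Non-zero when the client did not bind a reserved (< 1024) port. */
static int
port_too_weak(const struct sockaddr_in *sin)
{

	return (sin->sin_family == AF_INET &&
	    ntohs(sin->sin_port) >= IPPORT_RESERVED);
}

int
main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET };

	sin.sin_port = htons(1023);
	printf("reserved port accepted: %d\n", !port_too_weak(&sin));
	sin.sin_port = htons(4096);
	printf("unprivileged port rejected: %d\n", port_too_weak(&sin));
	return (0);
}
#endif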
/*
* glue for fp.
*/
static int
fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
{
struct filedesc *fdp;
struct file *fp;
int error = 0;
fdp = p->td_proc->p_fd;
if (fd < 0 || fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd].fde_file) == NULL) {
error = EBADF;
goto out;
}
*fpp = fp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Called from nfssvc() to update the exports list. Just call
* vfs_export(). This has to be done, since the v4 root fake fs isn't
* in the mount list.
*/
int
nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
{
struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
int error = 0;
struct nameidata nd;
fhandle_t fh;
error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
nfs_rootfhset = 0;
else if (error == 0) {
if (nfsexargp->fspec == NULL) {
error = EPERM;
goto out;
}
/*
* If fspec != NULL, this is the v4root path.
*/
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE,
nfsexargp->fspec, p);
if ((error = namei(&nd)) != 0)
goto out;
error = nfsvno_getfh(nd.ni_vp, &fh, p);
vrele(nd.ni_vp);
if (!error) {
nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
NFSBCOPY((caddr_t)&fh,
nfs_rootfh.nfsrvfh_data,
sizeof (fhandle_t));
nfs_rootfhset = 1;
}
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function needs to test to see if the system is near its limit
* for memory allocation via malloc() or mget() and return True iff
* either of these resources is near its limit.
* XXX (For now, this is just a stub.)
*/
int nfsrv_testmalloclimit = 0;
int
nfsrv_mallocmget_limit(void)
{
static int printmesg = 0;
static int testval = 1;
if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) {
if ((printmesg++ % 100) == 0)
printf("nfsd: malloc/mget near limit\n");
return (1);
}
return (0);
}
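/*
 * Illustrative user-space sketch (not part of the server sources): the stub
 * above shows a sampled-logging pattern, where only every Nth call even
 * evaluates the condition and only every Mth positive result prints, so a
 * hot path cannot flood the console. The counters and message below are
 * invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdio.h>

static int
near_limit_sampled(void)
{
	static unsigned int testval, printmesg;

	if ((++testval % 1000) != 0)
		return (0);
	if ((printmesg++ % 100) == 0)
		printf("resource near limit\n");
	return (1);
}

int
main(void)
{
	int i, hits = 0;

	for (i = 0; i < 5000; i++)
		hits += near_limit_sampled();
	printf("sampled hits: %d\n", hits);	/* 5 of 5000 calls */
	return (0);
}
#endif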
/*
* BSD specific initialization of a mount point.
*/
void
nfsd_mntinit(void)
{
static int inited = 0;
if (inited)
return;
inited = 1;
nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
nfsv4root_mnt.mnt_export = NULL;
TAILQ_INIT(&nfsv4root_opt);
TAILQ_INIT(&nfsv4root_newopt);
nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
nfsv4root_mnt.mnt_nvnodelistsize = 0;
nfsv4root_mnt.mnt_activevnodelistsize = 0;
}
/*
* Get a vnode for a file handle, without checking exports, etc.
*/
struct vnode *
nfsvno_getvp(fhandle_t *fhp)
{
struct mount *mp;
struct vnode *vp;
int error;
mp = vfs_busyfs(&fhp->fh_fsid);
if (mp == NULL)
return (NULL);
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
return (NULL);
return (vp);
}
/*
* Do a local VOP_ADVLOCK().
*/
int
nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
u_int64_t end, struct thread *td)
{
int error = 0;
struct flock fl;
u_int64_t tlen;
if (nfsrv_dolocallocks == 0)
goto out;
ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked");
fl.l_whence = SEEK_SET;
fl.l_type = ftype;
fl.l_start = (off_t)first;
if (end == NFS64BITSSET) {
fl.l_len = 0;
} else {
tlen = end - first;
fl.l_len = (off_t)tlen;
}
/*
* For FreeBSD8, the l_pid and l_sysid must be set to the same
* values for all calls, so that all locks will be held by the
* nfsd server. (The nfsd server handles conflicts between the
* various clients.)
* Since an NFSv4 lockowner is a ClientID plus an array of up to 1024
* bytes, so it can't be put in l_sysid.
*/
if (nfsv4_sysid == 0)
nfsv4_sysid = nlm_acquire_next_sysid();
fl.l_pid = (pid_t)0;
fl.l_sysid = (int)nfsv4_sysid;
if (ftype == F_UNLCK)
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
(F_POSIX | F_REMOTE));
else
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
(F_POSIX | F_REMOTE));
out:
NFSEXITCODE(error);
return (error);
}
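/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsvno_advlock() above converts an NFS byte range into struct flock,
 * treating an end value of NFS64BITSSET as "lock to EOF" (l_len = 0) and
 * otherwise using end - first as the length. RANGE_TO_EOF and
 * range_to_flock() are made-up names standing in for that convention.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

#define RANGE_TO_EOF	UINT64_MAX	/* stand-in for NFS64BITSSET */

static void
range_to_flock(uint64_t first, uint64_t end, struct flock *fl)
{

	fl->l_whence = SEEK_SET;
	fl->l_start = (off_t)first;
	fl->l_len = (end == RANGE_TO_EOF) ? 0 : (off_t)(end - first);
}

int
main(void)
{
	struct flock fl;

	range_to_flock(100, RANGE_TO_EOF, &fl);
	printf("lock to EOF: start=%jd len=%jd\n",
	    (intmax_t)fl.l_start, (intmax_t)fl.l_len);
	return (0);
}
#endif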
/*
* Check the nfsv4 root exports.
*/
int
nfsvno_v4rootexport(struct nfsrv_descript *nd)
{
struct ucred *credanon;
int exflags, error = 0, numsecflavor, *secflavors, i;
error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
&credanon, &numsecflavor, &secflavors);
if (error) {
error = NFSERR_PROGUNAVAIL;
goto out;
}
if (credanon != NULL)
crfree(credanon);
for (i = 0; i < numsecflavor; i++) {
if (secflavors[i] == AUTH_SYS)
nd->nd_flag |= ND_EXAUTHSYS;
else if (secflavors[i] == RPCSEC_GSS_KRB5)
nd->nd_flag |= ND_EXGSS;
else if (secflavors[i] == RPCSEC_GSS_KRB5I)
nd->nd_flag |= ND_EXGSSINTEGRITY;
else if (secflavors[i] == RPCSEC_GSS_KRB5P)
nd->nd_flag |= ND_EXGSSPRIVACY;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Nfs server pseudo system call for the nfsd's
*/
/*
* MPSAFE
*/
static int
nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
{
struct file *fp;
struct nfsd_addsock_args sockarg;
struct nfsd_nfsd_args nfsdarg;
cap_rights_t rights;
int error;
if (uap->flag & NFSSVC_NFSDADDSOCK) {
error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
if (error)
goto out;
/*
* Since we don't know what rights might be required,
* pretend that we need them all. It is better to be too
* careful than too reckless.
*/
error = fget(td, sockarg.sock,
cap_rights_init(&rights, CAP_SOCK_SERVER), &fp);
if (error != 0)
goto out;
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, td);
error = EPERM;
goto out;
}
error = nfsrvd_addsock(fp);
fdrop(fp, td);
} else if (uap->flag & NFSSVC_NFSDNFSD) {
if (uap->argp == NULL) {
error = EINVAL;
goto out;
}
error = copyin(uap->argp, (caddr_t)&nfsdarg,
sizeof (nfsdarg));
if (error)
goto out;
error = nfsrvd_nfsd(td, &nfsdarg);
} else {
error = nfssvc_srvcall(td, uap, td->td_ucred);
}
out:
NFSEXITCODE(error);
return (error);
}
static int
nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
{
struct nfsex_args export;
struct file *fp = NULL;
int stablefd, len;
struct nfsd_clid adminrevoke;
struct nfsd_dumplist dumplist;
struct nfsd_dumpclients *dumpclients;
struct nfsd_dumplocklist dumplocklist;
struct nfsd_dumplocks *dumplocks;
struct nameidata nd;
vnode_t vp;
int error = EINVAL, igotlock;
struct proc *procp;
static int suspend_nfsd = 0;
if (uap->flag & NFSSVC_PUBLICFH) {
NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
sizeof (fhandle_t));
error = copyin(uap->argp,
&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
if (!error)
nfs_pubfhset = 1;
} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
error = copyin(uap->argp,(caddr_t)&export,
sizeof (struct nfsex_args));
if (!error)
error = nfsrv_v4rootexport(&export, cred, p);
} else if (uap->flag & NFSSVC_NOPUBLICFH) {
nfs_pubfhset = 0;
error = 0;
} else if (uap->flag & NFSSVC_STABLERESTART) {
error = copyin(uap->argp, (caddr_t)&stablefd,
sizeof (int));
if (!error)
error = fp_getfvp(p, stablefd, &fp, &vp);
if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
error = EBADF;
if (!error && newnfs_numnfsd != 0)
error = EPERM;
if (!error) {
nfsrv_stablefirst.nsf_fp = fp;
nfsrv_setupstable(p);
}
} else if (uap->flag & NFSSVC_ADMINREVOKE) {
error = copyin(uap->argp, (caddr_t)&adminrevoke,
sizeof (struct nfsd_clid));
if (!error)
error = nfsrv_adminrevoke(&adminrevoke, p);
} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
error = copyin(uap->argp, (caddr_t)&dumplist,
sizeof (struct nfsd_dumplist));
if (!error && (dumplist.ndl_size < 1 ||
dumplist.ndl_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error) {
len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
dumpclients = (struct nfsd_dumpclients *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
error = copyout(dumpclients,
CAST_USER_ADDR_T(dumplist.ndl_list), len);
free((caddr_t)dumpclients, M_TEMP);
}
} else if (uap->flag & NFSSVC_DUMPLOCKS) {
error = copyin(uap->argp, (caddr_t)&dumplocklist,
sizeof (struct nfsd_dumplocklist));
if (!error && (dumplocklist.ndllck_size < 1 ||
dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error)
error = nfsrv_lookupfilename(&nd,
dumplocklist.ndllck_fname, p);
if (!error) {
len = sizeof (struct nfsd_dumplocks) *
dumplocklist.ndllck_size;
dumplocks = (struct nfsd_dumplocks *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumplocks(nd.ni_vp, dumplocks,
dumplocklist.ndllck_size, p);
vput(nd.ni_vp);
error = copyout(dumplocks,
CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
free((caddr_t)dumplocks, M_TEMP);
}
} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
procp = p->td_proc;
PROC_LOCK(procp);
nfsd_master_pid = procp->p_pid;
bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
nfsd_master_start = procp->p_stats->p_start;
nfsd_master_proc = procp;
PROC_UNLOCK(procp);
} else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) {
NFSLOCKV4ROOTMUTEX();
if (suspend_nfsd == 0) {
/* Lock out all nfsd threads */
do {
igotlock = nfsv4_lock(&nfsd_suspend_lock, 1,
NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (igotlock == 0 && suspend_nfsd == 0);
suspend_nfsd = 1;
}
NFSUNLOCKV4ROOTMUTEX();
error = 0;
} else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) {
NFSLOCKV4ROOTMUTEX();
if (suspend_nfsd != 0) {
nfsv4_unlock(&nfsd_suspend_lock, 0);
suspend_nfsd = 0;
}
NFSUNLOCKV4ROOTMUTEX();
error = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Check exports.
* Returns 0 if ok, 1 otherwise.
*/
int
nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
{
int i;
/*
* This seems odd, but allow the case where the security flavor
* list is empty. This happens when NFSv4 is traversing non-exported
* file systems. Exported file systems should always have a non-empty
* security flavor list.
*/
if (exp->nes_numsecflavor == 0)
return (0);
for (i = 0; i < exp->nes_numsecflavor; i++) {
/*
* The tests for privacy and integrity must be first,
* since ND_GSS is set for everything but AUTH_SYS.
*/
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
(nd->nd_flag & ND_GSSPRIVACY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
(nd->nd_flag & ND_GSSINTEGRITY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
(nd->nd_flag & ND_GSS))
return (0);
if (exp->nes_secflavors[i] == AUTH_SYS &&
(nd->nd_flag & ND_GSS) == 0)
return (0);
}
return (1);
}
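/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsvno_testexp() above accepts a request if any flavor on the export's
 * list matches what the request used, testing the stronger GSS variants
 * first because the plain GSS flag is set for every Kerberos request. The
 * enum, flag bits and function below are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdio.h>

enum flavor { X_AUTH_SYS, X_KRB5, X_KRB5I, X_KRB5P };
#define F_GSS		0x1	/* any RPCSEC_GSS */
#define F_GSSINTEGRITY	0x2
#define F_GSSPRIVACY	0x4

/* Return 0 when an allowed flavor matches the request, 1 otherwise. */
static int
flavor_mismatch(const enum flavor *allowed, int n, int reqflags)
{
	int i;

	if (n == 0)
		return (0);	/* empty list: treat as allowed */
	for (i = 0; i < n; i++) {
		if (allowed[i] == X_KRB5P && (reqflags & F_GSSPRIVACY))
			return (0);
		if (allowed[i] == X_KRB5I && (reqflags & F_GSSINTEGRITY))
			return (0);
		if (allowed[i] == X_KRB5 && (reqflags & F_GSS))
			return (0);
		if (allowed[i] == X_AUTH_SYS && (reqflags & F_GSS) == 0)
			return (0);
	}
	return (1);
}

int
main(void)
{
	enum flavor only_krb5[] = { X_KRB5 };

	printf("AUTH_SYS against a krb5-only export rejected: %d\n",
	    flavor_mismatch(only_krb5, 1, 0));
	return (0);
}
#endif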
/*
* Calculate a hash value for the fid in a file handle.
*/
uint32_t
nfsrv_hashfh(fhandle_t *fhp)
{
uint32_t hashval;
hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
return (hashval);
}
/*
* Calculate a hash value for the sessionid.
*/
uint32_t
nfsrv_hashsessionid(uint8_t *sessionid)
{
uint32_t hashval;
hashval = hash32_buf(sessionid, NFSX_V4SESSIONID, 0);
return (hashval);
}
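/*
 * Illustrative user-space sketch (not part of the server sources): the two
 * helpers above hash a fixed-size buffer with the kernel's hash32_buf() and
 * callers then reduce the value to a table index. The rolling hash below is
 * a djb2-style function that is similar in spirit only; its constants and
 * names are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
buf_hash(const void *buf, size_t len, uint32_t seed)
{
	const unsigned char *p = buf;
	uint32_t h = (seed != 0) ? seed : 5381;

	while (len-- > 0)
		h = (h << 5) + h + *p++;
	return (h);
}

int
main(void)
{
	char fid[16] = "example-file-id";
	uint32_t h = buf_hash(fid, sizeof(fid), 0);

	/* Reduce to a bucket index, as the hash table users do. */
	printf("bucket %u of 512\n", h % 512);
	return (0);
}
#endif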
/*
* Signal the userland master nfsd to backup the stable restart file.
*/
void
nfsrv_backupstable(void)
{
struct proc *procp;
if (nfsd_master_proc != NULL) {
procp = pfind(nfsd_master_pid);
/* Try to make sure it is the correct process. */
if (procp == nfsd_master_proc &&
procp->p_stats->p_start.tv_sec ==
nfsd_master_start.tv_sec &&
procp->p_stats->p_start.tv_usec ==
nfsd_master_start.tv_usec &&
strcmp(procp->p_comm, nfsd_master_comm) == 0)
kern_psignal(procp, SIGUSR2);
else
nfsd_master_proc = NULL;
if (procp != NULL)
PROC_UNLOCK(procp);
}
}
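/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsrv_backupstable() above guards against pid reuse by only signalling
 * the recorded master nfsd with SIGUSR2 when the pid, process start time
 * and command name all still match. The struct and function below are
 * invented and only restate that identity check.
 */
#if 0	/* example only; never compiled with the kernel */
#include <sys/types.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

struct proc_ident {
	pid_t pid;
	struct timespec start;
	char comm[32];
};

static bool
same_process(const struct proc_ident *want, const struct proc_ident *found)
{

	return (want->pid == found->pid &&
	    want->start.tv_sec == found->start.tv_sec &&
	    want->start.tv_nsec == found->start.tv_nsec &&
	    strcmp(want->comm, found->comm) == 0);
}

int
main(void)
{
	struct proc_ident want = { .pid = 100, .comm = "nfsd" };
	struct proc_ident stale = want;

	stale.start.tv_sec = 42;	/* pid reused by a later process */
	printf("still the registered master: %d\n",
	    same_process(&want, &stale));
	return (0);
}
#endif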
extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
/*
* Called once to initialize data structures...
*/
static int
nfsd_modevent(module_t mod, int type, void *data)
{
int error = 0, i;
static int loaded = 0;
switch (type) {
case MOD_LOAD:
if (loaded)
goto out;
newnfs_portinit();
for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL,
MTX_DEF);
mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL,
MTX_DEF);
}
mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF);
mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF);
mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF);
lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
nfsrvd_initcache();
nfsd_init();
NFSD_LOCK();
nfsrvd_init(0);
NFSD_UNLOCK();
nfsd_mntinit();
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
#endif
nfsd_call_servertimer = nfsrv_servertimer;
nfsd_call_nfsd = nfssvc_nfsd;
loaded = 1;
break;
case MOD_UNLOAD:
if (newnfs_numnfsd != 0) {
error = EBUSY;
break;
}
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = NULL;
vn_deleg_ops.vndeleg_disable = NULL;
#endif
nfsd_call_servertimer = NULL;
nfsd_call_nfsd = NULL;
/* Clean out all NFSv4 state. */
nfsrv_throwawayallstate(curthread);
/* Clean the NFS server reply cache */
nfsrvd_cleancache();
/* Free up the krpc server pool. */
if (nfsrvd_pool != NULL)
svcpool_destroy(nfsrvd_pool);
/* and get rid of the locks */
for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
mtx_destroy(&nfsrchash_table[i].mtx);
mtx_destroy(&nfsrcahash_table[i].mtx);
}
mtx_destroy(&nfsrc_udpmtx);
mtx_destroy(&nfs_v4root_mutex);
mtx_destroy(&nfsv4root_mnt.mnt_mtx);
for (i = 0; i < nfsrv_sessionhashsize; i++)
mtx_destroy(&nfssessionhash[i].mtx);
lockdestroy(&nfsv4root_mnt.mnt_explock);
free(nfsclienthash, M_NFSDCLIENT);
free(nfslockhash, M_NFSDLOCKFILE);
free(nfssessionhash, M_NFSDSESSION);
loaded = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
out:
NFSEXITCODE(error);
return (error);
}
static moduledata_t nfsd_mod = {
"nfsd",
nfsd_modevent,
NULL,
};
DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfsd, 1);
MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
Index: head/sys/fs/nfsserver/nfs_nfsdstate.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdstate.c (revision 327172)
+++ head/sys/fs/nfsserver/nfs_nfsdstate.c (revision 327173)
@@ -1,6141 +1,6140 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2009 Rick Macklem, University of Guelph
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>
struct nfsrv_stablefirst nfsrv_stablefirst;
int nfsrv_issuedelegs = 0;
int nfsrv_dolocallocks = 0;
struct nfsv4lock nfsv4rootfs_lock;
extern int newnfs_numnfsd;
extern struct nfsstatsv1 nfsstatsv1;
extern int nfsrv_lease;
extern struct timeval nfsboottime;
extern u_int32_t newnfs_true, newnfs_false;
NFSV4ROOTLOCKMUTEX;
NFSSTATESPINLOCK;
SYSCTL_DECL(_vfs_nfsd);
int nfsrv_statehashsize = NFSSTATEHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, statehashsize, CTLFLAG_RDTUN,
&nfsrv_statehashsize, 0,
"Size of state hash table set via loader.conf");
int nfsrv_clienthashsize = NFSCLIENTHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, clienthashsize, CTLFLAG_RDTUN,
&nfsrv_clienthashsize, 0,
"Size of client hash table set via loader.conf");
int nfsrv_lockhashsize = NFSLOCKHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, fhhashsize, CTLFLAG_RDTUN,
&nfsrv_lockhashsize, 0,
"Size of file handle hash table set via loader.conf");
int nfsrv_sessionhashsize = NFSSESSIONHASHSIZE;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, sessionhashsize, CTLFLAG_RDTUN,
&nfsrv_sessionhashsize, 0,
"Size of session hash table set via loader.conf");
static int nfsrv_v4statelimit = NFSRV_V4STATELIMIT;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, v4statelimit, CTLFLAG_RWTUN,
&nfsrv_v4statelimit, 0,
"High water limit for NFSv4 opens+locks+delegations");
static int nfsrv_writedelegifpos = 0;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, writedelegifpos, CTLFLAG_RW,
&nfsrv_writedelegifpos, 0,
"Issue a write delegation for read opens if possible");
static int nfsrv_allowreadforwriteopen = 1;
SYSCTL_INT(_vfs_nfsd, OID_AUTO, allowreadforwriteopen, CTLFLAG_RW,
&nfsrv_allowreadforwriteopen, 0,
"Allow Reads to be done with Write Access StateIDs");
/*
* Hash lists for nfs V4.
*/
struct nfsclienthashhead *nfsclienthash;
struct nfslockhashhead *nfslockhash;
struct nfssessionhash *nfssessionhash;
#endif /* !APPLEKEXT */
static u_int32_t nfsrv_openpluslock = 0, nfsrv_delegatecnt = 0;
static time_t nfsrvboottime;
static int nfsrv_returnoldstateid = 0, nfsrv_clients = 0;
static int nfsrv_clienthighwater = NFSRV_CLIENTHIGHWATER;
static int nfsrv_nogsscallback = 0;
static volatile int nfsrv_writedelegcnt = 0;
/* local functions */
static void nfsrv_dumpaclient(struct nfsclient *clp,
struct nfsd_dumpclients *dumpp);
static void nfsrv_freeopenowner(struct nfsstate *stp, int cansleep,
NFSPROC_T *p);
static int nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p);
static void nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p);
static void nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp,
int cansleep, NFSPROC_T *p);
static void nfsrv_freenfslock(struct nfslock *lop);
static void nfsrv_freenfslockfile(struct nfslockfile *lfp);
static void nfsrv_freedeleg(struct nfsstate *);
static int nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp,
u_int32_t flags, struct nfsstate **stpp);
static void nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
struct nfsstate **stpp);
static int nfsrv_getlockfh(vnode_t vp, u_short flags,
struct nfslockfile *new_lfp, fhandle_t *nfhp, NFSPROC_T *p);
static int nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit);
static void nfsrv_insertlock(struct nfslock *new_lop,
struct nfslock *insert_lop, struct nfsstate *stp, struct nfslockfile *lfp);
static void nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
struct nfslock **other_lopp, struct nfslockfile *lfp);
static int nfsrv_getipnumber(u_char *cp);
static int nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
nfsv4stateid_t *stateidp, int specialid);
static int nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
u_int32_t flags);
static int nfsrv_docallback(struct nfsclient *clp, int procnum,
nfsv4stateid_t *stateidp, int trunc, fhandle_t *fhp,
struct nfsvattr *nap, nfsattrbit_t *attrbitp, NFSPROC_T *p);
static int nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
uint32_t callback, int op, const char *optag, struct nfsdsession **sepp);
static u_int32_t nfsrv_nextclientindex(void);
static u_int32_t nfsrv_nextstateindex(struct nfsclient *clp);
static void nfsrv_markstable(struct nfsclient *clp);
static int nfsrv_checkstable(struct nfsclient *clp);
static int nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, struct
vnode *vp, NFSPROC_T *p);
static int nfsrv_delegconflict(struct nfsstate *stp, int *haslockp,
NFSPROC_T *p, vnode_t vp);
static int nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
struct nfsclient *clp, int *haslockp, NFSPROC_T *p);
static int nfsrv_notsamecredname(struct nfsrv_descript *nd,
struct nfsclient *clp);
static time_t nfsrv_leaseexpiry(void);
static void nfsrv_delaydelegtimeout(struct nfsstate *stp);
static int nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
struct nfsstate *stp, struct nfsrvcache *op);
static int nfsrv_nootherstate(struct nfsstate *stp);
static int nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p);
static void nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp,
uint64_t init_first, uint64_t init_end, NFSPROC_T *p);
static int nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags,
int oldflags, uint64_t first, uint64_t end, struct nfslockconflict *cfp,
NFSPROC_T *p);
static void nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp,
NFSPROC_T *p);
static void nfsrv_locallock_commit(struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end);
static void nfsrv_locklf(struct nfslockfile *lfp);
static void nfsrv_unlocklf(struct nfslockfile *lfp);
static struct nfsdsession *nfsrv_findsession(uint8_t *sessionid);
static int nfsrv_freesession(struct nfsdsession *sep, uint8_t *sessionid);
static int nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
int dont_replycache, struct nfsdsession **sepp);
static int nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp);
/*
* Scan the client list for a match and either return the current one,
* create a new entry or return an error.
* If returning a non-error, the clp structure must either be linked into
* the client list or free'd.
*/
APPLESTATIC int
nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp,
nfsquad_t *clientidp, nfsquad_t *confirmp, NFSPROC_T *p)
{
struct nfsclient *clp = NULL, *new_clp = *new_clpp;
int i, error = 0;
struct nfsstate *stp, *tstp;
struct sockaddr_in *sad, *rad;
int zapit = 0, gotit, hasstate = 0, igotlock;
static u_int64_t confirm_index = 0;
/*
* Check for state resource limit exceeded.
*/
if (nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
if (nfsrv_issuedelegs == 0 ||
((nd->nd_flag & ND_GSS) != 0 && nfsrv_nogsscallback != 0))
/*
* Don't do callbacks when delegations are disabled, or
* for AUTH_GSS when callbacks are disabled via nfsrv_nogsscallback.
* If establishing a callback connection is attempted
* when a firewall is blocking the callback path, the
* server may wait too long for the connect attempt to
* succeed during the Open. Some clients, such as Linux,
* may time out and give up on the Open before the server
* replies. Also, since AUTH_GSS callbacks are not
* yet interoperability tested, they might cause the
* server to crap out, if they get past the Init call to
* the client.
*/
new_clp->lc_program = 0;
/* Lock out other nfsd threads */
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKV4ROOTMUTEX();
/*
* Search for a match in the client list.
*/
gotit = i = 0;
while (i < nfsrv_clienthashsize && !gotit) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
if (new_clp->lc_idlen == clp->lc_idlen &&
!NFSBCMP(new_clp->lc_id, clp->lc_id, clp->lc_idlen)) {
gotit = 1;
break;
}
}
if (gotit == 0)
i++;
}
if (!gotit ||
(clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_ADMINREVOKED))) {
if ((nd->nd_flag & ND_NFSV41) != 0 && confirmp->lval[1] != 0) {
/*
* For NFSv4.1, if confirmp->lval[1] is non-zero, the
* client is trying to update a confirmed clientid.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
confirmp->lval[1] = 0;
error = NFSERR_NOENT;
goto out;
}
/*
* Get rid of the old one.
*/
if (i != nfsrv_clienthashsize) {
LIST_REMOVE(clp, lc_hash);
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
zapit = 1;
}
/*
* Add it after assigning a client id to it.
*/
new_clp->lc_flags |= LCL_NEEDSCONFIRM;
if ((nd->nd_flag & ND_NFSV41) != 0)
new_clp->lc_confirm.lval[0] = confirmp->lval[0] =
++confirm_index;
else
confirmp->qval = new_clp->lc_confirm.qval =
++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
(u_int32_t)nfsrvboottime;
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
nfsrv_nextclientindex();
new_clp->lc_stateindex = 0;
new_clp->lc_statemaxindex = 0;
new_clp->lc_cbref = 0;
new_clp->lc_expiry = nfsrv_leaseexpiry();
LIST_INIT(&new_clp->lc_open);
LIST_INIT(&new_clp->lc_deleg);
LIST_INIT(&new_clp->lc_olddeleg);
LIST_INIT(&new_clp->lc_session);
for (i = 0; i < nfsrv_statehashsize; i++)
LIST_INIT(&new_clp->lc_stateid[i]);
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
if (zapit)
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
goto out;
}
/*
* Now, handle the cases where the id is already issued.
*/
if (nfsrv_notsamecredname(nd, clp)) {
/*
* Check to see if there is expired state that should go away.
*/
if (clp->lc_expiry < NFSD_MONOSEC &&
(!LIST_EMPTY(&clp->lc_open) || !LIST_EMPTY(&clp->lc_deleg))) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
}
/*
* If there is outstanding state, then reply NFSERR_CLIDINUSE per
* RFC3530 Sec. 8.1.2 last para.
*/
if (!LIST_EMPTY(&clp->lc_deleg)) {
hasstate = 1;
} else if (LIST_EMPTY(&clp->lc_open)) {
hasstate = 0;
} else {
hasstate = 0;
/* Look for an Open on the OpenOwner */
LIST_FOREACH(stp, &clp->lc_open, ls_list) {
if (!LIST_EMPTY(&stp->ls_open)) {
hasstate = 1;
break;
}
}
}
if (hasstate) {
/*
* If the uid doesn't match, return NFSERR_CLIDINUSE after
* filling out the correct ipaddr and portnum.
*/
sad = NFSSOCKADDR(new_clp->lc_req.nr_nam, struct sockaddr_in *);
rad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr_in *);
sad->sin_addr.s_addr = rad->sin_addr.s_addr;
sad->sin_port = rad->sin_port;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIDINUSE;
goto out;
}
}
if (NFSBCMP(new_clp->lc_verf, clp->lc_verf, NFSX_VERF)) {
/*
* If the verifier has changed, the client has rebooted
* and a new client id is issued. The old state info
* can be thrown away once the SETCLIENTID_CONFIRM occurs.
*/
LIST_REMOVE(clp, lc_hash);
new_clp->lc_flags |= LCL_NEEDSCONFIRM;
if ((nd->nd_flag & ND_NFSV41) != 0)
new_clp->lc_confirm.lval[0] = confirmp->lval[0] =
++confirm_index;
else
confirmp->qval = new_clp->lc_confirm.qval =
++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
nfsrvboottime;
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
nfsrv_nextclientindex();
new_clp->lc_stateindex = 0;
new_clp->lc_statemaxindex = 0;
new_clp->lc_cbref = 0;
new_clp->lc_expiry = nfsrv_leaseexpiry();
/*
* Save the state until confirmed.
*/
LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg,
ls_list);
LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
tstp->ls_clp = new_clp;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_NEWHEAD(&new_clp->lc_stateid[i],
&clp->lc_stateid[i], ls_hash);
LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
tstp->ls_clp = new_clp;
}
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
/*
* Must wait until any outstanding callback on the old clp
* completes.
*/
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
"nfsd clp", 10 * hz);
}
NFSUNLOCKSTATE();
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
goto out;
}
/* For NFSv4.1, mark that we found a confirmed clientid. */
if ((nd->nd_flag & ND_NFSV41) != 0) {
clientidp->lval[0] = clp->lc_clientid.lval[0];
clientidp->lval[1] = clp->lc_clientid.lval[1];
confirmp->lval[0] = 0; /* Ignored by client */
confirmp->lval[1] = 1;
} else {
/*
* id and verifier match, so update the net address info
* and get rid of any existing callback authentication
* handle, so a new one will be acquired.
*/
LIST_REMOVE(clp, lc_hash);
new_clp->lc_flags |= (LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
new_clp->lc_expiry = nfsrv_leaseexpiry();
confirmp->qval = new_clp->lc_confirm.qval = ++confirm_index;
clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
clp->lc_clientid.lval[0];
clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
clp->lc_clientid.lval[1];
new_clp->lc_delegtime = clp->lc_delegtime;
new_clp->lc_stateindex = clp->lc_stateindex;
new_clp->lc_statemaxindex = clp->lc_statemaxindex;
new_clp->lc_cbref = 0;
LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
tstp->ls_clp = new_clp;
LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg, ls_list);
LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
tstp->ls_clp = new_clp;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_NEWHEAD(&new_clp->lc_stateid[i],
&clp->lc_stateid[i], ls_hash);
LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
tstp->ls_clp = new_clp;
}
LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
lc_hash);
nfsstatsv1.srvclients++;
nfsrv_openpluslock++;
nfsrv_clients++;
}
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
if ((nd->nd_flag & ND_NFSV41) == 0) {
/*
* Must wait until any outstanding callback on the old clp
* completes.
*/
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
"nfsdclp", 10 * hz);
}
NFSUNLOCKSTATE();
nfsrv_zapclient(clp, p);
*new_clpp = NULL;
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
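/*
 * Illustrative user-space sketch (not part of the server sources):
 * nfsrv_setclient() above builds a clientid from the server boot time in
 * one 32-bit word and a per-boot counter in the other, which is what lets
 * nfsrv_getclient() detect a stale clientid after a reboot. The packing
 * into a single 64-bit value below is illustrative; the server keeps the
 * two words in a union, and make_clientid() is a made-up name.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t
make_clientid(uint32_t boottime, uint32_t *nextindex)
{

	return (((uint64_t)boottime << 32) | (*nextindex)++);
}

int
main(void)
{
	uint32_t boottime = (uint32_t)time(NULL);
	uint32_t idx = 1;
	uint64_t cid = make_clientid(boottime, &idx);

	/* The stale-clientid test: does the embedded boot time still match? */
	printf("stale: %d\n", (uint32_t)(cid >> 32) != boottime);
	return (0);
}
#endif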
/*
* Check to see if the client id exists and optionally confirm it.
*/
APPLESTATIC int
nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp,
struct nfsdsession *nsep, nfsquad_t confirm, uint32_t cbprogram,
struct nfsrv_descript *nd, NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsstate *stp;
int i;
struct nfsclienthashhead *hp;
int error = 0, igotlock, doneok;
struct nfssessionhash *shp;
struct nfsdsession *sep;
uint64_t sessid[2];
static uint64_t next_sess = 0;
if (clpp)
*clpp = NULL;
if ((nd == NULL || (nd->nd_flag & ND_NFSV41) == 0 ||
opflags != CLOPS_RENEW) && nfsrvboottime != clientid.lval[0]) {
error = NFSERR_STALECLIENTID;
goto out;
}
/*
* If called with opflags == CLOPS_RENEW, the State Lock is
* already held. Otherwise, we need to get either that or,
* for the case of Confirm, lock out the nfsd threads.
*/
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
/*
* Create a new sessionid here, since we need to do it where
* there is a mutex held to serialize update of next_sess.
*/
if ((nd->nd_flag & ND_NFSV41) != 0) {
sessid[0] = ++next_sess;
sessid[1] = clientid.qval;
}
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSLOCKSTATE();
}
/* For NFSv4.1, the clp is acquired from the associated session. */
if (nd != NULL && (nd->nd_flag & ND_NFSV41) != 0 &&
opflags == CLOPS_RENEW) {
clp = NULL;
if ((nd->nd_flag & ND_HASSEQUENCE) != 0) {
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep != NULL)
clp = sep->sess_clp;
NFSUNLOCKSESSION(shp);
}
} else {
hp = NFSCLIENTHASH(clientid);
LIST_FOREACH(clp, hp, lc_hash) {
if (clp->lc_clientid.lval[1] == clientid.lval[1])
break;
}
}
if (clp == NULL) {
if (opflags & CLOPS_CONFIRM)
error = NFSERR_STALECLIENTID;
else
error = NFSERR_EXPIRED;
} else if (clp->lc_flags & LCL_ADMINREVOKED) {
/*
* If marked admin revoked, just return the error.
*/
error = NFSERR_ADMINREVOKED;
}
if (error) {
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSUNLOCKSTATE();
}
goto out;
}
/*
* Perform any operations specified by the opflags.
*/
if (opflags & CLOPS_CONFIRM) {
if (((nd->nd_flag & ND_NFSV41) != 0 &&
clp->lc_confirm.lval[0] != confirm.lval[0]) ||
((nd->nd_flag & ND_NFSV41) == 0 &&
clp->lc_confirm.qval != confirm.qval))
error = NFSERR_STALECLIENTID;
else if (nfsrv_notsamecredname(nd, clp))
error = NFSERR_CLIDINUSE;
if (!error) {
if ((clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_DONTCLEAN)) ==
LCL_NEEDSCONFIRM) {
/*
* Hang onto the delegations (as old delegations)
* for an Open with CLAIM_DELEGATE_PREV unless in
* grace, but get rid of the rest of the state.
*/
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_olddeleg);
if (nfsrv_checkgrace(nd, clp, 0)) {
/* In grace, so just delete delegations */
nfsrv_freedeleglist(&clp->lc_deleg);
} else {
LIST_FOREACH(stp, &clp->lc_deleg, ls_list)
stp->ls_flags |= NFSLCK_OLDDELEG;
clp->lc_delegtime = NFSD_MONOSEC +
nfsrv_lease + NFSRV_LEASEDELTA;
LIST_NEWHEAD(&clp->lc_olddeleg, &clp->lc_deleg,
ls_list);
}
if ((nd->nd_flag & ND_NFSV41) != 0)
clp->lc_program = cbprogram;
}
clp->lc_flags &= ~(LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
if (clp->lc_program)
clp->lc_flags |= LCL_NEEDSCBNULL;
/* For NFSv4.1, link the session onto the client. */
if (nsep != NULL) {
/* Hold a reference on the xprt for a backchannel. */
if ((nsep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN)
!= 0 && clp->lc_req.nr_client == NULL) {
clp->lc_req.nr_client = (struct __rpc_client *)
clnt_bck_create(nd->nd_xprt->xp_socket,
cbprogram, NFSV4_CBVERS);
if (clp->lc_req.nr_client != NULL) {
SVC_ACQUIRE(nd->nd_xprt);
nd->nd_xprt->xp_p2 =
clp->lc_req.nr_client->cl_private;
/* Disable idle timeout. */
nd->nd_xprt->xp_idletimeout = 0;
nsep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
} else
nsep->sess_crflags &= ~NFSV4CRSESS_CONNBACKCHAN;
}
NFSBCOPY(sessid, nsep->sess_sessionid,
NFSX_V4SESSIONID);
NFSBCOPY(sessid, nsep->sess_cbsess.nfsess_sessionid,
NFSX_V4SESSIONID);
shp = NFSSESSIONHASH(nsep->sess_sessionid);
NFSLOCKSTATE();
NFSLOCKSESSION(shp);
LIST_INSERT_HEAD(&shp->list, nsep, sess_hash);
LIST_INSERT_HEAD(&clp->lc_session, nsep, sess_list);
nsep->sess_clp = clp;
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
}
}
} else if (clp->lc_flags & LCL_NEEDSCONFIRM) {
error = NFSERR_EXPIRED;
}
/*
* If called by the Renew Op, we must check the principal.
*/
if (!error && (opflags & CLOPS_RENEWOP)) {
if (nfsrv_notsamecredname(nd, clp)) {
doneok = 0;
for (i = 0; i < nfsrv_statehashsize && doneok == 0; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if ((stp->ls_flags & NFSLCK_OPEN) &&
stp->ls_uid == nd->nd_cred->cr_uid) {
doneok = 1;
break;
}
}
}
if (!doneok)
error = NFSERR_ACCES;
}
if (!error && (clp->lc_flags & LCL_CBDOWN))
error = NFSERR_CBPATHDOWN;
}
if ((!error || error == NFSERR_CBPATHDOWN) &&
(opflags & CLOPS_RENEW)) {
clp->lc_expiry = nfsrv_leaseexpiry();
}
if (opflags & CLOPS_CONFIRM) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
} else if (opflags != CLOPS_RENEW) {
NFSUNLOCKSTATE();
}
if (clpp)
*clpp = clp;
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Perform the NFSv4.1 destroy clientid.
*/
int
nfsrv_destroyclient(nfsquad_t clientid, NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsclienthashhead *hp;
int error = 0, i, igotlock;
if (nfsrvboottime != clientid.lval[0]) {
error = NFSERR_STALECLIENTID;
goto out;
}
/* Lock out other nfsd threads */
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (igotlock == 0);
NFSUNLOCKV4ROOTMUTEX();
hp = NFSCLIENTHASH(clientid);
LIST_FOREACH(clp, hp, lc_hash) {
if (clp->lc_clientid.lval[1] == clientid.lval[1])
break;
}
if (clp == NULL) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
/* Just return ok, since it is gone. */
goto out;
}
/* Scan for state on the clientid. */
for (i = 0; i < nfsrv_statehashsize; i++)
if (!LIST_EMPTY(&clp->lc_stateid[i])) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIENTIDBUSY;
goto out;
}
if (!LIST_EMPTY(&clp->lc_session) || !LIST_EMPTY(&clp->lc_deleg)) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_CLIENTIDBUSY;
goto out;
}
/* Destroy the clientid and return ok. */
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
nfsrv_zapclient(clp, p);
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Called from the new nfssvc syscall to admin revoke a clientid.
* Returns 0 for success, error otherwise.
*/
APPLESTATIC int
nfsrv_adminrevoke(struct nfsd_clid *revokep, NFSPROC_T *p)
{
struct nfsclient *clp = NULL;
int i, error = 0;
int gotit, igotlock;
/*
* First, lock out the nfsd so that state won't change while the
* revocation record is being written to the stable storage restart
* file.
*/
NFSLOCKV4ROOTMUTEX();
do {
igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!igotlock);
NFSUNLOCKV4ROOTMUTEX();
/*
* Search for a match in the client list.
*/
gotit = i = 0;
while (i < nfsrv_clienthashsize && !gotit) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
if (revokep->nclid_idlen == clp->lc_idlen &&
!NFSBCMP(revokep->nclid_id, clp->lc_id, clp->lc_idlen)) {
gotit = 1;
break;
}
}
i++;
}
if (!gotit) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 0);
NFSUNLOCKV4ROOTMUTEX();
error = EPERM;
goto out;
}
/*
* Now, write out the revocation record
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
/*
* and clear out the state, marking the clientid revoked.
*/
clp->lc_flags &= ~LCL_CALLBACKSON;
clp->lc_flags |= LCL_ADMINREVOKED;
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 0);
NFSUNLOCKV4ROOTMUTEX();
out:
NFSEXITCODE(error);
return (error);
}
/*
* Dump out stats for all clients. Called from nfssvc(2), which is the
* interface used by the userland nfs utilities.
*/
APPLESTATIC void
nfsrv_dumpclients(struct nfsd_dumpclients *dumpp, int maxcnt)
{
struct nfsclient *clp;
int i = 0, cnt = 0;
/*
* First, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock cannot be acquired while dumping the clients.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
NFSLOCKSTATE();
/*
* Rattle through the client lists until done.
*/
while (i < nfsrv_clienthashsize && cnt < maxcnt) {
clp = LIST_FIRST(&nfsclienthash[i]);
while (clp != LIST_END(&nfsclienthash[i]) && cnt < maxcnt) {
nfsrv_dumpaclient(clp, &dumpp[cnt]);
cnt++;
clp = LIST_NEXT(clp, lc_hash);
}
i++;
}
if (cnt < maxcnt)
dumpp[cnt].ndcl_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
/*
* Dump stats for a client. Must be called with the NFSSTATELOCK and spl'd.
*/
static void
nfsrv_dumpaclient(struct nfsclient *clp, struct nfsd_dumpclients *dumpp)
{
struct nfsstate *stp, *openstp, *lckownstp;
struct nfslock *lop;
struct sockaddr *sad;
struct sockaddr_in *rad;
struct sockaddr_in6 *rad6;
dumpp->ndcl_nopenowners = dumpp->ndcl_nlockowners = 0;
dumpp->ndcl_nopens = dumpp->ndcl_nlocks = 0;
dumpp->ndcl_ndelegs = dumpp->ndcl_nolddelegs = 0;
dumpp->ndcl_flags = clp->lc_flags;
dumpp->ndcl_clid.nclid_idlen = clp->lc_idlen;
NFSBCOPY(clp->lc_id, dumpp->ndcl_clid.nclid_id, clp->lc_idlen);
sad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr *);
dumpp->ndcl_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
dumpp->ndcl_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
dumpp->ndcl_cbaddr.sin6_addr = rad6->sin6_addr;
}
/*
* Now, scan the state lists and total up the opens and locks.
*/
LIST_FOREACH(stp, &clp->lc_open, ls_list) {
dumpp->ndcl_nopenowners++;
LIST_FOREACH(openstp, &stp->ls_open, ls_list) {
dumpp->ndcl_nopens++;
LIST_FOREACH(lckownstp, &openstp->ls_open, ls_list) {
dumpp->ndcl_nlockowners++;
LIST_FOREACH(lop, &lckownstp->ls_lock, lo_lckowner) {
dumpp->ndcl_nlocks++;
}
}
}
}
/*
* and the delegation lists.
*/
LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
dumpp->ndcl_ndelegs++;
}
LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
dumpp->ndcl_nolddelegs++;
}
}
/*
* Dump out lock stats for a file.
*/
APPLESTATIC void
nfsrv_dumplocks(vnode_t vp, struct nfsd_dumplocks *ldumpp, int maxcnt,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslock *lop;
int cnt = 0;
struct nfslockfile *lfp;
struct sockaddr *sad;
struct sockaddr_in *rad;
struct sockaddr_in6 *rad6;
int ret;
fhandle_t nfh;
ret = nfsrv_getlockfh(vp, 0, NULL, &nfh, p);
/*
* First, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock on it cannot be acquired while dumping the locks.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
NFSLOCKSTATE();
if (!ret)
ret = nfsrv_getlockfile(0, NULL, &lfp, &nfh, 0);
if (ret) {
ldumpp[0].ndlck_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
return;
}
/*
* For each open share on file, dump it out.
*/
stp = LIST_FIRST(&lfp->lf_open);
while (stp != LIST_END(&lfp->lf_open) && cnt < maxcnt) {
ldumpp[cnt].ndlck_flags = stp->ls_flags;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen =
stp->ls_openowner->ls_ownerlen;
NFSBCOPY(stp->ls_openowner->ls_owner,
ldumpp[cnt].ndlck_owner.nclid_id,
stp->ls_openowner->ls_ownerlen);
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
stp = LIST_NEXT(stp, ls_file);
cnt++;
}
/*
* and all locks.
*/
lop = LIST_FIRST(&lfp->lf_lock);
while (lop != LIST_END(&lfp->lf_lock) && cnt < maxcnt) {
stp = lop->lo_stp;
ldumpp[cnt].ndlck_flags = lop->lo_flags;
ldumpp[cnt].ndlck_first = lop->lo_first;
ldumpp[cnt].ndlck_end = lop->lo_end;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen = stp->ls_ownerlen;
NFSBCOPY(stp->ls_owner, ldumpp[cnt].ndlck_owner.nclid_id,
stp->ls_ownerlen);
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
lop = LIST_NEXT(lop, lo_lckfile);
cnt++;
}
/*
* and the delegations.
*/
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg) && cnt < maxcnt) {
ldumpp[cnt].ndlck_flags = stp->ls_flags;
ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
ldumpp[cnt].ndlck_owner.nclid_idlen = 0;
ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
stp->ls_clp->lc_idlen);
sad=NFSSOCKADDR(stp->ls_clp->lc_req.nr_nam, struct sockaddr *);
ldumpp[cnt].ndlck_addrfam = sad->sa_family;
if (sad->sa_family == AF_INET) {
rad = (struct sockaddr_in *)sad;
ldumpp[cnt].ndlck_cbaddr.sin_addr = rad->sin_addr;
} else {
rad6 = (struct sockaddr_in6 *)sad;
ldumpp[cnt].ndlck_cbaddr.sin6_addr = rad6->sin6_addr;
}
stp = LIST_NEXT(stp, ls_file);
cnt++;
}
/*
* If list isn't full, mark end of list by setting the client name
* to zero length.
*/
if (cnt < maxcnt)
ldumpp[cnt].ndlck_clid.nclid_idlen = 0;
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
/*
* Server timer routine. It can scan any linked list, so long
* as it holds the spin/mutex lock and there is no exclusive lock on
* nfsv4rootfs_lock.
* (For OpenBSD, a kthread is ok. For FreeBSD, I think it is ok
* to do this from a callout, since the spin locks work. For
* Darwin, I'm not sure what will work correctly yet.)
* Should be called once per second.
*/
APPLESTATIC void
nfsrv_servertimer(void)
{
struct nfsclient *clp, *nclp;
struct nfsstate *stp, *nstp;
int got_ref, i;
/*
* Make sure nfsboottime is set. This is used by V3 as well
* as V4. Note that nfsboottime is not nfsrvboottime, which is
* only used by the V4 server for leases.
*/
if (nfsboottime.tv_sec == 0)
NFSSETBOOTTIME(nfsboottime);
/*
* If server hasn't started yet, just return.
*/
NFSLOCKSTATE();
if (nfsrv_stablefirst.nsf_eograce == 0) {
NFSUNLOCKSTATE();
return;
}
if (!(nfsrv_stablefirst.nsf_flags & NFSNSF_UPDATEDONE)) {
if (!(nfsrv_stablefirst.nsf_flags & NFSNSF_GRACEOVER) &&
NFSD_MONOSEC > nfsrv_stablefirst.nsf_eograce)
nfsrv_stablefirst.nsf_flags |=
(NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
NFSUNLOCKSTATE();
return;
}
/*
* Try and get a reference count on the nfsv4rootfs_lock so that
* no nfsd thread can acquire an exclusive lock on it before this
* call is done. If it is already exclusively locked, just return.
*/
NFSLOCKV4ROOTMUTEX();
got_ref = nfsv4_getref_nonblock(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
if (got_ref == 0) {
NFSUNLOCKSTATE();
return;
}
/*
* For each client...
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
clp = LIST_FIRST(&nfsclienthash[i]);
while (clp != LIST_END(&nfsclienthash[i])) {
nclp = LIST_NEXT(clp, lc_hash);
if (!(clp->lc_flags & LCL_EXPIREIT)) {
if (((clp->lc_expiry + NFSRV_STALELEASE) < NFSD_MONOSEC
&& ((LIST_EMPTY(&clp->lc_deleg)
&& LIST_EMPTY(&clp->lc_open)) ||
nfsrv_clients > nfsrv_clienthighwater)) ||
(clp->lc_expiry + NFSRV_MOULDYLEASE) < NFSD_MONOSEC ||
(clp->lc_expiry < NFSD_MONOSEC &&
(nfsrv_openpluslock * 10 / 9) > nfsrv_v4statelimit)) {
/*
* Lease has expired several nfsrv_lease times ago:
* PLUS
* - no state is associated with it
* OR
* - above high water mark for number of clients
* (nfsrv_clienthighwater should be large enough
* that this only occurs when clients fail to
* use the same nfs_client_id4.id. Maybe somewhat
* higher than the maximum number of clients that
* will mount this server?)
* OR
* Lease has expired a very long time ago
* OR
* Lease has expired PLUS the number of opens + locks
* has exceeded 90% of capacity
*
* --> Mark for expiry. The actual expiry will be done
* by an nfsd sometime soon.
*/
clp->lc_flags |= LCL_EXPIREIT;
nfsrv_stablefirst.nsf_flags |=
(NFSNSF_NEEDLOCK | NFSNSF_EXPIREDCLIENT);
} else {
/*
* If the openowner has no opens, increment its no-open tick
* count; once that count exceeds NFSNOOPEN, mark it to be
* thrown away. Otherwise, if there is an open, reset the count.
* Hopefully, this will avoid excessive re-creation
* of open owners and subsequent open confirms.
*/
stp = LIST_FIRST(&clp->lc_open);
while (stp != LIST_END(&clp->lc_open)) {
nstp = LIST_NEXT(stp, ls_list);
if (LIST_EMPTY(&stp->ls_open)) {
stp->ls_noopens++;
if (stp->ls_noopens > NFSNOOPEN ||
(nfsrv_openpluslock * 2) >
nfsrv_v4statelimit)
nfsrv_stablefirst.nsf_flags |=
NFSNSF_NOOPENS;
} else {
stp->ls_noopens = 0;
}
stp = nstp;
}
}
}
clp = nclp;
}
}
NFSUNLOCKSTATE();
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
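/*
 * Illustrative user-space sketch (not part of the server sources): the
 * expiry policy described in the long comment inside nfsrv_servertimer()
 * above, restated as one predicate: expire a client whose lease is long
 * stale and which either holds no state or pushes the client count over
 * the high-water mark, or whose lease is very old, or whose lease has
 * expired while opens+locks are above 90% of the state limit. All names
 * and values below are invented for the example.
 */
#if 0	/* example only; never compiled with the kernel */
#include <stdbool.h>
#include <stdio.h>

struct lease_view {
	long now, expiry, stale_lease, mouldy_lease;
	bool has_state;			/* any opens or delegations */
	int clients, client_highwater;
	int openpluslock, statelimit;
};

static bool
should_expire(const struct lease_view *v)
{

	return (((v->expiry + v->stale_lease) < v->now &&
	    (!v->has_state || v->clients > v->client_highwater)) ||
	    (v->expiry + v->mouldy_lease) < v->now ||
	    (v->expiry < v->now &&
	    (v->openpluslock * 10 / 9) > v->statelimit));
}

int
main(void)
{
	struct lease_view v = { .now = 1000, .expiry = 100,
	    .stale_lease = 60, .mouldy_lease = 600, .has_state = false,
	    .clients = 10, .client_highwater = 1000, .openpluslock = 5,
	    .statelimit = 1000 };

	printf("expire idle client with a long-stale lease: %d\n",
	    should_expire(&v));
	return (0);
}
#endif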
/*
* The following set of functions free up the various data structures.
*/
/*
* Clear out all open/lock state related to this nfsclient.
* Caller must hold an exclusive lock on nfsv4rootfs_lock, so that
* there are no other active nfsd threads.
*/
APPLESTATIC void
nfsrv_cleanclient(struct nfsclient *clp, NFSPROC_T *p)
{
struct nfsstate *stp, *nstp;
struct nfsdsession *sep, *nsep;
LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp)
nfsrv_freeopenowner(stp, 1, p);
if ((clp->lc_flags & LCL_ADMINREVOKED) == 0)
LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep)
(void)nfsrv_freesession(sep, NULL);
}
/*
* Free a client that has been cleaned. It should also already have been
* removed from the lists.
* (Just to be safe w.r.t. newnfs_disconnect(), call this function when
* softclock interrupts are enabled.)
*/
APPLESTATIC void
nfsrv_zapclient(struct nfsclient *clp, NFSPROC_T *p)
{
#ifdef notyet
if ((clp->lc_flags & (LCL_GSS | LCL_CALLBACKSON)) ==
(LCL_GSS | LCL_CALLBACKSON) &&
(clp->lc_hand.nfsh_flag & NFSG_COMPLETE) &&
clp->lc_handlelen > 0) {
clp->lc_hand.nfsh_flag &= ~NFSG_COMPLETE;
clp->lc_hand.nfsh_flag |= NFSG_DESTROYED;
(void) nfsrv_docallback(clp, NFSV4PROC_CBNULL,
NULL, 0, NULL, NULL, NULL, p);
}
#endif
newnfs_disconnect(&clp->lc_req);
NFSSOCKADDRFREE(clp->lc_req.nr_nam);
NFSFREEMUTEX(&clp->lc_req.nr_mtx);
free(clp->lc_stateid, M_NFSDCLIENT);
free(clp, M_NFSDCLIENT);
NFSLOCKSTATE();
nfsstatsv1.srvclients--;
nfsrv_openpluslock--;
nfsrv_clients--;
NFSUNLOCKSTATE();
}
/*
* Free a list of delegation state structures.
* (This function will also free all nfslockfile structures that no
* longer have associated state.)
*/
APPLESTATIC void
nfsrv_freedeleglist(struct nfsstatehead *sthp)
{
struct nfsstate *stp, *nstp;
LIST_FOREACH_SAFE(stp, sthp, ls_list, nstp) {
nfsrv_freedeleg(stp);
}
LIST_INIT(sthp);
}
/*
* Free up a delegation.
*/
static void
nfsrv_freedeleg(struct nfsstate *stp)
{
struct nfslockfile *lfp;
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_file);
if ((stp->ls_flags & NFSLCK_DELEGWRITE) != 0)
nfsrv_writedelegcnt--;
lfp = stp->ls_lfp;
if (LIST_EMPTY(&lfp->lf_open) &&
LIST_EMPTY(&lfp->lf_lock) && LIST_EMPTY(&lfp->lf_deleg) &&
LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
lfp->lf_usecount == 0 &&
nfsv4_testlock(&lfp->lf_locallock_lck) == 0)
nfsrv_freenfslockfile(lfp);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvdelegates--;
nfsrv_openpluslock--;
nfsrv_delegatecnt--;
}
/*
* This function frees an open owner and all associated opens.
*/
static void
nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, NFSPROC_T *p)
{
struct nfsstate *nstp, *tstp;
LIST_REMOVE(stp, ls_list);
/*
* Now, free all associated opens.
*/
nstp = LIST_FIRST(&stp->ls_open);
while (nstp != LIST_END(&stp->ls_open)) {
tstp = nstp;
nstp = LIST_NEXT(nstp, ls_list);
(void) nfsrv_freeopen(tstp, NULL, cansleep, p);
}
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvopenowners--;
nfsrv_openpluslock--;
}
/*
* This function frees an open (nfsstate open structure) with all associated
* lock_owners and locks. It also frees the nfslockfile structure iff there
* are no other opens on the file.
* Returns 1 if it free'd the nfslockfile, 0 otherwise.
*/
static int
nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p)
{
struct nfsstate *nstp, *tstp;
struct nfslockfile *lfp;
int ret;
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_file);
lfp = stp->ls_lfp;
/*
* Now, free all lockowners associated with this open.
*/
LIST_FOREACH_SAFE(tstp, &stp->ls_open, ls_list, nstp)
nfsrv_freelockowner(tstp, vp, cansleep, p);
/*
* The nfslockfile is freed here if there are no locks
* associated with the open.
* If there are locks associated with the open, the
* nfslockfile structure can be freed via nfsrv_freelockowner().
* Acquire the state mutex to avoid races with calls to
* nfsrv_getlockfile().
*/
if (cansleep != 0)
NFSLOCKSTATE();
if (lfp != NULL && LIST_EMPTY(&lfp->lf_open) &&
LIST_EMPTY(&lfp->lf_deleg) && LIST_EMPTY(&lfp->lf_lock) &&
LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
lfp->lf_usecount == 0 &&
(cansleep != 0 || nfsv4_testlock(&lfp->lf_locallock_lck) == 0)) {
nfsrv_freenfslockfile(lfp);
ret = 1;
} else
ret = 0;
if (cansleep != 0)
NFSUNLOCKSTATE();
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvopens--;
nfsrv_openpluslock--;
return (ret);
}
/*
* Frees a lockowner and all associated locks.
*/
static void
nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p)
{
LIST_REMOVE(stp, ls_hash);
LIST_REMOVE(stp, ls_list);
nfsrv_freeallnfslocks(stp, vp, cansleep, p);
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
FREE((caddr_t)stp, M_NFSDSTATE);
nfsstatsv1.srvlockowners--;
nfsrv_openpluslock--;
}
/*
* Free all the nfs locks on a lockowner.
*/
static void
nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp, int cansleep,
NFSPROC_T *p)
{
struct nfslock *lop, *nlop;
struct nfsrollback *rlp, *nrlp;
struct nfslockfile *lfp = NULL;
int gottvp = 0;
vnode_t tvp = NULL;
uint64_t first, end;
if (vp != NULL)
ASSERT_VOP_UNLOCKED(vp, "nfsrv_freeallnfslocks: vnode locked");
lop = LIST_FIRST(&stp->ls_lock);
while (lop != LIST_END(&stp->ls_lock)) {
nlop = LIST_NEXT(lop, lo_lckowner);
/*
* Since all locks should be for the same file, lfp should
* not change.
*/
if (lfp == NULL)
lfp = lop->lo_lfp;
else if (lfp != lop->lo_lfp)
panic("allnfslocks");
/*
* If vp is NULL and cansleep != 0, a vnode must be acquired
* from the file handle. This only occurs when called from
* nfsrv_cleanclient().
*/
if (gottvp == 0) {
if (nfsrv_dolocallocks == 0)
tvp = NULL;
else if (vp == NULL && cansleep != 0) {
tvp = nfsvno_getvp(&lfp->lf_fh);
NFSVOPUNLOCK(tvp, 0);
} else
tvp = vp;
gottvp = 1;
}
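/*
* With a vnode available, also release the local advisory lock
* for the byte range and discard any rollback records for it.
*/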
if (tvp != NULL) {
if (cansleep == 0)
panic("allnfs2");
first = lop->lo_first;
end = lop->lo_end;
nfsrv_freenfslock(lop);
nfsrv_localunlock(tvp, lfp, first, end, p);
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list,
nrlp)
free(rlp, M_NFSDROLLBACK);
LIST_INIT(&lfp->lf_rollback);
} else
nfsrv_freenfslock(lop);
lop = nlop;
}
if (vp == NULL && tvp != NULL)
vrele(tvp);
}
/*
* Free an nfslock structure.
*/
static void
nfsrv_freenfslock(struct nfslock *lop)
{
if (lop->lo_lckfile.le_prev != NULL) {
LIST_REMOVE(lop, lo_lckfile);
nfsstatsv1.srvlocks--;
nfsrv_openpluslock--;
}
LIST_REMOVE(lop, lo_lckowner);
FREE((caddr_t)lop, M_NFSDLOCK);
}
/*
* This function frees an nfslockfile structure.
*/
static void
nfsrv_freenfslockfile(struct nfslockfile *lfp)
{
LIST_REMOVE(lfp, lf_hash);
FREE((caddr_t)lfp, M_NFSDLOCKFILE);
}
/*
* This function looks up an nfsstate structure via stateid.
*/
static int
nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp, __unused u_int32_t flags,
struct nfsstate **stpp)
{
struct nfsstate *stp;
struct nfsstatehead *hp;
int error = 0;
*stpp = NULL;
hp = NFSSTATEHASH(clp, *stateidp);
LIST_FOREACH(stp, hp, ls_hash) {
if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
NFSX_STATEIDOTHER))
break;
}
/*
* If no state id in list, return NFSERR_BADSTATEID.
*/
if (stp == LIST_END(hp)) {
error = NFSERR_BADSTATEID;
goto out;
}
*stpp = stp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function gets an nfsstate structure via owner string.
*/
static void
nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
struct nfsstate **stpp)
{
struct nfsstate *stp;
*stpp = NULL;
LIST_FOREACH(stp, hp, ls_list) {
if (new_stp->ls_ownerlen == stp->ls_ownerlen &&
!NFSBCMP(new_stp->ls_owner,stp->ls_owner,stp->ls_ownerlen)) {
*stpp = stp;
return;
}
}
}
/*
* Lock control function called to update lock status.
* Returns 0 upon success, -1 if there is no lock and the flags indicate
* that one isn't to be created, or an NFSERR_xxx for other errors.
* The structures new_stp and new_lop are passed in as pointers that should
* be set to NULL if the structure is used and shouldn't be free'd.
* For the NFSLCK_TEST and NFSLCK_CHECK cases, the structures are
* never used and can safely be allocated on the stack. For all other
* cases, *new_stpp and *new_lopp should be malloc'd before the call,
* in case they are used.
*/
APPLESTATIC int
nfsrv_lockctrl(vnode_t vp, struct nfsstate **new_stpp,
struct nfslock **new_lopp, struct nfslockconflict *cfp,
nfsquad_t clientid, nfsv4stateid_t *stateidp,
__unused struct nfsexstuff *exp,
struct nfsrv_descript *nd, NFSPROC_T *p)
{
struct nfslock *lop;
struct nfsstate *new_stp = *new_stpp;
struct nfslock *new_lop = *new_lopp;
struct nfsstate *tstp, *mystp, *nstp;
int specialid = 0;
struct nfslockfile *lfp;
struct nfslock *other_lop = NULL;
struct nfsstate *stp, *lckstp = NULL;
struct nfsclient *clp = NULL;
u_int32_t bits;
int error = 0, haslock = 0, ret, reterr;
int getlckret, delegation = 0, filestruct_locked, vnode_unlocked = 0;
fhandle_t nfh;
uint64_t first, end;
uint32_t lock_flags;
if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_SETATTR)) {
/*
* Note the special cases of "all 1s" or "all 0s" stateids and
* let reads with all 1s go ahead.
*/
if (new_stp->ls_stateid.seqid == 0x0 &&
new_stp->ls_stateid.other[0] == 0x0 &&
new_stp->ls_stateid.other[1] == 0x0 &&
new_stp->ls_stateid.other[2] == 0x0)
specialid = 1;
else if (new_stp->ls_stateid.seqid == 0xffffffff &&
new_stp->ls_stateid.other[0] == 0xffffffff &&
new_stp->ls_stateid.other[1] == 0xffffffff &&
new_stp->ls_stateid.other[2] == 0xffffffff)
specialid = 2;
}
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, specialid);
if (error)
goto out;
/*
* Check for state resource limit exceeded.
*/
if ((new_stp->ls_flags & NFSLCK_LOCK) &&
nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
/*
* For the lock case, get another nfslock structure,
* just in case we need it.
* Malloc now, before we start sifting through the linked lists,
* in case we have to wait for memory.
*/
tryagain:
if (new_stp->ls_flags & NFSLCK_LOCK)
MALLOC(other_lop, struct nfslock *, sizeof (struct nfslock),
M_NFSDLOCK, M_WAITOK);
filestruct_locked = 0;
reterr = 0;
lfp = NULL;
/*
* Get the lockfile structure for CFH now, so we can do a sanity
* check against the stateid, before incrementing the seqid#, since
* we want to return NFSERR_BADSTATEID on failure and the seqid#
* shouldn't be incremented for this case.
* If nfsrv_getlockfile() returns -1, it means "not found", which
* will be handled later.
* If we are doing Lock/LockU and local locking is enabled, sleep
* lock the nfslockfile structure.
*/
getlckret = nfsrv_getlockfh(vp, new_stp->ls_flags, NULL, &nfh, p);
NFSLOCKSTATE();
if (getlckret == 0) {
if ((new_stp->ls_flags & (NFSLCK_LOCK | NFSLCK_UNLOCK)) != 0 &&
nfsrv_dolocallocks != 0 && nd->nd_repstat == 0) {
getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
&lfp, &nfh, 1);
if (getlckret == 0)
filestruct_locked = 1;
} else
getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
&lfp, &nfh, 0);
}
if (getlckret != 0 && getlckret != -1)
reterr = getlckret;
if (filestruct_locked != 0) {
LIST_INIT(&lfp->lf_rollback);
if ((new_stp->ls_flags & NFSLCK_LOCK)) {
/*
* For local locking, do the advisory locking now, so
* that any conflict can be detected. A failure later
* can be rolled back locally. If an error is returned,
* struct nfslockfile has been unlocked and any local
* locking rolled back.
*/
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl1");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
reterr = nfsrv_locallock(vp, lfp,
(new_lop->lo_flags & (NFSLCK_READ | NFSLCK_WRITE)),
new_lop->lo_first, new_lop->lo_end, cfp, p);
NFSLOCKSTATE();
}
}
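/*
* For a regular stateid, look up the client and the state it
* refers to and check the seqid#. The special stateids skip
* these checks.
*/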
if (specialid == 0) {
if (new_stp->ls_flags & NFSLCK_TEST) {
/*
* RFC 3530 does not list LockT as an op that renews a
* lease, but the consensus seems to be that it is ok
* for a server to do so.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
/*
* Since NFSERR_EXPIRED and NFSERR_ADMINREVOKED are not valid
* error returns for LockT, just go ahead and test for a lock;
* there are no locks for this client, but locks held by other
* clients can still conflict. (i.e. the same-client check will
* always be false)
*/
if (error == NFSERR_EXPIRED || error == NFSERR_ADMINREVOKED)
error = 0;
lckstp = new_stp;
} else {
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error == 0)
/*
* Look up the stateid
*/
error = nfsrv_getstate(clp, &new_stp->ls_stateid,
new_stp->ls_flags, &stp);
/*
* For an open stateid, do some sanity checks: the open must be
* confirmed and the stateid must refer to this file
*/
if (error == 0 && (stp->ls_flags & NFSLCK_OPEN) &&
((stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM) ||
(getlckret == 0 && stp->ls_lfp != lfp))){
/*
* NFSLCK_SETATTR should return OK rather than NFSERR_BADSTATEID.
* The only exception is using SETATTR with SIZE.
*/
if ((new_stp->ls_flags &
(NFSLCK_SETATTR | NFSLCK_CHECK)) != NFSLCK_SETATTR)
error = NFSERR_BADSTATEID;
}
if (error == 0 &&
(stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) &&
getlckret == 0 && stp->ls_lfp != lfp)
error = NFSERR_BADSTATEID;
/*
* If the lockowner stateid doesn't refer to the same file,
* I believe that is considered ok, since some clients will
* only create a single lockowner and use that for all locks
* on all files.
* For now, log it as a diagnostic, instead of considering it
* a BadStateid.
*/
if (error == 0 && (stp->ls_flags &
(NFSLCK_OPEN | NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) == 0 &&
getlckret == 0 && stp->ls_lfp != lfp) {
#ifdef DIAGNOSTIC
printf("Got a lock stateid for different file open\n");
#endif
/*
error = NFSERR_BADSTATEID;
*/
}
if (error == 0) {
if (new_stp->ls_flags & NFSLCK_OPENTOLOCK) {
/*
* If haslock set, we've already checked the seqid.
*/
if (!haslock) {
if (stp->ls_flags & NFSLCK_OPEN)
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp->ls_openowner, new_stp->ls_op);
else
error = NFSERR_BADSTATEID;
}
if (!error)
nfsrv_getowner(&stp->ls_open, new_stp, &lckstp);
if (lckstp)
/*
* I believe this should be an error, but it
* isn't obvious what NFSERR_xxx would be
* appropriate, so I'll use NFSERR_INVAL for now.
*/
error = NFSERR_INVAL;
else
lckstp = new_stp;
} else if (new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK)) {
/*
* If haslock set, ditto above.
*/
if (!haslock) {
if (stp->ls_flags & NFSLCK_OPEN)
error = NFSERR_BADSTATEID;
else
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp, new_stp->ls_op);
}
lckstp = stp;
} else {
lckstp = stp;
}
}
/*
* If the seqid part of the stateid isn't the same, return
* NFSERR_OLDSTATEID for cases other than I/O Ops.
* For I/O Ops, only return NFSERR_OLDSTATEID if
* nfsrv_returnoldstateid is set. (The consensus on the email
* list was that most clients would prefer to not receive
* NFSERR_OLDSTATEID for I/O Ops, but the RFC suggests that that
* is what will happen, so I use the nfsrv_returnoldstateid to
* allow for either server configuration.)
*/
if (!error && stp->ls_stateid.seqid!=new_stp->ls_stateid.seqid &&
(((nd->nd_flag & ND_NFSV41) == 0 &&
(!(new_stp->ls_flags & NFSLCK_CHECK) ||
nfsrv_returnoldstateid)) ||
((nd->nd_flag & ND_NFSV41) != 0 &&
new_stp->ls_stateid.seqid != 0)))
error = NFSERR_OLDSTATEID;
}
}
/*
* Now we can check for grace.
*/
if (!error)
error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
nfsrv_checkstable(clp))
error = NFSERR_NOGRACE;
/*
* If we successfully Reclaimed state, note that.
*/
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error)
nfsrv_markstable(clp);
/*
* At this point, either error == NFSERR_BADSTATEID or the
* seqid# has been updated, so we can return any error.
* If error == 0, there may be an error in:
* nd_repstat - Set by the calling function.
* reterr - Set above, if getting the nfslockfile structure
* or acquiring the local lock failed.
* (If both of these are set, nd_repstat should probably be
* returned, since that error was detected before this
* function call.)
*/
if (error != 0 || nd->nd_repstat != 0 || reterr != 0) {
if (error == 0) {
if (nd->nd_repstat != 0)
error = nd->nd_repstat;
else
error = reterr;
}
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl2");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
goto out;
}
/*
* Check the nfsrv_getlockfile return.
* Returned -1 if no structure found.
*/
if (getlckret == -1) {
error = NFSERR_EXPIRED;
/*
* Called from lockt, so no lock is OK.
*/
if (new_stp->ls_flags & NFSLCK_TEST) {
error = 0;
} else if (new_stp->ls_flags &
(NFSLCK_CHECK | NFSLCK_SETATTR)) {
/*
* Called to check for a lock, OK if the stateid is all
* 1s or all 0s, but there should be an nfsstate
* otherwise.
* (ie. If there is no open, I'll assume no share
* deny bits.)
*/
if (specialid)
error = 0;
else
error = NFSERR_BADSTATEID;
}
NFSUNLOCKSTATE();
goto out;
}
/*
* For NFSLCK_CHECK and NFSLCK_LOCK, test for a share conflict.
* For NFSLCK_CHECK, allow a read if write access is granted,
* but check for a deny. For NFSLCK_LOCK, require correct access,
* which implies a conflicting deny can't exist.
*/
if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_LOCK)) {
/*
* Four kinds of state id:
* - specialid (all 0s or all 1s), only for NFSLCK_CHECK
* - stateid for an open
* - stateid for a delegation
* - stateid for a lock owner
*/
if (!specialid) {
if (stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
delegation = 1;
mystp = stp;
nfsrv_delaydelegtimeout(stp);
} else if (stp->ls_flags & NFSLCK_OPEN) {
mystp = stp;
} else {
mystp = stp->ls_openstp;
}
/*
* If locking or checking, require correct access
* bit set.
*/
if (((new_stp->ls_flags & NFSLCK_LOCK) &&
!((new_lop->lo_flags >> NFSLCK_LOCKSHIFT) &
mystp->ls_flags & NFSLCK_ACCESSBITS)) ||
((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_READACCESS)) ==
(NFSLCK_CHECK | NFSLCK_READACCESS) &&
!(mystp->ls_flags & NFSLCK_READACCESS) &&
nfsrv_allowreadforwriteopen == 0) ||
((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_WRITEACCESS)) ==
(NFSLCK_CHECK | NFSLCK_WRITEACCESS) &&
!(mystp->ls_flags & NFSLCK_WRITEACCESS))) {
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp,
"nfsrv_lockctrl3");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
error = NFSERR_OPENMODE;
goto out;
}
} else
mystp = NULL;
if ((new_stp->ls_flags & NFSLCK_CHECK) && !delegation) {
/*
* Check for a conflicting deny bit.
*/
LIST_FOREACH(tstp, &lfp->lf_open, ls_file) {
if (tstp != mystp) {
bits = tstp->ls_flags;
bits >>= NFSLCK_SHIFT;
if (new_stp->ls_flags & bits & NFSLCK_ACCESSBITS) {
KASSERT(vnode_unlocked == 0,
("nfsrv_lockctrl: vnode unlocked1"));
ret = nfsrv_clientconflict(tstp->ls_clp, &haslock,
vp, p);
if (ret == 1) {
/*
* nfsrv_clientconflict unlocks state
* when it returns non-zero.
*/
lckstp = NULL;
goto tryagain;
}
if (ret == 0)
NFSUNLOCKSTATE();
if (ret == 2)
error = NFSERR_PERM;
else
error = NFSERR_OPENMODE;
goto out;
}
}
}
/* We're outta here */
NFSUNLOCKSTATE();
goto out;
}
}
/*
* For setattr, just get rid of all the Delegations for other clients.
*/
if (new_stp->ls_flags & NFSLCK_SETATTR) {
KASSERT(vnode_unlocked == 0,
("nfsrv_lockctrl: vnode unlocked2"));
ret = nfsrv_cleandeleg(vp, lfp, clp, &haslock, p);
if (ret) {
/*
* nfsrv_cleandeleg() unlocks state when it
* returns non-zero.
*/
if (ret == -1) {
lckstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
if (!(new_stp->ls_flags & NFSLCK_CHECK) ||
(LIST_EMPTY(&lfp->lf_open) && LIST_EMPTY(&lfp->lf_lock) &&
LIST_EMPTY(&lfp->lf_deleg))) {
NFSUNLOCKSTATE();
goto out;
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* I currently believe the conflict algorithm to be:
* For Lock Ops (Lock/LockT/LockU)
* - there is a conflict iff a different client has a write delegation
* For Reading (Read Op)
* - there is a conflict iff a different client has a write delegation
* (the specialids are always a different client)
* For Writing (Write/Setattr of size)
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (I don't understand why this isn't allowed, but that seems to be
* the current consensus?)
*/
tstp = LIST_FIRST(&lfp->lf_deleg);
while (tstp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(tstp, ls_file);
if ((((new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK|NFSLCK_TEST))||
((new_stp->ls_flags & NFSLCK_CHECK) &&
(new_lop->lo_flags & NFSLCK_READ))) &&
clp != tstp->ls_clp &&
(tstp->ls_flags & NFSLCK_DELEGWRITE)) ||
((new_stp->ls_flags & NFSLCK_CHECK) &&
(new_lop->lo_flags & NFSLCK_WRITE) &&
(clp != tstp->ls_clp ||
(tstp->ls_flags & NFSLCK_DELEGREAD)))) {
ret = 0;
if (filestruct_locked != 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl4");
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
vnode_unlocked = 0;
if ((vp->v_iflag & VI_DOOMED) != 0)
ret = NFSERR_SERVERFAULT;
NFSLOCKSTATE();
}
if (ret == 0)
ret = nfsrv_delegconflict(tstp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict unlocks state when it
* returns non-zero, which it always does.
*/
if (other_lop) {
FREE((caddr_t)other_lop, M_NFSDLOCK);
other_lop = NULL;
}
if (ret == -1) {
lckstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
/* Never gets here. */
}
tstp = nstp;
}
/*
* Handle the unlock case by calling nfsrv_updatelock().
* (Should I have done some access checking above for unlock? For now,
* just let it happen.)
*/
if (new_stp->ls_flags & NFSLCK_UNLOCK) {
first = new_lop->lo_first;
end = new_lop->lo_end;
nfsrv_updatelock(stp, new_lopp, &other_lop, lfp);
stateidp->seqid = ++(stp->ls_stateid.seqid);
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = stp->ls_stateid.seqid = 1;
stateidp->other[0] = stp->ls_stateid.other[0];
stateidp->other[1] = stp->ls_stateid.other[1];
stateidp->other[2] = stp->ls_stateid.other[2];
if (filestruct_locked != 0) {
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl5");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
/* Update the local locks. */
nfsrv_localunlock(vp, lfp, first, end, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
goto out;
}
/*
* Search for a conflicting lock. A lock conflicts if:
* - the lock range overlaps and
* - at least one lock is a write lock and
* - it is not owned by the same lock owner
*/
if (!delegation) {
LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
if (new_lop->lo_end > lop->lo_first &&
new_lop->lo_first < lop->lo_end &&
(new_lop->lo_flags == NFSLCK_WRITE ||
lop->lo_flags == NFSLCK_WRITE) &&
lckstp != lop->lo_stp &&
(clp != lop->lo_stp->ls_clp ||
lckstp->ls_ownerlen != lop->lo_stp->ls_ownerlen ||
NFSBCMP(lckstp->ls_owner, lop->lo_stp->ls_owner,
lckstp->ls_ownerlen))) {
if (other_lop) {
FREE((caddr_t)other_lop, M_NFSDLOCK);
other_lop = NULL;
}
if (vnode_unlocked != 0)
ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
NULL, p);
else
ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
vp, p);
if (ret == 1) {
if (filestruct_locked != 0) {
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl6");
NFSVOPUNLOCK(vp, 0);
}
/* Roll back local locks. */
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
vnode_unlocked = 0;
if ((vp->v_iflag & VI_DOOMED) != 0) {
error = NFSERR_SERVERFAULT;
goto out;
}
}
/*
* nfsrv_clientconflict() unlocks state when it
* returns non-zero.
*/
lckstp = NULL;
goto tryagain;
}
/*
* Found a conflicting lock, so record the conflict and
* return the error.
*/
if (cfp != NULL && ret == 0) {
cfp->cl_clientid.lval[0]=lop->lo_stp->ls_stateid.other[0];
cfp->cl_clientid.lval[1]=lop->lo_stp->ls_stateid.other[1];
cfp->cl_first = lop->lo_first;
cfp->cl_end = lop->lo_end;
cfp->cl_flags = lop->lo_flags;
cfp->cl_ownerlen = lop->lo_stp->ls_ownerlen;
NFSBCOPY(lop->lo_stp->ls_owner, cfp->cl_owner,
cfp->cl_ownerlen);
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else if (new_stp->ls_flags & NFSLCK_CHECK)
error = NFSERR_LOCKED;
else
error = NFSERR_DENIED;
if (filestruct_locked != 0 && ret == 0) {
/* Roll back local locks. */
NFSUNLOCKSTATE();
if (vnode_unlocked == 0) {
ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl7");
vnode_unlocked = 1;
NFSVOPUNLOCK(vp, 0);
}
nfsrv_locallock_rollback(vp, lfp, p);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
if (ret == 0)
NFSUNLOCKSTATE();
goto out;
}
}
}
/*
* We only get here if there was no lock that conflicted.
*/
if (new_stp->ls_flags & (NFSLCK_TEST | NFSLCK_CHECK)) {
NFSUNLOCKSTATE();
goto out;
}
/*
* We only get here when we are creating or modifying a lock.
* There are two variants:
* - exist_lock_owner where lock_owner exists
* - open_to_lock_owner with new lock_owner
*/
first = new_lop->lo_first;
end = new_lop->lo_end;
lock_flags = new_lop->lo_flags;
if (!(new_stp->ls_flags & NFSLCK_OPENTOLOCK)) {
nfsrv_updatelock(lckstp, new_lopp, &other_lop, lfp);
stateidp->seqid = ++(lckstp->ls_stateid.seqid);
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = lckstp->ls_stateid.seqid = 1;
stateidp->other[0] = lckstp->ls_stateid.other[0];
stateidp->other[1] = lckstp->ls_stateid.other[1];
stateidp->other[2] = lckstp->ls_stateid.other[2];
} else {
/*
* The new open_to_lock_owner case.
* Link the new nfsstate into the lists.
*/
new_stp->ls_seq = new_stp->ls_opentolockseq;
nfsrvd_refcache(new_stp->ls_op);
stateidp->seqid = new_stp->ls_stateid.seqid = 1;
stateidp->other[0] = new_stp->ls_stateid.other[0] =
clp->lc_clientid.lval[0];
stateidp->other[1] = new_stp->ls_stateid.other[1] =
clp->lc_clientid.lval[1];
stateidp->other[2] = new_stp->ls_stateid.other[2] =
nfsrv_nextstateindex(clp);
new_stp->ls_clp = clp;
LIST_INIT(&new_stp->ls_lock);
new_stp->ls_openstp = stp;
new_stp->ls_lfp = lfp;
nfsrv_insertlock(new_lop, (struct nfslock *)new_stp, new_stp,
lfp);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_stp->ls_stateid),
new_stp, ls_hash);
LIST_INSERT_HEAD(&stp->ls_open, new_stp, ls_list);
*new_lopp = NULL;
*new_stpp = NULL;
nfsstatsv1.srvlockowners++;
nfsrv_openpluslock++;
}
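/*
* Commit the local advisory locks now that the NFSv4 lock
* state has been updated.
*/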
if (filestruct_locked != 0) {
NFSUNLOCKSTATE();
nfsrv_locallock_commit(lfp, lock_flags, first, end);
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
}
NFSUNLOCKSTATE();
out:
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
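/*
* If the vnode was unlocked for local locking, relock it and
* check that it has not been doomed in the meantime.
*/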
if (vnode_unlocked != 0) {
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
error = NFSERR_SERVERFAULT;
}
if (other_lop)
FREE((caddr_t)other_lop, M_NFSDLOCK);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check for state errors for Open.
* repstat is passed back out as an error if more critical errors
* are not detected.
*/
APPLESTATIC int
nfsrv_opencheck(nfsquad_t clientid, nfsv4stateid_t *stateidp,
struct nfsstate *new_stp, vnode_t vp, struct nfsrv_descript *nd,
NFSPROC_T *p, int repstat)
{
struct nfsstate *stp, *nstp;
struct nfsclient *clp;
struct nfsstate *ownerstp;
struct nfslockfile *lfp, *new_lfp;
int error = 0, haslock = 0, ret, readonly = 0, getfhret = 0;
if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
readonly = 1;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
/*
* Check for state resource limit exceeded.
* Technically this should be SMP protected, but the worst
* case error is "out by one or two" on the count when it
* returns NFSERR_RESOURCE and the limit is just a rather
* arbitrary high water mark, so no harm is done.
*/
if (nfsrv_openpluslock > nfsrv_v4statelimit) {
error = NFSERR_RESOURCE;
goto out;
}
tryagain:
MALLOC(new_lfp, struct nfslockfile *, sizeof (struct nfslockfile),
M_NFSDLOCKFILE, M_WAITOK);
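/*
* Get the file handle before acquiring the state lock. The
* new_lfp just allocated is freed below if nfsrv_getlockfile()
* does not consume it.
*/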
if (vp)
getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
NULL, p);
NFSLOCKSTATE();
/*
* Get the nfsclient structure.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
/*
* Look up the open owner. See if it needs confirmation and
* check the seq#, as required.
*/
if (!error)
nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
if (!error && ownerstp) {
error = nfsrv_checkseqid(nd, new_stp->ls_seq, ownerstp,
new_stp->ls_op);
/*
* If the OpenOwner hasn't been confirmed, assume the
* old one was a replay and this one is ok.
* See: RFC3530 Sec. 14.2.18.
*/
if (error == NFSERR_BADSEQID &&
(ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM))
error = 0;
}
/*
* Check for grace.
*/
if (!error)
error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
nfsrv_checkstable(clp))
error = NFSERR_NOGRACE;
/*
* If none of the above errors occurred, let repstat be
* returned.
*/
if (repstat && !error)
error = repstat;
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
free((caddr_t)new_lfp, M_NFSDLOCKFILE);
goto out;
}
/*
* If vp == NULL, the file doesn't exist yet, so return ok.
* (This always happens on the first pass, so haslock must be 0.)
*/
if (vp == NULL) {
NFSUNLOCKSTATE();
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
goto out;
}
/*
* Get the structure for the underlying file.
*/
if (getfhret)
error = getfhret;
else
error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
NULL, 0);
if (new_lfp)
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Search for a conflicting open/share.
*/
if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
/*
* For Delegate_Cur, search for the matching Delegation,
* which indicates no conflict.
* An old delegation should have been recovered by the
* client doing a Claim_DELEGATE_Prev, so I won't let
* it match and return NFSERR_EXPIRED. Should I let it
* match?
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
(((nd->nd_flag & ND_NFSV41) != 0 &&
stateidp->seqid == 0) ||
stateidp->seqid == stp->ls_stateid.seqid) &&
!NFSBCMP(stateidp->other, stp->ls_stateid.other,
NFSX_STATEIDOTHER))
break;
}
if (stp == LIST_END(&lfp->lf_deleg) ||
((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
(stp->ls_flags & NFSLCK_DELEGREAD))) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
}
/*
* Check for access/deny bit conflicts. I check for the same
* owner as well, in case the client didn't bother.
*/
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (!(new_stp->ls_flags & NFSLCK_DELEGCUR) &&
(((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
((stp->ls_flags & NFSLCK_ACCESSBITS) &
((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS)))){
ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
if (ret == 1) {
/*
* nfsrv_clientconflict() unlocks
* state when it returns non-zero.
*/
goto tryagain;
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else
error = NFSERR_SHAREDENIED;
if (ret == 0)
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* (If NFSLCK_DELEGCUR is set, it has a delegation, so there
* isn't a conflict.)
* I currently believe the conflict algorithm to be:
* For Open with Read Access and Deny None
* - there is a conflict iff a different client has a write delegation
* For Open with other Write Access or any Deny except None
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (The current consensus is that this last case should be
* considered a conflict since the client with a read delegation
* could have done an Open with ReadAccess and WriteDeny
* locally and then not have checked for the WriteDeny.)
* Don't check for a Reclaim, since that will be dealt with
* by nfsrv_openctrl().
*/
if (!(new_stp->ls_flags &
(NFSLCK_DELEGPREV | NFSLCK_DELEGCUR | NFSLCK_RECLAIM))) {
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if ((readonly && stp->ls_clp != clp &&
(stp->ls_flags & NFSLCK_DELEGWRITE)) ||
(!readonly && (stp->ls_clp != clp ||
(stp->ls_flags & NFSLCK_DELEGREAD)))) {
ret = nfsrv_delegconflict(stp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
if (ret == -1)
goto tryagain;
error = ret;
goto out;
}
}
stp = nstp;
}
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Open control function to create/update open state for an open.
*/
APPLESTATIC int
nfsrv_openctrl(struct nfsrv_descript *nd, vnode_t vp,
struct nfsstate **new_stpp, nfsquad_t clientid, nfsv4stateid_t *stateidp,
nfsv4stateid_t *delegstateidp, u_int32_t *rflagsp, struct nfsexstuff *exp,
NFSPROC_T *p, u_quad_t filerev)
{
struct nfsstate *new_stp = *new_stpp;
struct nfsstate *stp, *nstp;
struct nfsstate *openstp = NULL, *new_open, *ownerstp, *new_deleg;
struct nfslockfile *lfp, *new_lfp;
struct nfsclient *clp;
int error = 0, haslock = 0, ret, delegate = 1, writedeleg = 1;
int readonly = 0, cbret = 1, getfhret = 0;
int gotstate = 0, len = 0;
u_char *clidp = NULL;
if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
readonly = 1;
/*
* Check for restart conditions (client and server).
* (Paranoia, should have been detected by nfsrv_opencheck().)
* If an error does show up, return NFSERR_EXPIRED, since the
* seqid# has already been incremented.
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error) {
printf("Nfsd: openctrl unexpected restart err=%d\n",
error);
error = NFSERR_EXPIRED;
goto out;
}
clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
tryagain:
MALLOC(new_lfp, struct nfslockfile *, sizeof (struct nfslockfile),
M_NFSDLOCKFILE, M_WAITOK);
MALLOC(new_open, struct nfsstate *, sizeof (struct nfsstate),
M_NFSDSTATE, M_WAITOK);
MALLOC(new_deleg, struct nfsstate *, sizeof (struct nfsstate),
M_NFSDSTATE, M_WAITOK);
getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
NULL, p);
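/*
* All of the above allocations are done before the state lock
* is acquired, since M_WAITOK allocations may sleep.
*/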
NFSLOCKSTATE();
/*
* Get the client structure. Since the linked lists could be changed
* by other nfsd processes if this process does a tsleep(), one of
* two things must be done.
* 1 - don't tsleep()
* or
* 2 - get the nfsv4_lock() { indicated by haslock == 1 }
* before using the lists, since this lock stops the other
* nfsd. This should only be used for rare cases, since it
* essentially single threads the nfsd.
* At this time, it is only done for cases where the stable
* storage file must be written prior to completion of state
* expiration.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (!error && (clp->lc_flags & LCL_NEEDSCBNULL) &&
clp->lc_program) {
/*
* This happens on the first open for a client
* that supports callbacks.
*/
NFSUNLOCKSTATE();
/*
* Although nfsrv_docallback() will sleep, clp won't
* go away, since they are only removed when the
* nfsv4_lock() has blocked the nfsd threads. The
* fields in clp can change, but having multiple
* threads do this Null callback RPC should be
* harmless.
*/
cbret = nfsrv_docallback(clp, NFSV4PROC_CBNULL,
NULL, 0, NULL, NULL, NULL, p);
NFSLOCKSTATE();
clp->lc_flags &= ~LCL_NEEDSCBNULL;
if (!cbret)
clp->lc_flags |= LCL_CALLBACKSON;
}
/*
* Look up the open owner. See if it needs confirmation and
* check the seq#, as required.
*/
if (!error)
nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
if (error) {
NFSUNLOCKSTATE();
printf("Nfsd: openctrl unexpected state err=%d\n",
error);
free((caddr_t)new_lfp, M_NFSDLOCKFILE);
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
if (new_stp->ls_flags & NFSLCK_RECLAIM)
nfsrv_markstable(clp);
/*
* Get the structure for the underlying file.
*/
if (getfhret)
error = getfhret;
else
error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
NULL, 0);
if (new_lfp)
FREE((caddr_t)new_lfp, M_NFSDLOCKFILE);
if (error) {
NFSUNLOCKSTATE();
printf("Nfsd openctrl unexpected getlockfile err=%d\n",
error);
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Search for a conflicting open/share.
*/
if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
/*
* For Delegate_Cur, search for the matching Delegation,
* which indicates no conflict.
* An old delegation should have been recovered by the
* client doing a Claim_DELEGATE_Prev, so I won't let
* it match and return NFSERR_EXPIRED. Should I let it
* match?
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
(((nd->nd_flag & ND_NFSV41) != 0 &&
stateidp->seqid == 0) ||
stateidp->seqid == stp->ls_stateid.seqid) &&
!NFSBCMP(stateidp->other, stp->ls_stateid.other,
NFSX_STATEIDOTHER))
break;
}
if (stp == LIST_END(&lfp->lf_deleg) ||
((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
(stp->ls_flags & NFSLCK_DELEGREAD))) {
NFSUNLOCKSTATE();
printf("Nfsd openctrl unexpected expiry\n");
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_EXPIRED;
goto out;
}
/*
* Don't issue a Delegation, since one already exists, and
* delay the delegation timeout, as required.
*/
delegate = 0;
nfsrv_delaydelegtimeout(stp);
}
/*
* Check for access/deny bit conflicts. I also check for the
* same owner, since the client might not have bothered to check.
* Also, note an open for the same file and owner, if found,
* which is all we do here for Delegate_Cur, since conflict
* checking is already done.
*/
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (ownerstp && stp->ls_openowner == ownerstp)
openstp = stp;
if (!(new_stp->ls_flags & NFSLCK_DELEGCUR)) {
/*
* If another client has the file open, the only
* delegation that can be issued is a Read delegation
* and only if it is a Read open with Deny none.
*/
if (clp != stp->ls_clp) {
if ((stp->ls_flags & NFSLCK_SHAREBITS) ==
NFSLCK_READACCESS)
writedeleg = 0;
else
delegate = 0;
}
if(((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
((stp->ls_flags & NFSLCK_ACCESSBITS) &
((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS))){
ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
if (ret == 1) {
/*
* nfsrv_clientconflict() unlocks state
* when it returns non-zero.
*/
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
openstp = NULL;
goto tryagain;
}
if (ret == 2)
error = NFSERR_PERM;
else if (new_stp->ls_flags & NFSLCK_RECLAIM)
error = NFSERR_RECLAIMCONFLICT;
else
error = NFSERR_SHAREDENIED;
if (ret == 0)
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
printf("nfsd openctrl unexpected client cnfl\n");
goto out;
}
}
}
/*
* Check for a conflicting delegation. If one is found, call
* nfsrv_delegconflict() to handle it. If the v4root lock hasn't
* been set yet, it will get the lock. Otherwise, it will recall
* the delegation. Then, we try again...
* (If NFSLCK_DELEGCUR is set, it has a delegation, so there
* isn't a conflict.)
* I currently believe the conflict algorithm to be:
* For Open with Read Access and Deny None
* - there is a conflict iff a different client has a write delegation
* For Open with other Write Access or any Deny except None
* - there is a conflict if a different client has any delegation
* - there is a conflict if the same client has a read delegation
* (The current consensus is that this last case should be
* considered a conflict since the client with a read delegation
* could have done an Open with ReadAccess and WriteDeny
* locally and then not have checked for the WriteDeny.)
*/
if (!(new_stp->ls_flags & (NFSLCK_DELEGPREV | NFSLCK_DELEGCUR))) {
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if (stp->ls_clp != clp && (stp->ls_flags & NFSLCK_DELEGREAD))
writedeleg = 0;
else
delegate = 0;
if ((readonly && stp->ls_clp != clp &&
(stp->ls_flags & NFSLCK_DELEGWRITE)) ||
(!readonly && (stp->ls_clp != clp ||
(stp->ls_flags & NFSLCK_DELEGREAD)))) {
if (new_stp->ls_flags & NFSLCK_RECLAIM) {
delegate = 2;
} else {
ret = nfsrv_delegconflict(stp, &haslock, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
printf("Nfsd openctrl unexpected deleg cnfl\n");
free((caddr_t)new_open, M_NFSDSTATE);
free((caddr_t)new_deleg, M_NFSDSTATE);
if (ret == -1) {
openstp = NULL;
goto tryagain;
}
error = ret;
goto out;
}
}
}
stp = nstp;
}
}
/*
* We only get here if there was no open that conflicted.
* If an open for this owner already exists, merge the new
* access/deny bits into it.
* Otherwise it is a new open. If the open_owner hasn't been
* confirmed, replace the open with the new one needing confirmation,
* otherwise add the open.
*/
if (new_stp->ls_flags & NFSLCK_DELEGPREV) {
/*
* Handle NFSLCK_DELEGPREV by searching the old delegations for
* a match. If found, just move the old delegation to the current
* delegation list and issue open. If not found, return
* NFSERR_EXPIRED.
*/
LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
if (stp->ls_lfp == lfp) {
/* Found it */
if (stp->ls_clp != clp)
panic("olddeleg clp");
LIST_REMOVE(stp, ls_list);
LIST_REMOVE(stp, ls_hash);
stp->ls_flags &= ~NFSLCK_OLDDELEG;
stp->ls_stateid.seqid = delegstateidp->seqid = 1;
stp->ls_stateid.other[0] = delegstateidp->other[0] =
clp->lc_clientid.lval[0];
stp->ls_stateid.other[1] = delegstateidp->other[1] =
clp->lc_clientid.lval[1];
stp->ls_stateid.other[2] = delegstateidp->other[2] =
nfsrv_nextstateindex(clp);
stp->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&clp->lc_deleg, stp, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
stp->ls_stateid), stp, ls_hash);
if (stp->ls_flags & NFSLCK_DELEGWRITE)
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
else
*rflagsp |= NFSV4OPEN_READDELEGATE;
clp->lc_delegtime = NFSD_MONOSEC +
nfsrv_lease + NFSRV_LEASEDELTA;
/*
* Now, do the associated open.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags&NFSLCK_DENYBITS)|
NFSLCK_OPEN;
if (stp->ls_flags & NFSLCK_DELEGWRITE)
new_open->ls_flags |= (NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
else
new_open->ls_flags |= NFSLCK_READACCESS;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
/*
* and handle the open owner
*/
if (ownerstp) {
new_open->ls_openowner = ownerstp;
LIST_INSERT_HEAD(&ownerstp->ls_open,new_open,ls_list);
} else {
new_open->ls_openowner = new_stp;
new_stp->ls_flags = 0;
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
*new_stpp = NULL;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
break;
}
}
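/* If no matching old delegation was found, the reclaim fails. */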
if (stp == LIST_END(&clp->lc_olddeleg))
error = NFSERR_EXPIRED;
} else if (new_stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
/*
* Scan to make sure that no delegation for this client and file
* already exists.
* There also shouldn't yet be an Open for this file and
* openowner.
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (stp->ls_clp == clp)
break;
}
if (stp == LIST_END(&lfp->lf_deleg) && openstp == NULL) {
/*
* This is the Claim_Previous case with a delegation
* type != Delegate_None.
*/
/*
* First, add the delegation. (Although we must issue the
* delegation, we can also ask for an immediate return.)
*/
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0] =
clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1] =
clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2] =
nfsrv_nextstateindex(clp);
if (new_stp->ls_flags & NFSLCK_DELEGWRITE) {
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags = (NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
if (delegate == 2 || nfsrv_issuedelegs == 0 ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) ||
!NFSVNO_DELEGOK(vp))
*rflagsp |= NFSV4OPEN_RECALL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
/*
* Now, do the associated open.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_DENYBITS) |
NFSLCK_OPEN;
if (new_stp->ls_flags & NFSLCK_DELEGWRITE)
new_open->ls_flags |= (NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
else
new_open->ls_flags |= NFSLCK_READACCESS;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
/*
* and handle the open owner
*/
if (ownerstp) {
new_open->ls_openowner = ownerstp;
LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
} else {
new_open->ls_openowner = new_stp;
new_stp->ls_flags = 0;
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
*new_stpp = NULL;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
} else {
error = NFSERR_RECLAIMCONFLICT;
}
} else if (ownerstp) {
if (ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM) {
/* Replace the open */
if (ownerstp->ls_op)
nfsrvd_derefcache(ownerstp->ls_op);
ownerstp->ls_op = new_stp->ls_op;
nfsrvd_refcache(ownerstp->ls_op);
ownerstp->ls_seq = new_stp->ls_seq;
*rflagsp |= NFSV4OPEN_RESULTCONFIRM;
stp = LIST_FIRST(&ownerstp->ls_open);
stp->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
NFSLCK_OPEN;
stp->ls_stateid.seqid = 1;
stp->ls_uid = new_stp->ls_uid;
if (lfp != stp->ls_lfp) {
LIST_REMOVE(stp, ls_file);
LIST_INSERT_HEAD(&lfp->lf_open, stp, ls_file);
stp->ls_lfp = lfp;
}
openstp = stp;
} else if (openstp) {
openstp->ls_flags |= (new_stp->ls_flags & NFSLCK_SHAREBITS);
openstp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
openstp->ls_stateid.seqid == 0)
openstp->ls_stateid.seqid = 1;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate == 0 || writedeleg == 0 ||
NFSVNO_EXRDONLY(exp) || (readonly != 0 &&
nfsrv_writedelegifpos == 0) ||
!NFSVNO_DELEGOK(vp) ||
(new_stp->ls_flags & NFSLCK_WANTRDELEG) != 0 ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON)
*rflagsp |= NFSV4OPEN_WDCONTENTION;
else if (nfsrv_issuedelegs == 0 ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
*rflagsp |= NFSV4OPEN_WDRESOURCE;
else if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
*rflagsp |= NFSV4OPEN_WDNOTWANTED;
else {
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
nfsrv_writedelegcnt++;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
} else {
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS)|
NFSLCK_OPEN;
new_open->ls_uid = new_stp->ls_uid;
new_open->ls_openowner = ownerstp;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INIT(&new_open->ls_open);
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
openstp = new_open;
new_open = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate == 0 || (writedeleg == 0 && readonly == 0) ||
!NFSVNO_DELEGOK(vp) ||
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
LCL_CALLBACKSON)
*rflagsp |= NFSV4OPEN_WDCONTENTION;
else if (nfsrv_issuedelegs == 0 ||
NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
*rflagsp |= NFSV4OPEN_WDRESOURCE;
else if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
*rflagsp |= NFSV4OPEN_WDNOTWANTED;
else {
new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
(nfsrv_writedelegifpos || !readonly) &&
(new_stp->ls_flags & NFSLCK_WANTRDELEG) == 0) {
new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags = (NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
}
} else {
/*
* New owner case. Start the open_owner sequence with a
* Needs confirmation (unless a reclaim) and hang the
* new open off it.
*/
new_open->ls_stateid.seqid = 1;
new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
NFSLCK_OPEN;
new_open->ls_uid = new_stp->ls_uid;
LIST_INIT(&new_open->ls_open);
new_open->ls_openowner = new_stp;
new_open->ls_lfp = lfp;
new_open->ls_clp = clp;
LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
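/*
* Decide whether the new open_owner needs confirmation.
* Reclaims and NFSv4.1 opens never do.
*/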
if (new_stp->ls_flags & NFSLCK_RECLAIM) {
new_stp->ls_flags = 0;
} else if ((nd->nd_flag & ND_NFSV41) != 0) {
/* NFSv4.1 never needs confirmation. */
new_stp->ls_flags = 0;
/*
* This is where we can choose to issue a delegation.
*/
if (delegate && nfsrv_issuedelegs &&
(writedeleg || readonly) &&
(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) ==
LCL_CALLBACKSON &&
!NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) &&
NFSVNO_DELEGOK(vp) &&
((nd->nd_flag & ND_NFSV41) == 0 ||
(new_stp->ls_flags & NFSLCK_WANTNODELEG) == 0)) {
new_deleg->ls_stateid.seqid =
delegstateidp->seqid = 1;
new_deleg->ls_stateid.other[0] =
delegstateidp->other[0]
= clp->lc_clientid.lval[0];
new_deleg->ls_stateid.other[1] =
delegstateidp->other[1]
= clp->lc_clientid.lval[1];
new_deleg->ls_stateid.other[2] =
delegstateidp->other[2]
= nfsrv_nextstateindex(clp);
if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
(nfsrv_writedelegifpos || !readonly) &&
((nd->nd_flag & ND_NFSV41) == 0 ||
(new_stp->ls_flags & NFSLCK_WANTRDELEG) ==
0)) {
new_deleg->ls_flags =
(NFSLCK_DELEGWRITE |
NFSLCK_READACCESS |
NFSLCK_WRITEACCESS);
*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
nfsrv_writedelegcnt++;
} else {
new_deleg->ls_flags =
(NFSLCK_DELEGREAD |
NFSLCK_READACCESS);
*rflagsp |= NFSV4OPEN_READDELEGATE;
}
new_deleg->ls_uid = new_stp->ls_uid;
new_deleg->ls_lfp = lfp;
new_deleg->ls_clp = clp;
new_deleg->ls_filerev = filerev;
new_deleg->ls_compref = nd->nd_compref;
LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg,
ls_file);
LIST_INSERT_HEAD(NFSSTATEHASH(clp,
new_deleg->ls_stateid), new_deleg, ls_hash);
LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg,
ls_list);
new_deleg = NULL;
nfsstatsv1.srvdelegates++;
nfsrv_openpluslock++;
nfsrv_delegatecnt++;
}
/*
* Since NFSv4.1 never does an OpenConfirm, the first
* open state will be acquired here.
*/
if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
clp->lc_flags |= LCL_STAMPEDSTABLE;
len = clp->lc_idlen;
NFSBCOPY(clp->lc_id, clidp, len);
gotstate = 1;
}
} else {
*rflagsp |= NFSV4OPEN_RESULTCONFIRM;
new_stp->ls_flags = NFSLCK_NEEDSCONFIRM;
}
nfsrvd_refcache(new_stp->ls_op);
new_stp->ls_noopens = 0;
LIST_INIT(&new_stp->ls_open);
LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
new_open, ls_hash);
openstp = new_open;
new_open = NULL;
*new_stpp = NULL;
nfsstatsv1.srvopens++;
nfsrv_openpluslock++;
nfsstatsv1.srvopenowners++;
nfsrv_openpluslock++;
}
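/* Return the stateid of the open that was added or updated. */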
if (!error) {
stateidp->seqid = openstp->ls_stateid.seqid;
stateidp->other[0] = openstp->ls_stateid.other[0];
stateidp->other[1] = openstp->ls_stateid.other[1];
stateidp->other[2] = openstp->ls_stateid.other[2];
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
if (new_open)
FREE((caddr_t)new_open, M_NFSDSTATE);
if (new_deleg)
FREE((caddr_t)new_deleg, M_NFSDSTATE);
/*
* If the NFSv4.1 client just acquired its first open, write a timestamp
* to the stable storage file.
*/
if (gotstate != 0) {
nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
nfsrv_backupstable();
}
out:
free(clidp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Open update. Does the confirm, downgrade and close.
*/
APPLESTATIC int
nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid,
nfsv4stateid_t *stateidp, struct nfsrv_descript *nd, NFSPROC_T *p)
{
- struct nfsstate *stp, *ownerstp;
+ struct nfsstate *stp;
struct nfsclient *clp;
struct nfslockfile *lfp;
u_int32_t bits;
int error = 0, gotstate = 0, len = 0;
u_char *clidp = NULL;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
NFSLOCKSTATE();
/*
* Get the open structure via clientid and stateid.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (!error)
error = nfsrv_getstate(clp, &new_stp->ls_stateid,
new_stp->ls_flags, &stp);
/*
* Sanity check the open.
*/
if (!error && (!(stp->ls_flags & NFSLCK_OPEN) ||
(!(new_stp->ls_flags & NFSLCK_CONFIRM) &&
(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)) ||
((new_stp->ls_flags & NFSLCK_CONFIRM) &&
(!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)))))
error = NFSERR_BADSTATEID;
if (!error)
error = nfsrv_checkseqid(nd, new_stp->ls_seq,
stp->ls_openowner, new_stp->ls_op);
if (!error && stp->ls_stateid.seqid != new_stp->ls_stateid.seqid &&
(((nd->nd_flag & ND_NFSV41) == 0 &&
!(new_stp->ls_flags & NFSLCK_CONFIRM)) ||
((nd->nd_flag & ND_NFSV41) != 0 &&
new_stp->ls_stateid.seqid != 0)))
error = NFSERR_OLDSTATEID;
if (!error && vnode_vtype(vp) != VREG) {
if (vnode_vtype(vp) == VDIR)
error = NFSERR_ISDIR;
else
error = NFSERR_INVAL;
}
if (error) {
/*
* If a client tries to confirm an Open with a bad
* seqid# and there are no byte range locks or other Opens
* on the openowner, just throw it away, so the next use of the
* openowner will start a fresh seq#.
*/
if (error == NFSERR_BADSEQID &&
(new_stp->ls_flags & NFSLCK_CONFIRM) &&
nfsrv_nootherstate(stp))
nfsrv_freeopenowner(stp->ls_openowner, 0, p);
NFSUNLOCKSTATE();
goto out;
}
/*
* Set the return stateid.
*/
stateidp->seqid = stp->ls_stateid.seqid + 1;
if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
stateidp->seqid = 1;
stateidp->other[0] = stp->ls_stateid.other[0];
stateidp->other[1] = stp->ls_stateid.other[1];
stateidp->other[2] = stp->ls_stateid.other[2];
/*
* Now, handle the three cases.
*/
if (new_stp->ls_flags & NFSLCK_CONFIRM) {
/*
* If the open doesn't need confirmation, it seems to me that
* there is a client error, but I'll just log it and keep going?
*/
if (!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM))
printf("Nfsv4d: stray open confirm\n");
stp->ls_openowner->ls_flags = 0;
stp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
stp->ls_stateid.seqid == 0)
stp->ls_stateid.seqid = 1;
if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
clp->lc_flags |= LCL_STAMPEDSTABLE;
len = clp->lc_idlen;
NFSBCOPY(clp->lc_id, clidp, len);
gotstate = 1;
}
NFSUNLOCKSTATE();
} else if (new_stp->ls_flags & NFSLCK_CLOSE) {
- ownerstp = stp->ls_openowner;
lfp = stp->ls_lfp;
if (nfsrv_dolocallocks != 0 && !LIST_EMPTY(&stp->ls_open)) {
/* Get the lf lock */
nfsrv_locklf(lfp);
NFSUNLOCKSTATE();
ASSERT_VOP_ELOCKED(vp, "nfsrv_openupdate");
NFSVOPUNLOCK(vp, 0);
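/*
* nfsrv_freeopen() returns non-zero when it also frees the
* nfslockfile, in which case the lf lock no longer needs to be
* released.
*/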
if (nfsrv_freeopen(stp, vp, 1, p) == 0) {
NFSLOCKSTATE();
nfsrv_unlocklf(lfp);
NFSUNLOCKSTATE();
}
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
} else {
(void) nfsrv_freeopen(stp, NULL, 0, p);
NFSUNLOCKSTATE();
}
} else {
/*
* Update the share bits, making sure that the new set is a
* subset of the old ones.
*/
bits = (new_stp->ls_flags & NFSLCK_SHAREBITS);
if (~(stp->ls_flags) & bits) {
NFSUNLOCKSTATE();
error = NFSERR_INVAL;
goto out;
}
stp->ls_flags = (bits | NFSLCK_OPEN);
stp->ls_stateid.seqid++;
if ((nd->nd_flag & ND_NFSV41) != 0 &&
stp->ls_stateid.seqid == 0)
stp->ls_stateid.seqid = 1;
NFSUNLOCKSTATE();
}
/*
* If the client just confirmed its first open, write a timestamp
* to the stable storage file.
*/
if (gotstate != 0) {
nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
nfsrv_backupstable();
}
out:
free(clidp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Delegation update. Does the purge and return.
*/
APPLESTATIC int
nfsrv_delegupdate(struct nfsrv_descript *nd, nfsquad_t clientid,
nfsv4stateid_t *stateidp, vnode_t vp, int op, struct ucred *cred,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfsclient *clp;
int error = 0;
fhandle_t fh;
/*
* Do a sanity check against the file handle for DelegReturn.
*/
if (vp) {
error = nfsvno_getfh(vp, &fh, p);
if (error)
goto out;
}
/*
* Check for restart conditions (client and server).
*/
if (op == NFSV4OP_DELEGRETURN)
error = nfsrv_checkrestart(clientid, NFSLCK_DELEGRETURN,
stateidp, 0);
else
error = nfsrv_checkrestart(clientid, NFSLCK_DELEGPURGE,
stateidp, 0);
NFSLOCKSTATE();
/*
* Get the open structure via clientid and stateid.
*/
if (!error)
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error) {
if (error == NFSERR_CBPATHDOWN)
error = 0;
if (error == NFSERR_STALECLIENTID && op == NFSV4OP_DELEGRETURN)
error = NFSERR_STALESTATEID;
}
if (!error && op == NFSV4OP_DELEGRETURN) {
error = nfsrv_getstate(clp, stateidp, NFSLCK_DELEGRETURN, &stp);
if (!error && stp->ls_stateid.seqid != stateidp->seqid &&
((nd->nd_flag & ND_NFSV41) == 0 || stateidp->seqid != 0))
error = NFSERR_OLDSTATEID;
}
/*
* NFSERR_EXPIRED means that the state has gone away,
* so Delegations have been purged. Just return ok.
*/
if (error == NFSERR_EXPIRED && op == NFSV4OP_DELEGPURGE) {
NFSUNLOCKSTATE();
error = 0;
goto out;
}
if (error) {
NFSUNLOCKSTATE();
goto out;
}
if (op == NFSV4OP_DELEGRETURN) {
if (NFSBCMP((caddr_t)&fh, (caddr_t)&stp->ls_lfp->lf_fh,
sizeof (fhandle_t))) {
NFSUNLOCKSTATE();
error = NFSERR_BADSTATEID;
goto out;
}
nfsrv_freedeleg(stp);
} else {
nfsrv_freedeleglist(&clp->lc_olddeleg);
}
NFSUNLOCKSTATE();
error = 0;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Release lock owner.
*/
APPLESTATIC int
nfsrv_releaselckown(struct nfsstate *new_stp, nfsquad_t clientid,
NFSPROC_T *p)
{
struct nfsstate *stp, *nstp, *openstp, *ownstp;
struct nfsclient *clp;
int error = 0;
/*
* Check for restart conditions (client and server).
*/
error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
&new_stp->ls_stateid, 0);
if (error)
goto out;
NFSLOCKSTATE();
/*
* Get the lock owner by name.
*/
error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
(nfsquad_t)((u_quad_t)0), 0, NULL, p);
if (error) {
NFSUNLOCKSTATE();
goto out;
}
LIST_FOREACH(ownstp, &clp->lc_open, ls_list) {
LIST_FOREACH(openstp, &ownstp->ls_open, ls_list) {
stp = LIST_FIRST(&openstp->ls_open);
while (stp != LIST_END(&openstp->ls_open)) {
nstp = LIST_NEXT(stp, ls_list);
/*
* If the owner matches, check for locks and
* then free or return an error.
*/
if (stp->ls_ownerlen == new_stp->ls_ownerlen &&
!NFSBCMP(stp->ls_owner, new_stp->ls_owner,
stp->ls_ownerlen)){
if (LIST_EMPTY(&stp->ls_lock)) {
nfsrv_freelockowner(stp, NULL, 0, p);
} else {
NFSUNLOCKSTATE();
error = NFSERR_LOCKSHELD;
goto out;
}
}
stp = nstp;
}
}
}
NFSUNLOCKSTATE();
out:
NFSEXITCODE(error);
return (error);
}
/*
* Get the file handle for a lock structure.
*/
static int
nfsrv_getlockfh(vnode_t vp, u_short flags, struct nfslockfile *new_lfp,
fhandle_t *nfhp, NFSPROC_T *p)
{
fhandle_t *fhp = NULL;
int error;
/*
* For an Open, store the file handle in the new nfslockfile structure;
* otherwise use the fhandle_t supplied by the caller.
*/
if (flags & NFSLCK_OPEN) {
KASSERT(new_lfp != NULL, ("nfsrv_getlockfh: new_lfp NULL"));
fhp = &new_lfp->lf_fh;
} else if (nfhp) {
fhp = nfhp;
} else {
panic("nfsrv_getlockfh");
}
error = nfsvno_getfh(vp, fhp, p);
NFSEXITCODE(error);
return (error);
}
/*
* Get an nfslockfile structure. Allocate one, as required, and return a
* pointer to it via *lfpp.
* Returns -1 to indicate that no matching lock file currently exists.
*/
static int
nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit)
{
struct nfslockfile *lfp;
fhandle_t *fhp = NULL, *tfhp;
struct nfslockhashhead *hp;
struct nfslockfile *new_lfp = NULL;
/*
* For an Open, use the file handle in the new nfslockfile structure;
* otherwise use the fhandle_t supplied by the caller.
*/
if (flags & NFSLCK_OPEN) {
new_lfp = *new_lfpp;
fhp = &new_lfp->lf_fh;
} else if (nfhp) {
fhp = nfhp;
} else {
panic("nfsrv_getlockfile");
}
hp = NFSLOCKHASH(fhp);
LIST_FOREACH(lfp, hp, lf_hash) {
tfhp = &lfp->lf_fh;
if (NFSVNO_CMPFH(fhp, tfhp)) {
if (lockit)
nfsrv_locklf(lfp);
*lfpp = lfp;
return (0);
}
}
if (!(flags & NFSLCK_OPEN))
return (-1);
/*
* No match, so chain the new one into the list.
*/
LIST_INIT(&new_lfp->lf_open);
LIST_INIT(&new_lfp->lf_lock);
LIST_INIT(&new_lfp->lf_deleg);
LIST_INIT(&new_lfp->lf_locallock);
LIST_INIT(&new_lfp->lf_rollback);
new_lfp->lf_locallock_lck.nfslock_usecnt = 0;
new_lfp->lf_locallock_lck.nfslock_lock = 0;
new_lfp->lf_usecount = 0;
LIST_INSERT_HEAD(hp, new_lfp, lf_hash);
*lfpp = new_lfp;
*new_lfpp = NULL;
return (0);
}
/*
* This function adds an nfslock lock structure to the list for the associated
* nfsstate and nfslockfile structures. It will be inserted after the
* entry pointed at by insert_lop.
*/
static void
nfsrv_insertlock(struct nfslock *new_lop, struct nfslock *insert_lop,
struct nfsstate *stp, struct nfslockfile *lfp)
{
struct nfslock *lop, *nlop;
new_lop->lo_stp = stp;
new_lop->lo_lfp = lfp;
if (stp != NULL) {
/* Insert in increasing lo_first order */
lop = LIST_FIRST(&lfp->lf_lock);
if (lop == LIST_END(&lfp->lf_lock) ||
new_lop->lo_first <= lop->lo_first) {
LIST_INSERT_HEAD(&lfp->lf_lock, new_lop, lo_lckfile);
} else {
nlop = LIST_NEXT(lop, lo_lckfile);
while (nlop != LIST_END(&lfp->lf_lock) &&
nlop->lo_first < new_lop->lo_first) {
lop = nlop;
nlop = LIST_NEXT(lop, lo_lckfile);
}
LIST_INSERT_AFTER(lop, new_lop, lo_lckfile);
}
} else {
new_lop->lo_lckfile.le_prev = NULL; /* list not used */
}
/*
* Insert after insert_lop, which is overloaded as stp or lfp for
* an empty list.
*/
if (stp == NULL && (struct nfslockfile *)insert_lop == lfp)
LIST_INSERT_HEAD(&lfp->lf_locallock, new_lop, lo_lckowner);
else if ((struct nfsstate *)insert_lop == stp)
LIST_INSERT_HEAD(&stp->ls_lock, new_lop, lo_lckowner);
else
LIST_INSERT_AFTER(insert_lop, new_lop, lo_lckowner);
if (stp != NULL) {
nfsstatsv1.srvlocks++;
nfsrv_openpluslock++;
}
}
/*
* This function updates the locking for a lock owner and given file. It
* maintains a list of lock ranges ordered on increasing file offset that
* are NFSLCK_READ or NFSLCK_WRITE and non-overlapping (aka POSIX style).
* It always adds new_lop to the list and sometimes uses the one pointed
* at by other_lopp.
*/
static void
nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
struct nfslock **other_lopp, struct nfslockfile *lfp)
{
struct nfslock *new_lop = *new_lopp;
struct nfslock *lop, *tlop, *ilop;
struct nfslock *other_lop = *other_lopp;
int unlock = 0, myfile = 0;
u_int64_t tmp;
/*
* Work down the list until the lock is merged.
*/
if (new_lop->lo_flags & NFSLCK_UNLOCK)
unlock = 1;
if (stp != NULL) {
ilop = (struct nfslock *)stp;
lop = LIST_FIRST(&stp->ls_lock);
} else {
ilop = (struct nfslock *)lfp;
lop = LIST_FIRST(&lfp->lf_locallock);
}
while (lop != NULL) {
/*
* Only check locks for this file that aren't before the start of
* the new lock's range.
*/
if (lop->lo_lfp == lfp) {
myfile = 1;
if (lop->lo_end >= new_lop->lo_first) {
if (new_lop->lo_end < lop->lo_first) {
/*
* If the new lock ends before the start of the
* current lock's range, no merge, just insert
* the new lock.
*/
break;
}
if (new_lop->lo_flags == lop->lo_flags ||
(new_lop->lo_first <= lop->lo_first &&
new_lop->lo_end >= lop->lo_end)) {
/*
* This lock can be absorbed by the new lock/unlock.
* This happens when it covers the entire range
* of the old lock or is contiguous
* with the old lock and is of the same type or an
* unlock.
*/
if (lop->lo_first < new_lop->lo_first)
new_lop->lo_first = lop->lo_first;
if (lop->lo_end > new_lop->lo_end)
new_lop->lo_end = lop->lo_end;
tlop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
nfsrv_freenfslock(tlop);
continue;
}
/*
* All these cases are for contiguous locks that are not the
* same type, so they can't be merged.
*/
if (new_lop->lo_first <= lop->lo_first) {
/*
* This case is where the new lock overlaps with the
* first part of the old lock. Move the start of the
* old lock to just past the end of the new lock. The
* new lock will be inserted in front of the old, since
* ilop hasn't been updated. (We are done now.)
*/
lop->lo_first = new_lop->lo_end;
break;
}
if (new_lop->lo_end >= lop->lo_end) {
/*
* This case is where the new lock overlaps with the
* end of the old lock's range. Move the old lock's
* end to just before the new lock's first and insert
* the new lock after the old lock.
* Might not be done yet, since the new lock could
* overlap further locks with higher ranges.
*/
lop->lo_end = new_lop->lo_first;
ilop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
continue;
}
/*
* The final case is where the new lock's range is in the
* middle of the current lock's and splits the current lock
* up. Use *other_lopp to handle the second part of the
* split old lock range. (We are done now.)
* For unlock, we use new_lop as other_lop and tmp, since
* other_lop and new_lop are the same for this case.
* We noted the unlock case above, so we don't need
* new_lop->lo_flags any longer.
*/
tmp = new_lop->lo_first;
if (other_lop == NULL) {
if (!unlock)
panic("nfsd srv update unlock");
other_lop = new_lop;
*new_lopp = NULL;
}
other_lop->lo_first = new_lop->lo_end;
other_lop->lo_end = lop->lo_end;
other_lop->lo_flags = lop->lo_flags;
other_lop->lo_stp = stp;
other_lop->lo_lfp = lfp;
lop->lo_end = tmp;
nfsrv_insertlock(other_lop, lop, stp, lfp);
*other_lopp = NULL;
ilop = lop;
break;
}
}
ilop = lop;
lop = LIST_NEXT(lop, lo_lckowner);
if (myfile && (lop == NULL || lop->lo_lfp != lfp))
break;
}
/*
* Insert the new lock in the list at the appropriate place.
*/
if (!unlock) {
nfsrv_insertlock(new_lop, ilop, stp, lfp);
*new_lopp = NULL;
}
}
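/*
 * Illustrative sketch only, excluded from compilation: a minimal
 * user-space model of the invariant nfsrv_updatelock() maintains, i.e.
 * a set of half-open byte ranges [first, end) per lock owner that never
 * overlap, with new locks absorbing, trimming or splitting old ones.
 * The names here (struct range, apply_lock()) are hypothetical and this
 * toy appends the new range at the end instead of keeping the list in
 * lo_first order the way the code above does.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct range { uint64_t first, end; int write; };

/* Apply a new lock over [nf, ne); absorb, trim or split existing ranges. */
static int
apply_lock(struct range *r, int n, uint64_t nf, uint64_t ne, int write)
{
	struct range out[16];	/* toy fixed-size list */
	int i, m = 0;

	for (i = 0; i < n; i++) {
		if (r[i].end <= nf || r[i].first >= ne) {
			out[m++] = r[i];	/* no overlap, keep as is */
			continue;
		}
		if (r[i].first < nf)	/* keep the head fragment */
			out[m++] = (struct range){ r[i].first, nf, r[i].write };
		if (r[i].end > ne)	/* keep the tail fragment */
			out[m++] = (struct range){ ne, r[i].end, r[i].write };
	}
	out[m++] = (struct range){ nf, ne, write };	/* add the new lock */
	for (i = 0; i < m; i++)
		r[i] = out[i];
	return (m);
}

int
main(void)
{
	struct range r[16] = { { 0, 100, 0 }, { 200, 300, 1 } };
	int i, n = 2;

	n = apply_lock(r, n, 50, 250, 1);	/* overlaps both entries */
	for (i = 0; i < n; i++)
		printf("[%ju,%ju) %s\n", (uintmax_t)r[i].first,
		    (uintmax_t)r[i].end, r[i].write ? "write" : "read");
	return (0);
}
#endif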
/*
* This function handles sequencing of locks, etc.
* It returns an error that indicates what the caller should do.
*/
static int
nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
struct nfsstate *stp, struct nfsrvcache *op)
{
int error = 0;
if ((nd->nd_flag & ND_NFSV41) != 0)
/* NFSv4.1 ignores the open_seqid and lock_seqid. */
goto out;
if (op != nd->nd_rp)
panic("nfsrvstate checkseqid");
if (!(op->rc_flag & RC_INPROG))
panic("nfsrvstate not inprog");
if (stp->ls_op && stp->ls_op->rc_refcnt <= 0) {
printf("refcnt=%d\n", stp->ls_op->rc_refcnt);
panic("nfsrvstate op refcnt");
}
if ((stp->ls_seq + 1) == seqid) {
if (stp->ls_op)
nfsrvd_derefcache(stp->ls_op);
stp->ls_op = op;
nfsrvd_refcache(op);
stp->ls_seq = seqid;
goto out;
} else if (stp->ls_seq == seqid && stp->ls_op &&
op->rc_xid == stp->ls_op->rc_xid &&
op->rc_refcnt == 0 &&
op->rc_reqlen == stp->ls_op->rc_reqlen &&
op->rc_cksum == stp->ls_op->rc_cksum) {
if (stp->ls_op->rc_flag & RC_INPROG) {
error = NFSERR_DONTREPLY;
goto out;
}
nd->nd_rp = stp->ls_op;
nd->nd_rp->rc_flag |= RC_INPROG;
nfsrvd_delcache(op);
error = NFSERR_REPLYFROMCACHE;
goto out;
}
error = NFSERR_BADSEQID;
out:
NFSEXITCODE2(error, nd);
return (error);
}
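/*
 * Illustrative sketch only, excluded from compilation: the NFSv4.0
 * open/lock owner seqid rule that nfsrv_checkseqid() enforces, reduced
 * to its decision table.  check_seqid() and its arguments are
 * hypothetical names; the cache matching details (xid, checksum,
 * request length) are collapsed into the same_cached_request flag.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define	ACCEPT		0	/* new request, advance the stored seqid */
#define	REPLAY		1	/* retransmission, reply from the cache */
#define	BADSEQID	2	/* out of order, reject */

static int
check_seqid(uint32_t stored_seq, uint32_t req_seq, int same_cached_request)
{
	if (req_seq == stored_seq + 1)
		return (ACCEPT);
	if (req_seq == stored_seq && same_cached_request)
		return (REPLAY);
	return (BADSEQID);
}

int
main(void)
{
	printf("%d %d %d\n",
	    check_seqid(7, 8, 0),	/* 0: in order, accept */
	    check_seqid(7, 7, 1),	/* 1: retransmission, replay */
	    check_seqid(7, 9, 0));	/* 2: gap, NFSERR_BADSEQID */
	return (0);
}
#endif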
/*
* Get the client ip address for callbacks. If the strings can't be parsed,
* just set lc_program to 0 to indicate no callbacks are possible.
* (For cases where the address can't be parsed or is 0.0.0.0.0.0, set
* the address to the client's transport address. This won't be used
* for callbacks, but can be printed out by nfsstats for info.)
* Return error if the xdr can't be parsed, 0 otherwise.
*/
APPLESTATIC int
nfsrv_getclientipaddr(struct nfsrv_descript *nd, struct nfsclient *clp)
{
u_int32_t *tl;
u_char *cp, *cp2;
int i, j;
struct sockaddr_in *rad, *sad;
u_char protocol[5], addr[24];
int error = 0, cantparse = 0;
union {
in_addr_t ival;
u_char cval[4];
} ip;
union {
in_port_t sval;
u_char cval[2];
} port;
rad = NFSSOCKADDR(clp->lc_req.nr_nam, struct sockaddr_in *);
rad->sin_family = AF_INET;
rad->sin_len = sizeof (struct sockaddr_in);
rad->sin_addr.s_addr = 0;
rad->sin_port = 0;
clp->lc_req.nr_client = NULL;
clp->lc_req.nr_lock = 0;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i >= 3 && i <= 4) {
error = nfsrv_mtostr(nd, protocol, i);
if (error)
goto nfsmout;
if (!strcmp(protocol, "tcp")) {
clp->lc_flags |= LCL_TCPCALLBACK;
clp->lc_req.nr_sotype = SOCK_STREAM;
clp->lc_req.nr_soproto = IPPROTO_TCP;
} else if (!strcmp(protocol, "udp")) {
clp->lc_req.nr_sotype = SOCK_DGRAM;
clp->lc_req.nr_soproto = IPPROTO_UDP;
} else {
cantparse = 1;
}
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
} else if (i == 0) {
cantparse = 1;
} else if (!cantparse && i <= 23 && i >= 11) {
error = nfsrv_mtostr(nd, addr, i);
if (error)
goto nfsmout;
/*
* Parse out the address fields. We expect 6 decimal numbers
* separated by '.'s.
*/
cp = addr;
i = 0;
while (*cp && i < 6) {
cp2 = cp;
while (*cp2 && *cp2 != '.')
cp2++;
if (*cp2)
*cp2++ = '\0';
else if (i != 5) {
cantparse = 1;
break;
}
j = nfsrv_getipnumber(cp);
if (j >= 0) {
if (i < 4)
ip.cval[3 - i] = j;
else
port.cval[5 - i] = j;
} else {
cantparse = 1;
break;
}
cp = cp2;
i++;
}
if (!cantparse) {
if (ip.ival != 0x0) {
rad->sin_addr.s_addr = htonl(ip.ival);
rad->sin_port = htons(port.sval);
} else {
cantparse = 1;
}
}
} else {
cantparse = 1;
if (i > 0) {
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
}
}
if (cantparse) {
sad = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
if (sad->sin_family == AF_INET) {
rad->sin_addr.s_addr = sad->sin_addr.s_addr;
rad->sin_port = 0x0;
}
clp->lc_program = 0;
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
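/*
 * Illustrative sketch only, excluded from compilation: parsing the six
 * dotted decimal fields of an NFSv4 callback address (h1.h2.h3.h4.p1.p2),
 * where the port is p1 * 256 + p2, in user space with standard libc.
 * parse_uaddr() is a hypothetical name; the code above parses the same
 * format by hand because sscanf() is not available in the kernel.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static int
parse_uaddr(const char *s, uint32_t *ipp, uint16_t *portp)
{
	unsigned int h1, h2, h3, h4, p1, p2;

	if (sscanf(s, "%u.%u.%u.%u.%u.%u", &h1, &h2, &h3, &h4, &p1, &p2) != 6)
		return (-1);
	if (h1 > 255 || h2 > 255 || h3 > 255 || h4 > 255 || p1 > 255 ||
	    p2 > 255)
		return (-1);
	*ipp = (h1 << 24) | (h2 << 16) | (h3 << 8) | h4;	/* host order */
	*portp = (uint16_t)(p1 * 256 + p2);
	return (0);
}

int
main(void)
{
	uint32_t ip;
	uint16_t port;

	if (parse_uaddr("192.0.2.7.8.1", &ip, &port) == 0)
		printf("ip 0x%08x port %u\n", ip, port);	/* port 2049 */
	return (0);
}
#endif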
/*
* Turn a string of up to three decimal digits into a number. Return -1 upon
* error.
*/
static int
nfsrv_getipnumber(u_char *cp)
{
int i = 0, j = 0;
while (*cp) {
if (j > 2 || *cp < '0' || *cp > '9')
return (-1);
i *= 10;
i += (*cp - '0');
cp++;
j++;
}
if (i < 256)
return (i);
return (-1);
}
/*
* This function checks for restart conditions.
*/
static int
nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
nfsv4stateid_t *stateidp, int specialid)
{
int ret = 0;
/*
* First check for a server restart. Open, LockT, ReleaseLockOwner
* and DelegPurge have a clientid, the rest a stateid.
*/
if (flags &
(NFSLCK_OPEN | NFSLCK_TEST | NFSLCK_RELEASE | NFSLCK_DELEGPURGE)) {
if (clientid.lval[0] != nfsrvboottime) {
ret = NFSERR_STALECLIENTID;
goto out;
}
} else if (stateidp->other[0] != nfsrvboottime &&
specialid == 0) {
ret = NFSERR_STALESTATEID;
goto out;
}
/*
* Read, Write, Setattr and LockT can return NFSERR_GRACE and do
* not use a lock/open owner seqid#, so the check can be done now.
* (The others will be checked, as required, later.)
*/
if (!(flags & (NFSLCK_CHECK | NFSLCK_TEST)))
goto out;
NFSLOCKSTATE();
ret = nfsrv_checkgrace(NULL, NULL, flags);
NFSUNLOCKSTATE();
out:
NFSEXITCODE(ret);
return (ret);
}
/*
* Check for grace.
*/
static int
nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
u_int32_t flags)
{
int error = 0;
if ((nfsrv_stablefirst.nsf_flags & NFSNSF_GRACEOVER) != 0) {
if (flags & NFSLCK_RECLAIM) {
error = NFSERR_NOGRACE;
goto out;
}
} else {
if (!(flags & NFSLCK_RECLAIM)) {
error = NFSERR_GRACE;
goto out;
}
if (nd != NULL && clp != NULL &&
(nd->nd_flag & ND_NFSV41) != 0 &&
(clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0) {
error = NFSERR_NOGRACE;
goto out;
}
/*
* If grace is almost over and we are still getting Reclaims,
* extend grace a bit.
*/
if ((NFSD_MONOSEC + NFSRV_LEASEDELTA) >
nfsrv_stablefirst.nsf_eograce)
nfsrv_stablefirst.nsf_eograce = NFSD_MONOSEC +
NFSRV_LEASEDELTA;
}
out:
NFSEXITCODE(error);
return (error);
}
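/*
 * Illustrative sketch only, excluded from compilation: the basic grace
 * period decision that nfsrv_checkgrace() applies, ignoring the NFSv4.1
 * ReclaimComplete case and the grace extension.  check_grace() is a
 * hypothetical name.
 */
#if 0
#include <stdio.h>

#define	OK		0
#define	ERR_NOGRACE	1	/* reclaim attempted after grace ended */
#define	ERR_GRACE	2	/* non-reclaim attempted during grace */

static int
check_grace(int grace_over, int is_reclaim)
{
	if (grace_over)
		return (is_reclaim ? ERR_NOGRACE : OK);
	return (is_reclaim ? OK : ERR_GRACE);
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    check_grace(1, 1),	/* 1: NFSERR_NOGRACE */
	    check_grace(1, 0),	/* 0: allowed */
	    check_grace(0, 1),	/* 0: reclaim allowed during grace */
	    check_grace(0, 0));	/* 2: NFSERR_GRACE */
	return (0);
}
#endif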
/*
* Do a server callback.
*/
static int
nfsrv_docallback(struct nfsclient *clp, int procnum,
nfsv4stateid_t *stateidp, int trunc, fhandle_t *fhp,
struct nfsvattr *nap, nfsattrbit_t *attrbitp, NFSPROC_T *p)
{
mbuf_t m;
u_int32_t *tl;
struct nfsrv_descript nfsd, *nd = &nfsd;
struct ucred *cred;
int error = 0;
u_int32_t callback;
struct nfsdsession *sep = NULL;
cred = newnfs_getcred();
NFSLOCKSTATE(); /* mostly for lc_cbref++ */
if (clp->lc_flags & LCL_NEEDSCONFIRM) {
NFSUNLOCKSTATE();
panic("docallb");
}
clp->lc_cbref++;
/*
* Fill the callback program# and version into the request
* structure for newnfs_connect() to use.
*/
clp->lc_req.nr_prog = clp->lc_program;
#ifdef notnow
if ((clp->lc_flags & LCL_NFSV41) != 0)
clp->lc_req.nr_vers = NFSV41_CBVERS;
else
#endif
clp->lc_req.nr_vers = NFSV4_CBVERS;
/*
* First, fill in some of the fields of nd and cr.
*/
nd->nd_flag = ND_NFSV4;
if (clp->lc_flags & LCL_GSS)
nd->nd_flag |= ND_KERBV;
if ((clp->lc_flags & LCL_NFSV41) != 0)
nd->nd_flag |= ND_NFSV41;
nd->nd_repstat = 0;
cred->cr_uid = clp->lc_uid;
cred->cr_gid = clp->lc_gid;
callback = clp->lc_callback;
NFSUNLOCKSTATE();
cred->cr_ngroups = 1;
/*
* Get the first mbuf for the request.
*/
MGET(m, M_WAITOK, MT_DATA);
mbuf_setlen(m, 0);
nd->nd_mreq = nd->nd_mb = m;
nd->nd_bpos = NFSMTOD(m, caddr_t);
/*
* and build the callback request.
*/
if (procnum == NFSV4OP_CBGETATTR) {
nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBGETATTR,
"CB Getattr", &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
(void)nfsm_fhtom(nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
(void)nfsrv_putattrbit(nd, attrbitp);
} else if (procnum == NFSV4OP_CBRECALL) {
nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBRECALL,
"CB Recall", &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
*tl++ = txdr_unsigned(stateidp->seqid);
NFSBCOPY((caddr_t)stateidp->other, (caddr_t)tl,
NFSX_STATEIDOTHER);
tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
if (trunc)
*tl = newnfs_true;
else
*tl = newnfs_false;
(void)nfsm_fhtom(nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
} else if (procnum == NFSV4PROC_CBNULL) {
nd->nd_procnum = NFSV4PROC_CBNULL;
if ((clp->lc_flags & LCL_NFSV41) != 0) {
error = nfsv4_getcbsession(clp, &sep);
if (error != 0) {
mbuf_freem(nd->nd_mreq);
goto errout;
}
}
} else {
error = NFSERR_SERVERFAULT;
mbuf_freem(nd->nd_mreq);
goto errout;
}
/*
* Call newnfs_connect(), as required, and then newnfs_request().
*/
(void) newnfs_sndlock(&clp->lc_req.nr_lock);
if (clp->lc_req.nr_client == NULL) {
if ((clp->lc_flags & LCL_NFSV41) != 0)
error = ECONNREFUSED;
else if (nd->nd_procnum == NFSV4PROC_CBNULL)
error = newnfs_connect(NULL, &clp->lc_req, cred,
NULL, 1);
else
error = newnfs_connect(NULL, &clp->lc_req, cred,
NULL, 3);
}
newnfs_sndunlock(&clp->lc_req.nr_lock);
if (!error) {
if ((nd->nd_flag & ND_NFSV41) != 0) {
KASSERT(sep != NULL, ("sep NULL"));
if (sep->sess_cbsess.nfsess_xprt != NULL)
error = newnfs_request(nd, NULL, clp,
&clp->lc_req, NULL, NULL, cred,
clp->lc_program, clp->lc_req.nr_vers, NULL,
1, NULL, &sep->sess_cbsess);
else {
/*
* This should probably never occur, but if a
* client somehow does an RPC without a
* SequenceID Op that causes a callback just
* after the nfsd threads have been terminated
* and restarted, we could conceivably get here
* without a backchannel xprt.
*/
printf("nfsrv_docallback: no xprt\n");
error = ECONNREFUSED;
}
nfsrv_freesession(sep, NULL);
} else
error = newnfs_request(nd, NULL, clp, &clp->lc_req,
NULL, NULL, cred, clp->lc_program,
clp->lc_req.nr_vers, NULL, 1, NULL, NULL);
}
errout:
NFSFREECRED(cred);
/*
* If error is set here, the Callback path isn't working
* properly, so twiddle the appropriate LCL_ flags.
* (nd_repstat != 0 indicates the Callback path is working,
* but the callback failed on the client.)
*/
if (error) {
/*
* Mark the callback pathway down, which disables issuing
* of delegations and gets Renew to return NFSERR_CBPATHDOWN.
*/
NFSLOCKSTATE();
clp->lc_flags |= LCL_CBDOWN;
NFSUNLOCKSTATE();
} else {
/*
* Callback worked. If the callback path was down, disable
* callbacks, so no more delegations will be issued. (This
* is done on the assumption that the callback pathway is
* flakey.)
*/
NFSLOCKSTATE();
if (clp->lc_flags & LCL_CBDOWN)
clp->lc_flags &= ~(LCL_CBDOWN | LCL_CALLBACKSON);
NFSUNLOCKSTATE();
if (nd->nd_repstat)
error = nd->nd_repstat;
else if (error == 0 && procnum == NFSV4OP_CBGETATTR)
error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL,
p, NULL);
mbuf_freem(nd->nd_mrep);
}
NFSLOCKSTATE();
clp->lc_cbref--;
if ((clp->lc_flags & LCL_WAKEUPWANTED) && clp->lc_cbref == 0) {
clp->lc_flags &= ~LCL_WAKEUPWANTED;
wakeup(clp);
}
NFSUNLOCKSTATE();
NFSEXITCODE(error);
return (error);
}
/*
* Set up the compound RPC for the callback.
*/
static int
nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
uint32_t callback, int op, const char *optag, struct nfsdsession **sepp)
{
uint32_t *tl;
int error, len;
len = strlen(optag);
(void)nfsm_strtom(nd, optag, len);
NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
if ((nd->nd_flag & ND_NFSV41) != 0) {
*tl++ = txdr_unsigned(NFSV41_MINORVERSION);
*tl++ = txdr_unsigned(callback);
*tl++ = txdr_unsigned(2);
*tl = txdr_unsigned(NFSV4OP_CBSEQUENCE);
error = nfsv4_setcbsequence(nd, clp, 1, sepp);
if (error != 0)
return (error);
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(op);
} else {
*tl++ = txdr_unsigned(NFSV4_MINORVERSION);
*tl++ = txdr_unsigned(callback);
*tl++ = txdr_unsigned(1);
*tl = txdr_unsigned(op);
}
return (0);
}
/*
* Return the next index# for a clientid. Mostly just increment and return
* the next one, but... if the 32bit unsigned does actually wrap around,
* the server should be rebooted.
* At an average rate of one new client per second, it will wrap around in
* approximately 136 years. (I think the server will have been shut
* down or rebooted before then.)
*/
static u_int32_t
nfsrv_nextclientindex(void)
{
static u_int32_t client_index = 0;
client_index++;
if (client_index != 0)
return (client_index);
printf("%s: out of clientids\n", __func__);
return (client_index);
}
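/*
 * Illustrative sketch only, excluded from compilation: the wrap around
 * arithmetic behind the comment above.  At one new clientid per second a
 * 32 bit counter lasts 2^32 seconds, roughly 136 years.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	double seconds = 4294967296.0;			/* 2^32 */
	double year = 365.25 * 24.0 * 60.0 * 60.0;	/* seconds per year */

	printf("%.1f years\n", seconds / year);		/* ~136.1 years */
	return (0);
}
#endif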
/*
* Return the next index# for a stateid. Mostly just increment and return
* the next one, but... if the 32bit unsigned does actually wrap around
* (will a BSD server stay up that long?), find
* new start and end values.
*/
static u_int32_t
nfsrv_nextstateindex(struct nfsclient *clp)
{
struct nfsstate *stp;
int i;
u_int32_t canuse, min_index, max_index;
if (!(clp->lc_flags & LCL_INDEXNOTOK)) {
clp->lc_stateindex++;
if (clp->lc_stateindex != clp->lc_statemaxindex)
return (clp->lc_stateindex);
}
/*
* Yuck, we've hit the end.
* Look for a new min and max.
*/
min_index = 0;
max_index = 0xffffffff;
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if (stp->ls_stateid.other[2] > 0x80000000) {
if (stp->ls_stateid.other[2] < max_index)
max_index = stp->ls_stateid.other[2];
} else {
if (stp->ls_stateid.other[2] > min_index)
min_index = stp->ls_stateid.other[2];
}
}
}
/*
* Yikes, highly unlikely, but I'll handle it anyhow.
*/
if (min_index == 0x80000000 && max_index == 0x80000001) {
canuse = 0;
/*
* Loop around until we find an unused entry. Return that
* and set LCL_INDEXNOTOK, so the search will continue next time.
* (This is one of those rare cases where a goto is the
* cleanest way to code the loop.)
*/
tryagain:
for (i = 0; i < nfsrv_statehashsize; i++) {
LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
if (stp->ls_stateid.other[2] == canuse) {
canuse++;
goto tryagain;
}
}
}
clp->lc_flags |= LCL_INDEXNOTOK;
return (canuse);
}
/*
* Ok to start again from min + 1.
*/
clp->lc_stateindex = min_index + 1;
clp->lc_statemaxindex = max_index;
clp->lc_flags &= ~LCL_INDEXNOTOK;
return (clp->lc_stateindex);
}
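/*
 * Illustrative sketch only, excluded from compilation: the "find a new
 * usable window" step of nfsrv_nextstateindex() above, which on wrap
 * around looks for the largest index in use at or below 0x80000000 and
 * the smallest one above it, then hands out values strictly between the
 * two.  find_window() is a hypothetical name.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static void
find_window(const uint32_t *used, int n, uint32_t *minp, uint32_t *maxp)
{
	uint32_t min_index = 0, max_index = 0xffffffff;
	int i;

	for (i = 0; i < n; i++) {
		if (used[i] > 0x80000000) {
			if (used[i] < max_index)
				max_index = used[i];
		} else if (used[i] > min_index)
			min_index = used[i];
	}
	*minp = min_index;
	*maxp = max_index;
}

int
main(void)
{
	uint32_t used[] = { 5, 17, 0x90000000, 0xfffffff0 };
	uint32_t lo, hi;

	find_window(used, 4, &lo, &hi);
	/* New indices can be issued from lo + 1 up to, but not including, hi. */
	printf("window (%lu, %lu)\n", (unsigned long)(lo + 1),
	    (unsigned long)hi);
	return (0);
}
#endif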
/*
* The following functions handle the stable storage file that deals with
* the edge conditions described in RFC3530 Sec. 8.6.3.
* The file is as follows:
* - a single record at the beginning that has the lease time of the
* previous server instance (before the last reboot) and the nfsrvboottime
* values for the previous server boots.
* These previous boot times are used to ensure that the current
* nfsrvboottime does not, somehow, get set to a previous one.
* (This is important so that Stale ClientIDs and StateIDs can
* be recognized.)
* The number of previous nfsrvboottime values precedes the list.
* - followed by some number of appended records with:
* - client id string
* - flag that indicates it is a record revoking state via lease
* expiration or similar
* OR has successfully acquired state.
* These structures vary in length, with the client string at the end, up
* to NFSV4_OPAQUELIMIT in size.
*
* At the end of the grace period, the file is truncated, the first
* record is rewritten with updated information and any acquired state
* records for successful reclaims of state are written.
*
* Subsequent records are appended when the first state is issued to
* a client and when state is revoked for a client.
*
* When reading the file in, state issued records that come later in
* the file override older ones, since the append log is in chronological order.
* If, for some reason, the file can't be read, the grace period is
* immediately terminated and all reclaims get NFSERR_NOGRACE.
*/
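/*
 * Illustrative sketch only, excluded from compilation: a hypothetical
 * layout matching the description above, kept deliberately simple.  The
 * structure and field names here (struct ss_header, struct ss_record)
 * are made up for illustration and are not the ones the nfsd uses; the
 * real records are read with NFSD_RDWR() in nfsrv_setupstable() below,
 * using the same "header size + len - 1" arithmetic for each record.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct ss_header {		/* single record at the start of the file */
	uint32_t	lease;		/* lease time of the previous instance */
	uint32_t	numboots;	/* count of boot times that follow */
	/* followed by numboots boot time values */
};

struct ss_record {		/* appended once per client event */
	uint16_t	len;		/* length of client[] */
	uint8_t		flag;		/* state acquired vs. state revoked */
	uint8_t		client[1];	/* client id string, variable length */
};

int
main(void)
{
	/* On-disk size of a record carrying an 8 byte client id string. */
	printf("%zu bytes\n", sizeof(struct ss_record) + 8 - 1);
	return (0);
}
#endif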
/*
* Read in the stable storage file. Called by nfssvc() before the nfsd
* processes start servicing requests.
*/
APPLESTATIC void
nfsrv_setupstable(NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfsrv_stable *sp, *nsp;
struct nfst_rec *tsp;
int error, i, tryagain;
off_t off = 0;
ssize_t aresid, len;
/*
* If NFSNSF_UPDATEDONE is set, this is a restart of the nfsds without
* a reboot, so state has not been lost.
*/
if (sf->nsf_flags & NFSNSF_UPDATEDONE)
return;
/*
* Mark the grace period as over, until the file reads successfully.
*/
nfsrvboottime = time_second;
LIST_INIT(&sf->nsf_head);
sf->nsf_flags = (NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
sf->nsf_eograce = NFSD_MONOSEC + NFSRV_LEASEDELTA;
if (sf->nsf_fp == NULL)
return;
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), off, UIO_SYSSPACE,
0, NFSFPCRED(sf->nsf_fp), &aresid, p);
if (error || aresid || sf->nsf_numboots == 0 ||
sf->nsf_numboots > NFSNSF_MAXNUMBOOTS)
return;
/*
* Now, read in the boottimes.
*/
sf->nsf_bootvals = (time_t *)malloc((sf->nsf_numboots + 1) *
sizeof (time_t), M_TEMP, M_WAITOK);
off = sizeof (struct nfsf_rec);
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)sf->nsf_bootvals, sf->nsf_numboots * sizeof (time_t), off,
UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
if (error || aresid) {
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
return;
}
/*
* Make sure this nfsrvboottime is different from all recorded
* previous ones.
*/
do {
tryagain = 0;
for (i = 0; i < sf->nsf_numboots; i++) {
if (nfsrvboottime == sf->nsf_bootvals[i]) {
nfsrvboottime++;
tryagain = 1;
break;
}
}
} while (tryagain);
sf->nsf_flags |= NFSNSF_OK;
off += (sf->nsf_numboots * sizeof (time_t));
/*
* Read through the file, building a list of records for grace
* checking.
* Each record is between sizeof (struct nfst_rec) and
* sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1
* and is actually sizeof (struct nfst_rec) + nst_len - 1.
*/
tsp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
NFSV4_OPAQUELIMIT - 1, M_TEMP, M_WAITOK);
do {
error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
(caddr_t)tsp, sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1,
off, UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
len = (sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1) - aresid;
if (error || (len > 0 && (len < sizeof (struct nfst_rec) ||
len < (sizeof (struct nfst_rec) + tsp->len - 1)))) {
/*
* Yuck, the file has been corrupted, so just return
* after clearing out any restart state, which ends the
* grace period.
*/
LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
LIST_REMOVE(sp, nst_list);
free((caddr_t)sp, M_TEMP);
}
free((caddr_t)tsp, M_TEMP);
sf->nsf_flags &= ~NFSNSF_OK;
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
return;
}
if (len > 0) {
off += sizeof (struct nfst_rec) + tsp->len - 1;
/*
* Search the list for a matching client.
*/
LIST_FOREACH(sp, &sf->nsf_head, nst_list) {
if (tsp->len == sp->nst_len &&
!NFSBCMP(tsp->client, sp->nst_client, tsp->len))
break;
}
if (sp == LIST_END(&sf->nsf_head)) {
sp = (struct nfsrv_stable *)malloc(tsp->len +
sizeof (struct nfsrv_stable) - 1, M_TEMP,
M_WAITOK);
NFSBCOPY((caddr_t)tsp, (caddr_t)&sp->nst_rec,
sizeof (struct nfst_rec) + tsp->len - 1);
LIST_INSERT_HEAD(&sf->nsf_head, sp, nst_list);
} else {
if (tsp->flag == NFSNST_REVOKE)
sp->nst_flag |= NFSNST_REVOKE;
else
/*
* A subsequent timestamp indicates the client
* did a setclientid/confirm and any previous
* revoke is no longer relevant.
*/
sp->nst_flag &= ~NFSNST_REVOKE;
}
}
} while (len > 0);
free((caddr_t)tsp, M_TEMP);
sf->nsf_flags = NFSNSF_OK;
sf->nsf_eograce = NFSD_MONOSEC + sf->nsf_lease +
NFSRV_LEASEDELTA;
}
/*
* Update the stable storage file, now that the grace period is over.
*/
APPLESTATIC void
nfsrv_updatestable(NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfsrv_stable *sp, *nsp;
int i;
struct nfsvattr nva;
vnode_t vp;
#if defined(__FreeBSD_version) && (__FreeBSD_version >= 500000)
mount_t mp = NULL;
#endif
int error;
if (sf->nsf_fp == NULL || (sf->nsf_flags & NFSNSF_UPDATEDONE))
return;
sf->nsf_flags |= NFSNSF_UPDATEDONE;
/*
* Ok, we need to rewrite the stable storage file.
* - truncate to 0 length
* - write the new first structure
* - loop through the data structures, writing out any that
* have timestamps older than the old boot
*/
if (sf->nsf_bootvals) {
sf->nsf_numboots++;
for (i = sf->nsf_numboots - 2; i >= 0; i--)
sf->nsf_bootvals[i + 1] = sf->nsf_bootvals[i];
} else {
sf->nsf_numboots = 1;
sf->nsf_bootvals = (time_t *)malloc(sizeof (time_t),
M_TEMP, M_WAITOK);
}
sf->nsf_bootvals[0] = nfsrvboottime;
sf->nsf_lease = nfsrv_lease;
NFSVNO_ATTRINIT(&nva);
NFSVNO_SETATTRVAL(&nva, size, 0);
vp = NFSFPVNODE(sf->nsf_fp);
vn_start_write(vp, &mp, V_WAIT);
if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
error = nfsvno_setattr(vp, &nva, NFSFPCRED(sf->nsf_fp), p,
NULL);
NFSVOPUNLOCK(vp, 0);
} else
error = EPERM;
vn_finished_write(mp);
if (!error)
error = NFSD_RDWR(UIO_WRITE, vp,
(caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), (off_t)0,
UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
if (!error)
error = NFSD_RDWR(UIO_WRITE, vp,
(caddr_t)sf->nsf_bootvals,
sf->nsf_numboots * sizeof (time_t),
(off_t)(sizeof (struct nfsf_rec)),
UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
free((caddr_t)sf->nsf_bootvals, M_TEMP);
sf->nsf_bootvals = NULL;
if (error) {
sf->nsf_flags &= ~NFSNSF_OK;
printf("EEK! Can't write NfsV4 stable storage file\n");
return;
}
sf->nsf_flags |= NFSNSF_OK;
/*
* Loop through the list and write out timestamp records for
* any clients that successfully reclaimed state.
*/
LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
if (sp->nst_flag & NFSNST_GOTSTATE) {
nfsrv_writestable(sp->nst_client, sp->nst_len,
NFSNST_NEWSTATE, p);
sp->nst_clp->lc_flags |= LCL_STAMPEDSTABLE;
}
LIST_REMOVE(sp, nst_list);
free((caddr_t)sp, M_TEMP);
}
nfsrv_backupstable();
}
/*
* Append a record to the stable storage file.
*/
APPLESTATIC void
nfsrv_writestable(u_char *client, int len, int flag, NFSPROC_T *p)
{
struct nfsrv_stablefirst *sf = &nfsrv_stablefirst;
struct nfst_rec *sp;
int error;
if (!(sf->nsf_flags & NFSNSF_OK) || sf->nsf_fp == NULL)
return;
sp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
len - 1, M_TEMP, M_WAITOK);
sp->len = len;
NFSBCOPY(client, sp->client, len);
sp->flag = flag;
error = NFSD_RDWR(UIO_WRITE, NFSFPVNODE(sf->nsf_fp),
(caddr_t)sp, sizeof (struct nfst_rec) + len - 1, (off_t)0,
UIO_SYSSPACE, (IO_SYNC | IO_APPEND), NFSFPCRED(sf->nsf_fp), NULL, p);
free((caddr_t)sp, M_TEMP);
if (error) {
sf->nsf_flags &= ~NFSNSF_OK;
printf("EEK! Can't write NfsV4 stable storage file\n");
}
}
/*
* This function is called during the grace period to mark a client
* that successfully reclaimed state.
*/
static void
nfsrv_markstable(struct nfsclient *clp)
{
struct nfsrv_stable *sp;
/*
* First find the client structure.
*/
LIST_FOREACH(sp, &nfsrv_stablefirst.nsf_head, nst_list) {
if (sp->nst_len == clp->lc_idlen &&
!NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
break;
}
if (sp == LIST_END(&nfsrv_stablefirst.nsf_head))
return;
/*
* Now, just mark it and set the nfsclient back pointer.
*/
sp->nst_flag |= NFSNST_GOTSTATE;
sp->nst_clp = clp;
}
/*
* This function is called for a reclaim, to see if it gets grace.
* It returns 0 if a reclaim is allowed, 1 otherwise.
*/
static int
nfsrv_checkstable(struct nfsclient *clp)
{
struct nfsrv_stable *sp;
/*
* First, find the entry for the client.
*/
LIST_FOREACH(sp, &nfsrv_stablefirst.nsf_head, nst_list) {
if (sp->nst_len == clp->lc_idlen &&
!NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
break;
}
/*
* If the client is not in the list, its state was revoked or no state was
* issued since the previous reboot, so the reclaim is denied.
*/
if (sp == LIST_END(&nfsrv_stablefirst.nsf_head) ||
(sp->nst_flag & NFSNST_REVOKE) ||
!(nfsrv_stablefirst.nsf_flags & NFSNSF_OK))
return (1);
return (0);
}
/*
* Test for and try to clear out a conflicting client. This is called by
* nfsrv_lockctrl() and nfsrv_openctrl() when conflicts with other clients
* are found.
* The trick here is that it can't revoke a conflicting client with an
* expired lease unless it holds the v4root lock, so...
* If no v4root lock, get the lock and return 1 to indicate "try again".
* Return 0 to indicate the conflict can't be revoked and 1 to indicate
* the revocation worked and the conflicting client is "bye, bye", so it
* can be tried again.
* Return 2 to indicate that the vnode is VI_DOOMED after NFSVOPLOCK().
* Unlocks State before a non-zero value is returned.
*/
static int
nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, vnode_t vp,
NFSPROC_T *p)
{
int gotlock, lktype = 0;
/*
* If lease hasn't expired, we can't fix it.
*/
if (clp->lc_expiry >= NFSD_MONOSEC ||
!(nfsrv_stablefirst.nsf_flags & NFSNSF_UPDATEDONE))
return (0);
if (*haslockp == 0) {
NFSUNLOCKSTATE();
if (vp != NULL) {
lktype = NFSVOPISLOCKED(vp);
NFSVOPUNLOCK(vp, 0);
}
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!gotlock);
NFSUNLOCKV4ROOTMUTEX();
*haslockp = 1;
if (vp != NULL) {
NFSVOPLOCK(vp, lktype | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0)
return (2);
}
return (1);
}
NFSUNLOCKSTATE();
/*
* Ok, we can expire the conflicting client.
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
nfsrv_zapclient(clp, p);
return (1);
}
/*
* Resolve a delegation conflict.
* Returns 0 to indicate the conflict was resolved without sleeping.
* Return -1 to indicate that the caller should check for conflicts again.
* Return > 0 for an error that should be returned, normally NFSERR_DELAY.
*
* Also, manipulate the nfsv4root_lock, as required. It isn't changed
* for a return of 0, since there was no sleep and it could be required
* later. It is released for a return of NFSERR_DELAY, since the caller
* will return that error. It is released when a sleep was done waiting
* for the delegation to be returned or expire (so that other nfsds can
* handle ops). Then, it must be acquired for the write to stable storage.
* (This function is somewhat similar to nfsrv_clientconflict(), but
* the semantics differ in a couple of subtle ways. The return of 0
* indicates the conflict was resolved without sleeping here, not
* that the conflict can't be resolved and the handling of nfsv4root_lock
* differs, as noted above.)
* Unlocks State before returning a non-zero value.
*/
static int
nfsrv_delegconflict(struct nfsstate *stp, int *haslockp, NFSPROC_T *p,
vnode_t vp)
{
struct nfsclient *clp = stp->ls_clp;
int gotlock, error, lktype = 0, retrycnt, zapped_clp;
nfsv4stateid_t tstateid;
fhandle_t tfh;
/*
* If the conflict is with an old delegation...
*/
if (stp->ls_flags & NFSLCK_OLDDELEG) {
/*
* You can delete it, if it has expired.
*/
if (clp->lc_delegtime < NFSD_MONOSEC) {
nfsrv_freedeleg(stp);
NFSUNLOCKSTATE();
error = -1;
goto out;
}
NFSUNLOCKSTATE();
/*
* During this delay, the old delegation could expire or it
* could be recovered by the client via an Open with
* CLAIM_DELEGATE_PREV.
* Release the nfsv4root_lock, if held.
*/
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_DELAY;
goto out;
}
/*
* It's a current delegation, so:
* - check to see if the delegation has expired
* - if so, get the v4root lock and then expire it
*/
if (!(stp->ls_flags & NFSLCK_DELEGRECALL)) {
/*
* - do a recall callback, since not yet done
* For now, never allow truncate to be set. To use
* truncate safely, it must be guaranteed that the
* Remove, Rename or Setattr with size of 0 will
* succeed and that would require major changes to
* the VFS/Vnode OPs.
* Set the expiry time large enough so that it won't expire
* until after the callback, then set it correctly, once
* the callback is done. (The delegation will now time
* out whether or not the Recall worked ok. The timeout
* will be extended when ops are done on the delegation
* stateid, up to the timelimit.)
*/
stp->ls_delegtime = NFSD_MONOSEC + (2 * nfsrv_lease) +
NFSRV_LEASEDELTA;
stp->ls_delegtimelimit = NFSD_MONOSEC + (6 * nfsrv_lease) +
NFSRV_LEASEDELTA;
stp->ls_flags |= NFSLCK_DELEGRECALL;
/*
* Loop NFSRV_CBRETRYCNT times while the CBRecall replies
* NFSERR_BADSTATEID or NFSERR_BADHANDLE. This is done
* in order to try and avoid a race that could happen
* when a CBRecall request passed the Open reply with
* the delegation in it while transiting the network.
* Since nfsrv_docallback will sleep, don't use stp after
* the call.
*/
NFSBCOPY((caddr_t)&stp->ls_stateid, (caddr_t)&tstateid,
sizeof (tstateid));
NFSBCOPY((caddr_t)&stp->ls_lfp->lf_fh, (caddr_t)&tfh,
sizeof (tfh));
NFSUNLOCKSTATE();
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
retrycnt = 0;
do {
error = nfsrv_docallback(clp, NFSV4OP_CBRECALL,
&tstateid, 0, &tfh, NULL, NULL, p);
retrycnt++;
} while ((error == NFSERR_BADSTATEID ||
error == NFSERR_BADHANDLE) && retrycnt < NFSV4_CBRETRYCNT);
error = NFSERR_DELAY;
goto out;
}
if (clp->lc_expiry >= NFSD_MONOSEC &&
stp->ls_delegtime >= NFSD_MONOSEC) {
NFSUNLOCKSTATE();
/*
* A recall has been done, but it has not yet expired.
* So, RETURN_DELAY.
*/
if (*haslockp) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
error = NFSERR_DELAY;
goto out;
}
/*
* If we don't yet have the lock, just get it and then return,
* since we need that before deleting expired state, such as
* this delegation.
* When getting the lock, unlock the vnode, so other nfsds that
* are in progress won't get stuck waiting for the vnode lock.
*/
if (*haslockp == 0) {
NFSUNLOCKSTATE();
if (vp != NULL) {
lktype = NFSVOPISLOCKED(vp);
NFSVOPUNLOCK(vp, 0);
}
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
do {
gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
NFSV4ROOTLOCKMUTEXPTR, NULL);
} while (!gotlock);
NFSUNLOCKV4ROOTMUTEX();
*haslockp = 1;
if (vp != NULL) {
NFSVOPLOCK(vp, lktype | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) != 0) {
*haslockp = 0;
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
error = NFSERR_PERM;
goto out;
}
}
error = -1;
goto out;
}
NFSUNLOCKSTATE();
/*
* Ok, we can delete the expired delegation.
* First, write the Revoke record to stable storage and then
* clear out the conflict.
* Since all other nfsd threads are now blocked, we can safely
* sleep without the state changing.
*/
nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
nfsrv_backupstable();
if (clp->lc_expiry < NFSD_MONOSEC) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
LIST_REMOVE(clp, lc_hash);
zapped_clp = 1;
} else {
nfsrv_freedeleg(stp);
zapped_clp = 0;
}
if (zapped_clp)
nfsrv_zapclient(clp, p);
error = -1;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Check whether a remove is allowed, if remove is set to 1, and get rid
* of delegations.
*/
APPLESTATIC int
nfsrv_checkremove(vnode_t vp, int remove, NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslockfile *lfp;
int error, haslock = 0;
fhandle_t nfh;
/*
* First, get the lock file structure.
* (A return of -1 means no associated state, so remove ok.)
*/
error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
tryagain:
NFSLOCKSTATE();
if (!error)
error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
if (error) {
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
if (error == -1)
error = 0;
goto out;
}
/*
* Now, we must Recall any delegations.
*/
error = nfsrv_cleandeleg(vp, lfp, NULL, &haslock, p);
if (error) {
/*
* nfsrv_cleandeleg() unlocks state for non-zero
* return.
*/
if (error == -1)
goto tryagain;
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
goto out;
}
/*
* Now, look for a conflicting open share.
*/
if (remove) {
/*
* If the entry in the directory was the last reference to the
* corresponding filesystem object, the object can be destroyed.
*/
if (lfp->lf_usecount > 1)
LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
if (stp->ls_flags & NFSLCK_WRITEDENY) {
error = NFSERR_FILEOPEN;
break;
}
}
}
NFSUNLOCKSTATE();
if (haslock) {
NFSLOCKV4ROOTMUTEX();
nfsv4_unlock(&nfsv4rootfs_lock, 1);
NFSUNLOCKV4ROOTMUTEX();
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Clear out all delegations for the file referred to by lfp.
* May return NFSERR_DELAY, if there will be a delay waiting for
* delegations to expire.
* Returns -1 to indicate it slept while recalling a delegation.
* This function has the side effect of deleting the nfslockfile structure,
* if it no longer has associated state and didn't have to sleep.
* Unlocks State before a non-zero value is returned.
*/
static int
nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
struct nfsclient *clp, int *haslockp, NFSPROC_T *p)
{
struct nfsstate *stp, *nstp;
int ret = 0;
stp = LIST_FIRST(&lfp->lf_deleg);
while (stp != LIST_END(&lfp->lf_deleg)) {
nstp = LIST_NEXT(stp, ls_file);
if (stp->ls_clp != clp) {
ret = nfsrv_delegconflict(stp, haslockp, p, vp);
if (ret) {
/*
* nfsrv_delegconflict() unlocks state
* when it returns non-zero.
*/
goto out;
}
}
stp = nstp;
}
out:
NFSEXITCODE(ret);
return (ret);
}
/*
* There are certain operations that, when being done outside of NFSv4,
* require that any NFSv4 delegation for the file be recalled.
* This function is to be called for those cases:
* VOP_RENAME() - When a delegation is being recalled for any reason,
* the client may have to do Opens against the server, using the file's
* final component name. If the file has been renamed on the server,
* that component name will be incorrect and the Open will fail.
* VOP_REMOVE() - Theoretically, a client could Open a file after it has
* been removed on the server, if there is a delegation issued to
* that client for the file. I say "theoretically" since clients
* normally do an Access Op before the Open and that Access Op will
* fail with ESTALE. Note that NFSv2 and 3 don't even do Opens, so
* they will detect the file's removal in the same manner. (There is
* one case where RFC3530 allows a client to do an Open without first
* doing an Access Op, which is passage of a check against the ACE
* returned with a Write delegation, but current practice is to ignore
* the ACE and always do an Access Op.)
* Since the functions can only be called with an unlocked vnode, this
* can't be done at this time.
* VOP_ADVLOCK() - When a client holds a delegation, it can issue byte range
* locks locally in the client, which are not visible to the server. To
* deal with this, issuing of delegations for a vnode must be disabled
* and all delegations for the vnode recalled. This is done via the
* second function, using the VV_DISABLEDELEG vflag on the vnode.
*/
APPLESTATIC void
nfsd_recalldelegation(vnode_t vp, NFSPROC_T *p)
{
time_t starttime;
int error;
/*
* First, check to see if the server is currently running, that this is
* a regular file and that delegations are being issued.
*/
if (newnfs_numnfsd == 0 || vp->v_type != VREG ||
nfsrv_issuedelegs == 0)
return;
KASSERT((NFSVOPISLOCKED(vp) != LK_EXCLUSIVE), ("vp %p is locked", vp));
/*
* Now, get a reference on the nfsv4rootfs_lock so that an
* exclusive lock cannot be acquired by another thread.
*/
NFSLOCKV4ROOTMUTEX();
nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
NFSUNLOCKV4ROOTMUTEX();
/*
* Now, call nfsrv_checkremove() in a loop while it returns
* NFSERR_DELAY. Return upon any other error or when timed out.
*/
starttime = NFSD_MONOSEC;
do {
if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(vp, 0, p);
NFSVOPUNLOCK(vp, 0);
} else
error = EPERM;
if (error == NFSERR_DELAY) {
if (NFSD_MONOSEC - starttime > NFS_REMOVETIMEO)
break;
/* Sleep for a short period of time */
(void) nfs_catnap(PZERO, 0, "nfsremove");
}
} while (error == NFSERR_DELAY);
NFSLOCKV4ROOTMUTEX();
nfsv4_relref(&nfsv4rootfs_lock);
NFSUNLOCKV4ROOTMUTEX();
}
APPLESTATIC void
nfsd_disabledelegation(vnode_t vp, NFSPROC_T *p)
{
#ifdef VV_DISABLEDELEG
/*
* First, flag issuance of delegations disabled.
*/
atomic_set_long(&vp->v_vflag, VV_DISABLEDELEG);
#endif
/*
* Then call nfsd_recalldelegation() to get rid of all extant
* delegations.
*/
nfsd_recalldelegation(vp, p);
}
/*
* Check for conflicting locks, etc. and then get rid of delegations.
* (At one point I thought that I should get rid of delegations for any
* Setattr, since it could potentially disallow the I/O op (read or write)
* allowed by the delegation. However, Setattr Ops that aren't changing
* the size get a stateid of all 0s, so you can't tell if it is a delegation
* for the same client or a different one, so I decided to only get rid
* of delegations for other clients when the size is being changed.)
* In general, a Setattr can disable NFS I/O Ops that are outstanding, such
* as Write backs, even if there is no delegation, so it really isn't any
* different.
*/
APPLESTATIC int
nfsrv_checksetattr(vnode_t vp, struct nfsrv_descript *nd,
nfsv4stateid_t *stateidp, struct nfsvattr *nvap, nfsattrbit_t *attrbitp,
struct nfsexstuff *exp, NFSPROC_T *p)
{
struct nfsstate st, *stp = &st;
struct nfslock lo, *lop = &lo;
int error = 0;
nfsquad_t clientid;
if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE)) {
stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS);
lop->lo_first = nvap->na_size;
} else {
stp->ls_flags = 0;
lop->lo_first = 0;
}
if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNER) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNERGROUP) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_MODE) ||
NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_ACL))
stp->ls_flags |= NFSLCK_SETATTR;
if (stp->ls_flags == 0)
goto out;
lop->lo_end = NFS64BITSSET;
lop->lo_flags = NFSLCK_WRITE;
stp->ls_ownerlen = 0;
stp->ls_op = NULL;
stp->ls_uid = nd->nd_cred->cr_uid;
stp->ls_stateid.seqid = stateidp->seqid;
clientid.lval[0] = stp->ls_stateid.other[0] = stateidp->other[0];
clientid.lval[1] = stp->ls_stateid.other[1] = stateidp->other[1];
stp->ls_stateid.other[2] = stateidp->other[2];
error = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid,
stateidp, exp, nd, p);
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check for a write delegation and do a CBGETATTR if there is one, updating
* the attributes, as required.
* Should I return an error if I can't get the attributes? (For now, I'll
* just return ok.)
*/
APPLESTATIC int
nfsrv_checkgetattr(struct nfsrv_descript *nd, vnode_t vp,
struct nfsvattr *nvap, nfsattrbit_t *attrbitp, struct ucred *cred,
NFSPROC_T *p)
{
struct nfsstate *stp;
struct nfslockfile *lfp;
struct nfsclient *clp;
struct nfsvattr nva;
fhandle_t nfh;
int error = 0;
nfsattrbit_t cbbits;
u_quad_t delegfilerev;
NFSCBGETATTR_ATTRBIT(attrbitp, &cbbits);
if (!NFSNONZERO_ATTRBIT(&cbbits))
goto out;
if (nfsrv_writedelegcnt == 0)
goto out;
/*
* Get the lock file structure.
* (A return of -1 means no associated state, so return ok.)
*/
error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
NFSLOCKSTATE();
if (!error)
error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
if (error) {
NFSUNLOCKSTATE();
if (error == -1)
error = 0;
goto out;
}
/*
* Now, look for a write delegation.
*/
LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
if (stp->ls_flags & NFSLCK_DELEGWRITE)
break;
}
if (stp == LIST_END(&lfp->lf_deleg)) {
NFSUNLOCKSTATE();
goto out;
}
clp = stp->ls_clp;
delegfilerev = stp->ls_filerev;
/*
* If the Write delegation was issued as a part of this Compound RPC
* or if we have an Implied Clientid (used in a previous Op in this
* compound) and it is the client the delegation was issued to,
* just return ok.
* I also assume that it is from the same client iff the network
* host IP address is the same as the callback address. (Not
* exactly correct by the RFC, but avoids a lot of Getattr
* callbacks.)
*/
if (nd->nd_compref == stp->ls_compref ||
((nd->nd_flag & ND_IMPLIEDCLID) &&
clp->lc_clientid.qval == nd->nd_clientid.qval) ||
nfsaddr2_match(clp->lc_req.nr_nam, nd->nd_nam)) {
NFSUNLOCKSTATE();
goto out;
}
/*
* We are now done with the delegation state structure,
* so the statelock can be released and we can now tsleep().
*/
/*
* Now, we must do the CB Getattr callback, to see if Change or Size
* has changed.
*/
if (clp->lc_expiry >= NFSD_MONOSEC) {
NFSUNLOCKSTATE();
NFSVNO_ATTRINIT(&nva);
nva.na_filerev = NFS64BITSSET;
error = nfsrv_docallback(clp, NFSV4OP_CBGETATTR, NULL,
0, &nfh, &nva, &cbbits, p);
if (!error) {
if ((nva.na_filerev != NFS64BITSSET &&
nva.na_filerev > delegfilerev) ||
(NFSVNO_ISSETSIZE(&nva) &&
nva.na_size != nvap->na_size)) {
error = nfsvno_updfilerev(vp, nvap, cred, p);
if (NFSVNO_ISSETSIZE(&nva))
nvap->na_size = nva.na_size;
}
} else
error = 0; /* Ignore callback errors for now. */
} else {
NFSUNLOCKSTATE();
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* This function looks for openowners that haven't had any opens for
* a while and throws them away. Called by an nfsd when NFSNSF_NOOPENS
* is set.
*/
APPLESTATIC void
nfsrv_throwawayopens(NFSPROC_T *p)
{
struct nfsclient *clp, *nclp;
struct nfsstate *stp, *nstp;
int i;
NFSLOCKSTATE();
nfsrv_stablefirst.nsf_flags &= ~NFSNSF_NOOPENS;
/*
* For each client...
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH_SAFE(clp, &nfsclienthash[i], lc_hash, nclp) {
LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp) {
if (LIST_EMPTY(&stp->ls_open) &&
(stp->ls_noopens > NFSNOOPEN ||
(nfsrv_openpluslock * 2) >
nfsrv_v4statelimit))
nfsrv_freeopenowner(stp, 0, p);
}
}
}
NFSUNLOCKSTATE();
}
/*
* This function checks to see if the credentials are the same.
* Returns 1 for not same, 0 otherwise.
*/
static int
nfsrv_notsamecredname(struct nfsrv_descript *nd, struct nfsclient *clp)
{
if (nd->nd_flag & ND_GSS) {
if (!(clp->lc_flags & LCL_GSS))
return (1);
if (clp->lc_flags & LCL_NAME) {
if (nd->nd_princlen != clp->lc_namelen ||
NFSBCMP(nd->nd_principal, clp->lc_name,
clp->lc_namelen))
return (1);
else
return (0);
}
if (nd->nd_cred->cr_uid == clp->lc_uid)
return (0);
else
return (1);
} else if (clp->lc_flags & LCL_GSS)
return (1);
/*
* For AUTH_SYS, allow the same uid or root. (This is underspecified
* in RFC3530, which talks about principals, but doesn't say anything
* about uids for AUTH_SYS.)
*/
if (nd->nd_cred->cr_uid == clp->lc_uid || nd->nd_cred->cr_uid == 0)
return (0);
else
return (1);
}
/*
* Calculate the lease expiry time.
*/
static time_t
nfsrv_leaseexpiry(void)
{
if (nfsrv_stablefirst.nsf_eograce > NFSD_MONOSEC)
return (NFSD_MONOSEC + 2 * (nfsrv_lease + NFSRV_LEASEDELTA));
return (NFSD_MONOSEC + nfsrv_lease + NFSRV_LEASEDELTA);
}
/*
* Delay the delegation timeout as far as ls_delegtimelimit, as required.
*/
static void
nfsrv_delaydelegtimeout(struct nfsstate *stp)
{
if ((stp->ls_flags & NFSLCK_DELEGRECALL) == 0)
return;
if ((stp->ls_delegtime + 15) > NFSD_MONOSEC &&
stp->ls_delegtime < stp->ls_delegtimelimit) {
stp->ls_delegtime += nfsrv_lease;
if (stp->ls_delegtime > stp->ls_delegtimelimit)
stp->ls_delegtime = stp->ls_delegtimelimit;
}
}
/*
* This function checks to see if there is any other state associated
* with the openowner for this Open.
* It returns 1 if there is no other state, 0 otherwise.
*/
static int
nfsrv_nootherstate(struct nfsstate *stp)
{
struct nfsstate *tstp;
LIST_FOREACH(tstp, &stp->ls_openowner->ls_open, ls_list) {
if (tstp != stp || !LIST_EMPTY(&tstp->ls_lock))
return (0);
}
return (1);
}
/*
* Create a list of lock deltas (changes to local byte range locking
* that can be rolled back using the list) and apply the changes via
* nfsvno_advlock(). Optionally, lock the list. It is expected that either
* the rollback or update function will be called after this.
* It returns an error (and rolls back, as required), if any nfsvno_advlock()
* call fails. If it returns an error, it will unlock the list.
*/
static int
nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
{
struct nfslock *lop, *nlop;
int error = 0;
/* Loop through the list of locks. */
lop = LIST_FIRST(&lfp->lf_locallock);
while (first < end && lop != NULL) {
nlop = LIST_NEXT(lop, lo_lckowner);
if (first >= lop->lo_end) {
/* not there yet */
lop = nlop;
} else if (first < lop->lo_first) {
/* new one starts before entry in list */
if (end <= lop->lo_first) {
/* no overlap between old and new */
error = nfsrv_dolocal(vp, lfp, flags,
NFSLCK_UNLOCK, first, end, cfp, p);
if (error != 0)
break;
first = end;
} else {
/* handle fragment overlapped with new one */
error = nfsrv_dolocal(vp, lfp, flags,
NFSLCK_UNLOCK, first, lop->lo_first, cfp,
p);
if (error != 0)
break;
first = lop->lo_first;
}
} else {
/* new one overlaps this entry in list */
if (end <= lop->lo_end) {
/* overlaps all of new one */
error = nfsrv_dolocal(vp, lfp, flags,
lop->lo_flags, first, end, cfp, p);
if (error != 0)
break;
first = end;
} else {
/* handle fragment overlapped with new one */
error = nfsrv_dolocal(vp, lfp, flags,
lop->lo_flags, first, lop->lo_end, cfp, p);
if (error != 0)
break;
first = lop->lo_end;
lop = nlop;
}
}
}
if (first < end && error == 0)
/* handle fragment past end of list */
error = nfsrv_dolocal(vp, lfp, flags, NFSLCK_UNLOCK, first,
end, cfp, p);
NFSEXITCODE(error);
return (error);
}
/*
* Local lock unlock. Unlock all byte ranges that are no longer locked
* by NFSv4. To do this, unlock any subranges of first-->end that
* do not overlap with the byte ranges of any lock in the lfp->lf_lock
* list. This list has all locks for the file held by other
* <clientid, lockowner> tuples. The list is ordered by increasing
* lo_first value, but may have entries that overlap each other, for
* the case of read locks.
*/
static void
nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp, uint64_t init_first,
uint64_t init_end, NFSPROC_T *p)
{
struct nfslock *lop;
uint64_t first, end, prevfirst;
first = init_first;
end = init_end;
while (first < init_end) {
/* Loop through all nfs locks, adjusting first and end */
prevfirst = 0;
LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
KASSERT(prevfirst <= lop->lo_first,
("nfsv4 locks out of order"));
KASSERT(lop->lo_first < lop->lo_end,
("nfsv4 bogus lock"));
prevfirst = lop->lo_first;
if (first >= lop->lo_first &&
first < lop->lo_end)
/*
* Overlaps with initial part, so trim
* off that initial part by moving first past
* it.
*/
first = lop->lo_end;
else if (end > lop->lo_first &&
lop->lo_first > first) {
/*
* This lock defines the end of the
* segment to unlock, so set end to the
* start of it and break out of the loop.
*/
end = lop->lo_first;
break;
}
if (first >= end)
/*
* There is no segment left to do, so
* break out of this loop and then exit
* the outer while() since first will be set
* to end, which must equal init_end here.
*/
break;
}
if (first < end) {
/* Unlock this segment */
(void) nfsrv_dolocal(vp, lfp, NFSLCK_UNLOCK,
NFSLCK_READ, first, end, NULL, p);
nfsrv_locallock_commit(lfp, NFSLCK_UNLOCK,
first, end);
}
/*
* Now move past this segment and look for any further
* segment in the range, if there is one.
*/
first = end;
end = init_end;
}
}
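/*
 * Illustrative sketch only, excluded from compilation: the gap finding
 * pass that nfsrv_localunlock() performs, i.e. walking a list of byte
 * ranges ordered by start offset (which may overlap, for read locks) and
 * unlocking only the pieces of [first, end) not covered by any of them.
 * The names below (struct lockrange, unlock_gaps()) are hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct lockrange { uint64_t first, end; };

static void
unlock_gaps(const struct lockrange *l, int n, uint64_t first, uint64_t end)
{
	uint64_t segend;
	int i;

	while (first < end) {
		segend = end;
		for (i = 0; i < n; i++) {
			if (first >= l[i].first && first < l[i].end) {
				/* Covered here; skip past this lock. */
				first = l[i].end;
			} else if (l[i].first > first && l[i].first < segend) {
				/* The next lock bounds the current gap. */
				segend = l[i].first;
				break;
			}
			if (first >= segend)
				break;
		}
		if (first < segend)
			printf("unlock [%ju,%ju)\n", (uintmax_t)first,
			    (uintmax_t)segend);
		first = segend;
	}
}

int
main(void)
{
	struct lockrange held[] = { { 10, 20 }, { 15, 30 }, { 50, 60 } };

	unlock_gaps(held, 3, 0, 100);	/* prints [0,10), [30,50), [60,100) */
	return (0);
}
#endif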
/*
* Do the local lock operation and update the rollback list, as required.
* Perform the rollback and return the error if nfsvno_advlock() fails.
*/
static int
nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags, int oldflags,
uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
{
struct nfsrollback *rlp;
int error = 0, ltype, oldltype;
if (flags & NFSLCK_WRITE)
ltype = F_WRLCK;
else if (flags & NFSLCK_READ)
ltype = F_RDLCK;
else
ltype = F_UNLCK;
if (oldflags & NFSLCK_WRITE)
oldltype = F_WRLCK;
else if (oldflags & NFSLCK_READ)
oldltype = F_RDLCK;
else
oldltype = F_UNLCK;
if (ltype == oldltype || (oldltype == F_WRLCK && ltype == F_RDLCK))
/* nothing to do */
goto out;
error = nfsvno_advlock(vp, ltype, first, end, p);
if (error != 0) {
if (cfp != NULL) {
cfp->cl_clientid.lval[0] = 0;
cfp->cl_clientid.lval[1] = 0;
cfp->cl_first = 0;
cfp->cl_end = NFS64BITSSET;
cfp->cl_flags = NFSLCK_WRITE;
cfp->cl_ownerlen = 5;
NFSBCOPY("LOCAL", cfp->cl_owner, 5);
}
nfsrv_locallock_rollback(vp, lfp, p);
} else if (ltype != F_UNLCK) {
rlp = malloc(sizeof (struct nfsrollback), M_NFSDROLLBACK,
M_WAITOK);
rlp->rlck_first = first;
rlp->rlck_end = end;
rlp->rlck_type = oldltype;
LIST_INSERT_HEAD(&lfp->lf_rollback, rlp, rlck_list);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Roll back local lock changes and free up the rollback list.
*/
static void
nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp, NFSPROC_T *p)
{
struct nfsrollback *rlp, *nrlp;
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp) {
(void) nfsvno_advlock(vp, rlp->rlck_type, rlp->rlck_first,
rlp->rlck_end, p);
free(rlp, M_NFSDROLLBACK);
}
LIST_INIT(&lfp->lf_rollback);
}
/*
* Update the local lock list and delete the rollback list (i.e., the changes
* are now committed to the local locks).  Most of the work is done by the
* internal function nfsrv_updatelock().
*/
static void
nfsrv_locallock_commit(struct nfslockfile *lfp, int flags, uint64_t first,
uint64_t end)
{
struct nfsrollback *rlp, *nrlp;
struct nfslock *new_lop, *other_lop;
new_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK, M_WAITOK);
if (flags & (NFSLCK_READ | NFSLCK_WRITE))
other_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK,
M_WAITOK);
else
other_lop = NULL;
new_lop->lo_flags = flags;
new_lop->lo_first = first;
new_lop->lo_end = end;
nfsrv_updatelock(NULL, &new_lop, &other_lop, lfp);
if (new_lop != NULL)
free(new_lop, M_NFSDLOCK);
if (other_lop != NULL)
free(other_lop, M_NFSDLOCK);
/* and get rid of the rollback list */
LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp)
free(rlp, M_NFSDROLLBACK);
LIST_INIT(&lfp->lf_rollback);
}
/*
* Lock the struct nfslockfile for local lock updating.
*/
static void
nfsrv_locklf(struct nfslockfile *lfp)
{
int gotlock;
/* lf_usecount ensures *lfp won't be free'd */
lfp->lf_usecount++;
do {
gotlock = nfsv4_lock(&lfp->lf_locallock_lck, 1, NULL,
NFSSTATEMUTEXPTR, NULL);
} while (gotlock == 0);
lfp->lf_usecount--;
}
/*
* Unlock the struct nfslockfile after local lock updating.
*/
static void
nfsrv_unlocklf(struct nfslockfile *lfp)
{
nfsv4_unlock(&lfp->lf_locallock_lck, 0);
}
/*
* Clear out all state for the NFSv4 server.
* Must be called by a thread that can sleep when no nfsds are running.
*/
void
nfsrv_throwawayallstate(NFSPROC_T *p)
{
struct nfsclient *clp, *nclp;
struct nfslockfile *lfp, *nlfp;
int i;
/*
* For each client, clean out the state and then free the structure.
*/
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH_SAFE(clp, &nfsclienthash[i], lc_hash, nclp) {
nfsrv_cleanclient(clp, p);
nfsrv_freedeleglist(&clp->lc_deleg);
nfsrv_freedeleglist(&clp->lc_olddeleg);
free(clp->lc_stateid, M_NFSDCLIENT);
free(clp, M_NFSDCLIENT);
}
}
/*
* Also, free up any remaining lock file structures.
*/
for (i = 0; i < nfsrv_lockhashsize; i++) {
LIST_FOREACH_SAFE(lfp, &nfslockhash[i], lf_hash, nlfp) {
printf("nfsd unload: fnd a lock file struct\n");
nfsrv_freenfslockfile(lfp);
}
}
}
/*
* Check the sequence# for the session and slot provided as an argument.
* Also, renew the lease if the session will return NFS_OK.
*/
int
nfsrv_checksequence(struct nfsrv_descript *nd, uint32_t sequenceid,
uint32_t *highest_slotidp, uint32_t *target_highest_slotidp, int cache_this,
uint32_t *sflagsp, NFSPROC_T *p)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
int error;
SVCXPRT *savxprt;
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
return (NFSERR_BADSESSION);
}
error = nfsv4_seqsession(sequenceid, nd->nd_slotid, *highest_slotidp,
sep->sess_slots, NULL, NFSV4_SLOTS - 1);
if (error != 0) {
NFSUNLOCKSESSION(shp);
return (error);
}
if (cache_this != 0)
nd->nd_flag |= ND_SAVEREPLY;
/* Renew the lease. */
sep->sess_clp->lc_expiry = nfsrv_leaseexpiry();
nd->nd_clientid.qval = sep->sess_clp->lc_clientid.qval;
nd->nd_flag |= ND_IMPLIEDCLID;
/*
* If this session handles the backchannel, save the nd_xprt for this
* RPC, since this is the one being used.
*/
if (sep->sess_clp->lc_req.nr_client != NULL &&
(sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0) {
savxprt = sep->sess_cbsess.nfsess_xprt;
SVC_ACQUIRE(nd->nd_xprt);
nd->nd_xprt->xp_p2 =
sep->sess_clp->lc_req.nr_client->cl_private;
nd->nd_xprt->xp_idletimeout = 0; /* Disable timeout. */
sep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
if (savxprt != NULL)
SVC_RELEASE(savxprt);
}
*sflagsp = 0;
if (sep->sess_clp->lc_req.nr_client == NULL)
*sflagsp |= NFSV4SEQ_CBPATHDOWN;
NFSUNLOCKSESSION(shp);
if (error == NFSERR_EXPIRED) {
*sflagsp |= NFSV4SEQ_EXPIREDALLSTATEREVOKED;
error = 0;
} else if (error == NFSERR_ADMINREVOKED) {
*sflagsp |= NFSV4SEQ_ADMINSTATEREVOKED;
error = 0;
}
*highest_slotidp = *target_highest_slotidp = NFSV4_SLOTS - 1;
return (0);
}
/*
* Check/set reclaim complete for this session/clientid.
*/
int
nfsrv_checkreclaimcomplete(struct nfsrv_descript *nd)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
int error = 0;
shp = NFSSESSIONHASH(nd->nd_sessionid);
NFSLOCKSTATE();
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(nd->nd_sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (NFSERR_BADSESSION);
}
/* Check to see if reclaim complete has already happened. */
if ((sep->sess_clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0)
error = NFSERR_COMPLETEALREADY;
else
sep->sess_clp->lc_flags |= LCL_RECLAIMCOMPLETE;
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (error);
}
/*
* Cache the reply in a session slot.
*/
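/*
 * Note (general NFSv4.1 session behaviour, not specific to this call):
 * the per-slot reply cache is what provides exactly-once semantics; a
 * retry carrying the same slot and sequence number is expected to be
 * answered from the cached reply rather than by re-executing the
 * compound.
 */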
void
nfsrv_cache_session(uint8_t *sessionid, uint32_t slotid, int repstat,
struct mbuf **m)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
shp = NFSSESSIONHASH(sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(sessionid);
if (sep == NULL) {
NFSUNLOCKSESSION(shp);
printf("nfsrv_cache_session: no session\n");
m_freem(*m);
return;
}
nfsv4_seqsess_cacherep(slotid, sep->sess_slots, repstat, m);
NFSUNLOCKSESSION(shp);
}
/*
* Search for a session that matches the sessionid.
*/
static struct nfsdsession *
nfsrv_findsession(uint8_t *sessionid)
{
struct nfsdsession *sep;
struct nfssessionhash *shp;
shp = NFSSESSIONHASH(sessionid);
LIST_FOREACH(sep, &shp->list, sess_hash) {
if (!NFSBCMP(sessionid, sep->sess_sessionid, NFSX_V4SESSIONID))
break;
}
return (sep);
}
/*
* Destroy a session.
*/
int
nfsrv_destroysession(struct nfsrv_descript *nd, uint8_t *sessionid)
{
int error, samesess;
samesess = 0;
if (!NFSBCMP(sessionid, nd->nd_sessionid, NFSX_V4SESSIONID)) {
samesess = 1;
if ((nd->nd_flag & ND_LASTOP) == 0)
return (NFSERR_BADSESSION);
}
error = nfsrv_freesession(NULL, sessionid);
if (error == 0 && samesess != 0)
nd->nd_flag &= ~ND_HASSEQUENCE;
return (error);
}
/*
* Free up a session structure.
*/
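/*
 * The session is reference counted; the structure, along with any
 * cached slot replies and the backchannel xprt reference, is only
 * released once sess_refcnt drops to zero below.
 */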
static int
nfsrv_freesession(struct nfsdsession *sep, uint8_t *sessionid)
{
struct nfssessionhash *shp;
int i;
NFSLOCKSTATE();
if (sep == NULL) {
shp = NFSSESSIONHASH(sessionid);
NFSLOCKSESSION(shp);
sep = nfsrv_findsession(sessionid);
} else {
shp = NFSSESSIONHASH(sep->sess_sessionid);
NFSLOCKSESSION(shp);
}
if (sep != NULL) {
sep->sess_refcnt--;
if (sep->sess_refcnt > 0) {
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
return (0);
}
LIST_REMOVE(sep, sess_hash);
LIST_REMOVE(sep, sess_list);
}
NFSUNLOCKSESSION(shp);
NFSUNLOCKSTATE();
if (sep == NULL)
return (NFSERR_BADSESSION);
for (i = 0; i < NFSV4_SLOTS; i++)
if (sep->sess_slots[i].nfssl_reply != NULL)
m_freem(sep->sess_slots[i].nfssl_reply);
if (sep->sess_cbsess.nfsess_xprt != NULL)
SVC_RELEASE(sep->sess_cbsess.nfsess_xprt);
free(sep, M_NFSDSESSION);
return (0);
}
/*
* Free a stateid.
* RFC5661 says that it should fail when there are associated opens, locks
* or delegations. Since stateids represent opens, I don't see how you can
* free an open stateid (it will be free'd when closed), so this function
* only works for lock stateids (freeing the lock_owner) or delegations.
*/
int
nfsrv_freestateid(struct nfsrv_descript *nd, nfsv4stateid_t *stateidp,
NFSPROC_T *p)
{
struct nfsclient *clp;
struct nfsstate *stp;
int error;
NFSLOCKSTATE();
/*
* Look up the stateid
*/
error = nfsrv_getclient((nfsquad_t)((u_quad_t)0), CLOPS_RENEW, &clp,
NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p);
if (error == 0) {
/* First, check for a delegation. */
LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
NFSX_STATEIDOTHER))
break;
}
if (stp != NULL) {
nfsrv_freedeleg(stp);
NFSUNLOCKSTATE();
return (error);
}
}
/* Not a delegation, try for a lock_owner. */
if (error == 0)
error = nfsrv_getstate(clp, stateidp, 0, &stp);
if (error == 0 && ((stp->ls_flags & (NFSLCK_OPEN | NFSLCK_DELEGREAD |
NFSLCK_DELEGWRITE)) != 0 || (stp->ls_flags & NFSLCK_LOCK) == 0))
/* Not a lock_owner stateid. */
error = NFSERR_LOCKSHELD;
if (error == 0 && !LIST_EMPTY(&stp->ls_lock))
error = NFSERR_LOCKSHELD;
if (error == 0)
nfsrv_freelockowner(stp, NULL, 0, p);
NFSUNLOCKSTATE();
return (error);
}
/*
* Generate the xdr for an NFSv4.1 CBSequence Operation.
*/
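/*
 * The arguments are laid out as the session id followed by five 32-bit
 * words: the slot sequence number, the slot id, the highest slot id,
 * the cachethis boolean and an (empty) referring-call-list count.
 */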
static int
nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
int dont_replycache, struct nfsdsession **sepp)
{
struct nfsdsession *sep;
uint32_t *tl, slotseq = 0;
int maxslot, slotpos;
uint8_t sessionid[NFSX_V4SESSIONID];
int error;
error = nfsv4_getcbsession(clp, sepp);
if (error != 0)
return (error);
sep = *sepp;
(void)nfsv4_sequencelookup(NULL, &sep->sess_cbsess, &slotpos, &maxslot,
&slotseq, sessionid);
KASSERT(maxslot >= 0, ("nfsv4_setcbsequence neg maxslot"));
/* Build the Sequence arguments. */
NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 5 * NFSX_UNSIGNED);
bcopy(sessionid, tl, NFSX_V4SESSIONID);
tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
nd->nd_slotseq = tl;
*tl++ = txdr_unsigned(slotseq);
*tl++ = txdr_unsigned(slotpos);
*tl++ = txdr_unsigned(maxslot);
if (dont_replycache == 0)
*tl++ = newnfs_true;
else
*tl++ = newnfs_false;
*tl = 0; /* No referring call list, for now. */
nd->nd_flag |= ND_HASSEQUENCE;
return (0);
}
/*
* Get a session for the callback.
*/
static int
nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp)
{
struct nfsdsession *sep;
NFSLOCKSTATE();
LIST_FOREACH(sep, &clp->lc_session, sess_list) {
if ((sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0)
break;
}
if (sep == NULL) {
NFSUNLOCKSTATE();
return (NFSERR_BADSESSION);
}
sep->sess_refcnt++;
*sepp = sep;
NFSUNLOCKSTATE();
return (0);
}
/*
* Free up all backchannel xprts. This needs to be done when the nfsd threads
* exit, since those transports will all be going away.
* This is only called after all the nfsd threads are done performing RPCs,
* so locking shouldn't be an issue.
*/
APPLESTATIC void
nfsrv_freeallbackchannel_xprts(void)
{
struct nfsdsession *sep;
struct nfsclient *clp;
SVCXPRT *xprt;
int i;
for (i = 0; i < nfsrv_clienthashsize; i++) {
LIST_FOREACH(clp, &nfsclienthash[i], lc_hash) {
LIST_FOREACH(sep, &clp->lc_session, sess_list) {
xprt = sep->sess_cbsess.nfsess_xprt;
sep->sess_cbsess.nfsess_xprt = NULL;
if (xprt != NULL)
SVC_RELEASE(xprt);
}
}
}
}
Index: head/sys/geom/geom_subr.c
===================================================================
--- head/sys/geom/geom_subr.c (revision 327172)
+++ head/sys/geom/geom_subr.c (revision 327173)
@@ -1,1572 +1,1571 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
* and NAI Labs, the Security Research Division of Network Associates, Inc.
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The names of the authors may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/devicestat.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/sbuf.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <machine/stdarg.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KDB
#include <sys/kdb.h>
#endif
struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes);
static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms);
char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim;
struct g_hh00 {
struct g_class *mp;
struct g_provider *pp;
off_t size;
int error;
int post;
};
/*
* This event offers a new class a chance to taste all preexisting providers.
*/
static void
g_load_class(void *arg, int flag)
{
struct g_hh00 *hh;
struct g_class *mp2, *mp;
struct g_geom *gp;
struct g_provider *pp;
g_topology_assert();
if (flag == EV_CANCEL) /* XXX: can't happen ? */
return;
if (g_shutdown)
return;
hh = arg;
mp = hh->mp;
hh->error = 0;
if (hh->post) {
g_free(hh);
hh = NULL;
}
g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name);
KASSERT(mp->name != NULL && *mp->name != '\0',
("GEOM class has no name"));
LIST_FOREACH(mp2, &g_classes, class) {
if (mp2 == mp) {
printf("The GEOM class %s is already loaded.\n",
mp2->name);
if (hh != NULL)
hh->error = EEXIST;
return;
} else if (strcmp(mp2->name, mp->name) == 0) {
printf("A GEOM class %s is already loaded.\n",
mp2->name);
if (hh != NULL)
hh->error = EEXIST;
return;
}
}
LIST_INIT(&mp->geom);
LIST_INSERT_HEAD(&g_classes, mp, class);
if (mp->init != NULL)
mp->init(mp);
if (mp->taste == NULL)
return;
LIST_FOREACH(mp2, &g_classes, class) {
if (mp == mp2)
continue;
LIST_FOREACH(gp, &mp2->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
}
}
static int
g_unload_class(struct g_class *mp)
{
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp;
int error;
g_topology_lock();
g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name);
retry:
G_VALID_CLASS(mp);
LIST_FOREACH(gp, &mp->geom, geom) {
/* We refuse to unload if anything is open */
LIST_FOREACH(pp, &gp->provider, provider)
if (pp->acr || pp->acw || pp->ace) {
g_topology_unlock();
return (EBUSY);
}
LIST_FOREACH(cp, &gp->consumer, consumer)
if (cp->acr || cp->acw || cp->ace) {
g_topology_unlock();
return (EBUSY);
}
/* If the geom is withering, wait for it to finish. */
if (gp->flags & G_GEOM_WITHER) {
g_topology_sleep(mp, 1);
goto retry;
}
}
/*
* We allow unloading if we have no geoms, or a class
* method we can use to get rid of them.
*/
if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) {
g_topology_unlock();
return (EOPNOTSUPP);
}
/* Bar new entries */
mp->taste = NULL;
mp->config = NULL;
LIST_FOREACH(gp, &mp->geom, geom) {
error = mp->destroy_geom(NULL, mp, gp);
if (error != 0) {
g_topology_unlock();
return (error);
}
}
/* Wait for withering to finish. */
for (;;) {
gp = LIST_FIRST(&mp->geom);
if (gp == NULL)
break;
KASSERT(gp->flags & G_GEOM_WITHER,
("Non-withering geom in class %s", mp->name));
g_topology_sleep(mp, 1);
}
G_VALID_CLASS(mp);
if (mp->fini != NULL)
mp->fini(mp);
LIST_REMOVE(mp, class);
g_topology_unlock();
return (0);
}
int
g_modevent(module_t mod, int type, void *data)
{
struct g_hh00 *hh;
int error;
static int g_ignition;
struct g_class *mp;
mp = data;
if (mp->version != G_VERSION) {
printf("GEOM class %s has Wrong version %x\n",
mp->name, mp->version);
return (EINVAL);
}
if (!g_ignition) {
g_ignition++;
g_init();
}
error = EOPNOTSUPP;
switch (type) {
case MOD_LOAD:
g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name);
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->mp = mp;
/*
* Once the system is no longer cold, MOD_LOAD calls will come
* from userland and the g_event thread will be able
* to acknowledge their completion.
*/
if (cold) {
hh->post = 1;
error = g_post_event(g_load_class, hh, M_WAITOK, NULL);
} else {
error = g_waitfor_event(g_load_class, hh, M_WAITOK,
NULL);
if (error == 0)
error = hh->error;
g_free(hh);
}
break;
case MOD_UNLOAD:
g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name);
error = g_unload_class(mp);
if (error == 0) {
KASSERT(LIST_EMPTY(&mp->geom),
("Unloaded class (%s) still has geom", mp->name));
}
break;
}
return (error);
}
static void
g_retaste_event(void *arg, int flag)
{
struct g_class *mp, *mp2;
struct g_geom *gp;
struct g_hh00 *hh;
struct g_provider *pp;
struct g_consumer *cp;
g_topology_assert();
if (flag == EV_CANCEL) /* XXX: can't happen ? */
return;
if (g_shutdown || g_notaste)
return;
hh = arg;
mp = hh->mp;
hh->error = 0;
if (hh->post) {
g_free(hh);
hh = NULL;
}
g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name);
LIST_FOREACH(mp2, &g_classes, class) {
LIST_FOREACH(gp, &mp2->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
if (pp->acr || pp->acw || pp->ace)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers) {
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
}
if (cp != NULL) {
cp->flags |= G_CF_ORPHAN;
g_wither_geom(cp->geom, ENXIO);
}
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
}
}
int
g_retaste(struct g_class *mp)
{
struct g_hh00 *hh;
int error;
if (mp->taste == NULL)
return (EINVAL);
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->mp = mp;
if (cold) {
hh->post = 1;
error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL);
} else {
error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL);
if (error == 0)
error = hh->error;
g_free(hh);
}
return (error);
}
struct g_geom *
g_new_geomf(struct g_class *mp, const char *fmt, ...)
{
struct g_geom *gp;
va_list ap;
struct sbuf *sb;
g_topology_assert();
G_VALID_CLASS(mp);
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO);
gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
gp->class = mp;
gp->rank = 1;
LIST_INIT(&gp->consumer);
LIST_INIT(&gp->provider);
LIST_INIT(&gp->aliases);
LIST_INSERT_HEAD(&mp->geom, gp, geom);
TAILQ_INSERT_HEAD(&geoms, gp, geoms);
strcpy(gp->name, sbuf_data(sb));
sbuf_delete(sb);
/* Fill in defaults from class */
gp->start = mp->start;
gp->spoiled = mp->spoiled;
gp->attrchanged = mp->attrchanged;
gp->providergone = mp->providergone;
gp->dumpconf = mp->dumpconf;
gp->access = mp->access;
gp->orphan = mp->orphan;
gp->ioctl = mp->ioctl;
gp->resize = mp->resize;
return (gp);
}
void
g_destroy_geom(struct g_geom *gp)
{
struct g_geom_alias *gap, *gaptmp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name);
KASSERT(LIST_EMPTY(&gp->consumer),
("g_destroy_geom(%s) with consumer(s) [%p]",
gp->name, LIST_FIRST(&gp->consumer)));
KASSERT(LIST_EMPTY(&gp->provider),
("g_destroy_geom(%s) with provider(s) [%p]",
gp->name, LIST_FIRST(&gp->provider)));
g_cancel_event(gp);
LIST_REMOVE(gp, geom);
TAILQ_REMOVE(&geoms, gp, geoms);
LIST_FOREACH_SAFE(gap, &gp->aliases, ga_next, gaptmp)
g_free(gap);
g_free(gp->name);
g_free(gp);
}
/*
* This function is called (repeatedly) until the geom has withered away.
*/
void
g_wither_geom(struct g_geom *gp, int error)
{
struct g_provider *pp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name);
if (!(gp->flags & G_GEOM_WITHER)) {
gp->flags |= G_GEOM_WITHER;
LIST_FOREACH(pp, &gp->provider, provider)
if (!(pp->flags & G_PF_ORPHAN))
g_orphan_provider(pp, error);
}
g_do_wither();
}
/*
* Convenience function to destroy a particular provider.
*/
void
g_wither_provider(struct g_provider *pp, int error)
{
pp->flags |= G_PF_WITHER;
if (!(pp->flags & G_PF_ORPHAN))
g_orphan_provider(pp, error);
}
/*
* This function is called (repeatedly) until the geom has withered away.
*/
void
g_wither_geom_close(struct g_geom *gp, int error)
{
struct g_consumer *cp;
g_topology_assert();
G_VALID_GEOM(gp);
g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name);
LIST_FOREACH(cp, &gp->consumer, consumer)
if (cp->acr || cp->acw || cp->ace)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
g_wither_geom(gp, error);
}
/*
* This function is called (repeatedly) until we can't wash away any more
* withered bits at present.
*/
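/*
 * The sweep below destroys withered providers that have no consumers,
 * then, for withering geoms, detaches and destroys consumers with no
 * open access counts, and finally destroys the geom itself once it has
 * neither providers nor consumers left.
 */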
void
g_wither_washer()
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_provider *pp, *pp2;
struct g_consumer *cp, *cp2;
g_topology_assert();
LIST_FOREACH(mp, &g_classes, class) {
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
if (!(pp->flags & G_PF_WITHER))
continue;
if (LIST_EMPTY(&pp->consumers))
g_destroy_provider(pp);
}
if (!(gp->flags & G_GEOM_WITHER))
continue;
LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
if (LIST_EMPTY(&pp->consumers))
g_destroy_provider(pp);
}
LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) {
if (cp->acr || cp->acw || cp->ace)
continue;
if (cp->provider != NULL)
g_detach(cp);
g_destroy_consumer(cp);
}
if (LIST_EMPTY(&gp->provider) &&
LIST_EMPTY(&gp->consumer))
g_destroy_geom(gp);
}
}
}
struct g_consumer *
g_new_consumer(struct g_geom *gp)
{
struct g_consumer *cp;
g_topology_assert();
G_VALID_GEOM(gp);
KASSERT(!(gp->flags & G_GEOM_WITHER),
("g_new_consumer on WITHERing geom(%s) (class %s)",
gp->name, gp->class->name));
KASSERT(gp->orphan != NULL,
("g_new_consumer on geom(%s) (class %s) without orphan",
gp->name, gp->class->name));
cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO);
cp->geom = gp;
cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
LIST_INSERT_HEAD(&gp->consumer, cp, consumer);
return(cp);
}
void
g_destroy_consumer(struct g_consumer *cp)
{
struct g_geom *gp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp);
KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached"));
KASSERT (cp->acr == 0, ("g_destroy_consumer with acr"));
KASSERT (cp->acw == 0, ("g_destroy_consumer with acw"));
KASSERT (cp->ace == 0, ("g_destroy_consumer with ace"));
g_cancel_event(cp);
gp = cp->geom;
LIST_REMOVE(cp, consumer);
devstat_remove_entry(cp->stat);
g_free(cp);
if (gp->flags & G_GEOM_WITHER)
g_do_wither();
}
static void
g_new_provider_event(void *arg, int flag)
{
struct g_class *mp;
struct g_provider *pp;
struct g_consumer *cp, *next_cp;
g_topology_assert();
if (flag == EV_CANCEL)
return;
if (g_shutdown)
return;
pp = arg;
G_VALID_PROVIDER(pp);
KASSERT(!(pp->flags & G_PF_WITHER),
("g_new_provider_event but withered"));
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) {
if ((cp->flags & G_CF_ORPHAN) == 0 &&
cp->geom->attrchanged != NULL)
cp->geom->attrchanged(cp, "GEOM::media");
}
if (g_notaste)
return;
LIST_FOREACH(mp, &g_classes, class) {
if (mp->taste == NULL)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers)
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
if (cp != NULL)
continue;
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
struct g_provider *
g_new_providerf(struct g_geom *gp, const char *fmt, ...)
{
struct g_provider *pp;
struct sbuf *sb;
va_list ap;
g_topology_assert();
G_VALID_GEOM(gp);
KASSERT(gp->access != NULL,
("new provider on geom(%s) without ->access (class %s)",
gp->name, gp->class->name));
KASSERT(gp->start != NULL,
("new provider on geom(%s) without ->start (class %s)",
gp->name, gp->class->name));
KASSERT(!(gp->flags & G_GEOM_WITHER),
("new provider on WITHERing geom(%s) (class %s)",
gp->name, gp->class->name));
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO);
pp->name = (char *)(pp + 1);
strcpy(pp->name, sbuf_data(sb));
sbuf_delete(sb);
LIST_INIT(&pp->consumers);
pp->error = ENXIO;
pp->geom = gp;
pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED,
DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
LIST_INSERT_HEAD(&gp->provider, pp, provider);
g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL);
return (pp);
}
void
g_error_provider(struct g_provider *pp, int error)
{
/* G_VALID_PROVIDER(pp); We may not have g_topology */
pp->error = error;
}
static void
g_resize_provider_event(void *arg, int flag)
{
struct g_hh00 *hh;
struct g_class *mp;
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp, *cp2;
off_t size;
g_topology_assert();
if (g_shutdown)
return;
hh = arg;
pp = hh->pp;
size = hh->size;
g_free(hh);
G_VALID_PROVIDER(pp);
KASSERT(!(pp->flags & G_PF_WITHER),
("g_resize_provider_event but withered"));
g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp);
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
gp = cp->geom;
if (gp->resize == NULL && size < pp->mediasize) {
/*
* XXX: The g_dev_orphan method does deferred destruction
* and it is possible that another event has already
* called the orphan method.  Check the consumer's flags
* so we do not schedule it twice.
*/
if (cp->flags & G_CF_ORPHAN)
continue;
cp->flags |= G_CF_ORPHAN;
cp->geom->orphan(cp);
}
}
pp->mediasize = size;
LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) {
gp = cp->geom;
if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL)
gp->resize(cp);
}
/*
* After resizing, the previously invalid GEOM class metadata
* might become valid. This means we should retaste.
*/
LIST_FOREACH(mp, &g_classes, class) {
if (mp->taste == NULL)
continue;
LIST_FOREACH(cp, &pp->consumers, consumers)
if (cp->geom->class == mp &&
(cp->flags & G_CF_ORPHAN) == 0)
break;
if (cp != NULL)
continue;
mp->taste(mp, pp, 0);
g_topology_assert();
}
}
void
g_resize_provider(struct g_provider *pp, off_t size)
{
struct g_hh00 *hh;
G_VALID_PROVIDER(pp);
if (pp->flags & G_PF_WITHER)
return;
if (size == pp->mediasize)
return;
hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO);
hh->pp = pp;
hh->size = size;
g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL);
}
#ifndef _PATH_DEV
#define _PATH_DEV "/dev/"
#endif
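/*
 * Look a provider up by name, accepting either a bare name or one
 * prefixed with "/dev/"; a withering match is only returned when no
 * healthy provider of that name exists.
 */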
struct g_provider *
g_provider_by_name(char const *arg)
{
struct g_class *cp;
struct g_geom *gp;
struct g_provider *pp, *wpp;
if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
arg += sizeof(_PATH_DEV) - 1;
wpp = NULL;
LIST_FOREACH(cp, &g_classes, class) {
LIST_FOREACH(gp, &cp->geom, geom) {
LIST_FOREACH(pp, &gp->provider, provider) {
if (strcmp(arg, pp->name) != 0)
continue;
if ((gp->flags & G_GEOM_WITHER) == 0 &&
(pp->flags & G_PF_WITHER) == 0)
return (pp);
else
wpp = pp;
}
}
}
return (wpp);
}
void
g_destroy_provider(struct g_provider *pp)
{
struct g_geom *gp;
g_topology_assert();
G_VALID_PROVIDER(pp);
KASSERT(LIST_EMPTY(&pp->consumers),
("g_destroy_provider but attached"));
KASSERT (pp->acr == 0, ("g_destroy_provider with acr"));
KASSERT (pp->acw == 0, ("g_destroy_provider with acw"));
KASSERT (pp->ace == 0, ("g_destroy_provider with ace"));
g_cancel_event(pp);
LIST_REMOVE(pp, provider);
gp = pp->geom;
devstat_remove_entry(pp->stat);
/*
* If a callback was provided, send notification that the provider
* is now gone.
*/
if (gp->providergone != NULL)
gp->providergone(pp);
g_free(pp);
if ((gp->flags & G_GEOM_WITHER))
g_do_wither();
}
/*
* We keep the "geoms" list sorted by topological order (== increasing
* numerical rank) at all times.
* When an attach is done, the attaching geom's rank is invalidated
* and it is moved to the tail of the list.
* All geoms later in the sequence have their ranks reevaluated in
* sequence.  If we cannot assign a rank to a geom because its
* prerequisites do not have rank, we move that element to the tail
* of the sequence with invalid rank as well.
* At some point we encounter our original geom and if we still fail
* to assign it a rank, there must be a loop and we return an error to
* g_attach(), which detaches again and calls redo_rank again
* to fix up the damage.
* It would be much simpler code-wise to do it recursively, but we
* can't risk that on the kernel stack.
*/
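/*
 * Illustrative sketch only: with a disk geom at rank 1 and a
 * partitioning geom consuming it at rank 2, a new geom attaching to one
 * of the partition providers is moved to the tail, re-ranked and ends
 * up at rank 3; a geom whose provider's geom still has rank 0 keeps
 * getting pushed to the tail until its prerequisites are ranked.
 */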
static int
redo_rank(struct g_geom *gp)
{
struct g_consumer *cp;
struct g_geom *gp1, *gp2;
int n, m;
g_topology_assert();
G_VALID_GEOM(gp);
/* Invalidate this geoms rank and move it to the tail */
gp1 = TAILQ_NEXT(gp, geoms);
if (gp1 != NULL) {
gp->rank = 0;
TAILQ_REMOVE(&geoms, gp, geoms);
TAILQ_INSERT_TAIL(&geoms, gp, geoms);
} else {
gp1 = gp;
}
/* re-rank the rest of the sequence */
for (; gp1 != NULL; gp1 = gp2) {
gp1->rank = 0;
m = 1;
LIST_FOREACH(cp, &gp1->consumer, consumer) {
if (cp->provider == NULL)
continue;
n = cp->provider->geom->rank;
if (n == 0) {
m = 0;
break;
} else if (n >= m)
m = n + 1;
}
gp1->rank = m;
gp2 = TAILQ_NEXT(gp1, geoms);
/* got a rank, moving on */
if (m != 0)
continue;
/* no rank to original geom means loop */
if (gp == gp1)
return (ELOOP);
/* no rank, put it at the end and move on */
TAILQ_REMOVE(&geoms, gp1, geoms);
TAILQ_INSERT_TAIL(&geoms, gp1, geoms);
}
return (0);
}
int
g_attach(struct g_consumer *cp, struct g_provider *pp)
{
int error;
g_topology_assert();
G_VALID_CONSUMER(cp);
G_VALID_PROVIDER(pp);
g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp);
KASSERT(cp->provider == NULL, ("attach but attached"));
cp->provider = pp;
cp->flags &= ~G_CF_ORPHAN;
LIST_INSERT_HEAD(&pp->consumers, cp, consumers);
error = redo_rank(cp->geom);
if (error) {
LIST_REMOVE(cp, consumers);
cp->provider = NULL;
redo_rank(cp->geom);
}
return (error);
}
void
g_detach(struct g_consumer *cp)
{
struct g_provider *pp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp);
KASSERT(cp->provider != NULL, ("detach but not attached"));
KASSERT(cp->acr == 0, ("detach but nonzero acr"));
KASSERT(cp->acw == 0, ("detach but nonzero acw"));
KASSERT(cp->ace == 0, ("detach but nonzero ace"));
KASSERT(cp->nstart == cp->nend,
("detach with active requests"));
pp = cp->provider;
LIST_REMOVE(cp, consumers);
cp->provider = NULL;
if ((cp->geom->flags & G_GEOM_WITHER) ||
(pp->geom->flags & G_GEOM_WITHER) ||
(pp->flags & G_PF_WITHER))
g_do_wither();
redo_rank(cp->geom);
}
/*
* g_access()
*
* Access-check with delta values. The question asked is "can consumer
* "cp" change its provider's access counters by the relative amounts dc[rwe] ?"
*/
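/*
 * Illustrative sketch only: g_access(cp, 1, 0, 0) asks to add one read
 * reference on the attached provider, and a later g_access(cp, -1, 0, 0)
 * drops it again; the checks below are made against the provider's
 * counts with this consumer's own contribution subtracted out.
 */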
int
g_access(struct g_consumer *cp, int dcr, int dcw, int dce)
{
struct g_provider *pp;
- int pr,pw,pe;
+ int pw, pe;
int error;
g_topology_assert();
G_VALID_CONSUMER(cp);
pp = cp->provider;
KASSERT(pp != NULL, ("access but not attached"));
G_VALID_PROVIDER(pp);
g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)",
cp, pp->name, dcr, dcw, dce);
KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr"));
KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw"));
KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace"));
KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request"));
KASSERT(pp->geom->access != NULL, ("NULL geom->access"));
/*
* If our class cares about being spoiled, and we have been, we
* are probably just ahead of the event telling us that. Fail
* now rather than having to unravel this later.
*/
if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) &&
(dcr > 0 || dcw > 0 || dce > 0))
return (ENXIO);
/*
* Figure out what counts the provider would have had, if this
* consumer had (r0w0e0) at this time.
*/
- pr = pp->acr - cp->acr;
pw = pp->acw - cp->acw;
pe = pp->ace - cp->ace;
g_trace(G_T_ACCESS,
"open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)",
dcr, dcw, dce,
cp->acr, cp->acw, cp->ace,
pp->acr, pp->acw, pp->ace,
pp, pp->name);
/* If foot-shooting is enabled, any open on rank#1 is OK */
if ((g_debugflags & 16) && pp->geom->rank == 1)
;
/* If we try exclusive but already write: fail */
else if (dce > 0 && pw > 0)
return (EPERM);
/* If we try write but already exclusive: fail */
else if (dcw > 0 && pe > 0)
return (EPERM);
/* If we try to open more but provider is error'ed: fail */
else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) {
printf("%s(%d): provider %s has error %d set\n",
__func__, __LINE__, pp->name, pp->error);
return (pp->error);
}
/* Ok then... */
error = pp->geom->access(pp, dcr, dcw, dce);
KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0,
("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed "
"closing ->access()", pp->geom->class->name, pp->name, dcr, dcw,
dce, error));
if (!error) {
/*
* If we open first write, spoil any partner consumers.
* If we close last write and provider is not errored,
* trigger re-taste.
*/
if (pp->acw == 0 && dcw != 0)
g_spoil(pp, cp);
else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 &&
!(pp->geom->flags & G_GEOM_WITHER))
g_post_event(g_new_provider_event, pp, M_WAITOK,
pp, NULL);
pp->acr += dcr;
pp->acw += dcw;
pp->ace += dce;
cp->acr += dcr;
cp->acw += dcw;
cp->ace += dce;
if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)
KASSERT(pp->sectorsize > 0,
("Provider %s lacks sectorsize", pp->name));
if ((cp->geom->flags & G_GEOM_WITHER) &&
cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
g_do_wither();
}
return (error);
}
int
g_handleattr_int(struct bio *bp, const char *attribute, int val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
{
return (g_handleattr(bp, attribute, &val, sizeof val));
}
int
g_handleattr_str(struct bio *bp, const char *attribute, const char *str)
{
return (g_handleattr(bp, attribute, str, 0));
}
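/*
 * A len of 0 means "val" is a NUL-terminated string to be copied into
 * the request buffer; any other len must match bio_length exactly.
 */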
int
g_handleattr(struct bio *bp, const char *attribute, const void *val, int len)
{
int error = 0;
if (strcmp(bp->bio_attribute, attribute))
return (0);
if (len == 0) {
bzero(bp->bio_data, bp->bio_length);
if (strlcpy(bp->bio_data, val, bp->bio_length) >=
bp->bio_length) {
printf("%s: %s bio_length %jd len %zu -> EFAULT\n",
__func__, bp->bio_to->name,
(intmax_t)bp->bio_length, strlen(val));
error = EFAULT;
}
} else if (bp->bio_length == len) {
bcopy(val, bp->bio_data, len);
} else {
printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__,
bp->bio_to->name, (intmax_t)bp->bio_length, len);
error = EFAULT;
}
if (error == 0)
bp->bio_completed = bp->bio_length;
g_io_deliver(bp, error);
return (1);
}
int
g_std_access(struct g_provider *pp,
int dr __unused, int dw __unused, int de __unused)
{
g_topology_assert();
G_VALID_PROVIDER(pp);
return (0);
}
void
g_std_done(struct bio *bp)
{
struct bio *bp2;
bp2 = bp->bio_parent;
if (bp2->bio_error == 0)
bp2->bio_error = bp->bio_error;
bp2->bio_completed += bp->bio_completed;
g_destroy_bio(bp);
bp2->bio_inbed++;
if (bp2->bio_children == bp2->bio_inbed)
g_io_deliver(bp2, bp2->bio_error);
}
/* XXX: maybe this is only g_slice_spoiled */
void
g_std_spoiled(struct g_consumer *cp)
{
struct g_geom *gp;
struct g_provider *pp;
g_topology_assert();
G_VALID_CONSUMER(cp);
g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp);
cp->flags |= G_CF_ORPHAN;
g_detach(cp);
gp = cp->geom;
LIST_FOREACH(pp, &gp->provider, provider)
g_orphan_provider(pp, ENXIO);
g_destroy_consumer(cp);
if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer))
g_destroy_geom(gp);
else
gp->flags |= G_GEOM_WITHER;
}
/*
* Spoiling happens when a provider is opened for writing, but consumers
* which are configured by in-band data are attached (slicers for instance).
* Since the write might potentially change the in-band data, such consumers
* need to re-evaluate their existence after the writing session closes.
* We do this by (offering to) tear them down when the open for write happens
* in return for a re-taste when it closes again.
* Together with the fact that such consumers grab an 'e' bit whenever they
* are open, regardless of mode, this ends up DTRT.
*/
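/*
 * Illustrative sketch only: a slicer geom configured from a partition
 * table sits on top of a disk provider; when that provider is opened
 * for writing, the slicer's consumer is marked spoiled and the slicer
 * offers to tear itself down, and when the write open is dropped the
 * re-taste rebuilds it from whatever (possibly changed) partition table
 * is now on the disk.
 */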
static void
g_spoil_event(void *arg, int flag)
{
struct g_provider *pp;
struct g_consumer *cp, *cp2;
g_topology_assert();
if (flag == EV_CANCEL)
return;
pp = arg;
G_VALID_PROVIDER(pp);
g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp,
pp->geom->class->name, pp->geom->name, pp->name);
for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) {
cp2 = LIST_NEXT(cp, consumers);
if ((cp->flags & G_CF_SPOILED) == 0)
continue;
cp->flags &= ~G_CF_SPOILED;
if (cp->geom->spoiled == NULL)
continue;
cp->geom->spoiled(cp);
g_topology_assert();
}
}
void
g_spoil(struct g_provider *pp, struct g_consumer *cp)
{
struct g_consumer *cp2;
g_topology_assert();
G_VALID_PROVIDER(pp);
G_VALID_CONSUMER(cp);
LIST_FOREACH(cp2, &pp->consumers, consumers) {
if (cp2 == cp)
continue;
/*
KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr));
KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw));
*/
KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace));
cp2->flags |= G_CF_SPOILED;
}
g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL);
}
static void
g_media_changed_event(void *arg, int flag)
{
struct g_provider *pp;
int retaste;
g_topology_assert();
if (flag == EV_CANCEL)
return;
pp = arg;
G_VALID_PROVIDER(pp);
/*
* If provider was not open for writing, queue retaste after spoiling.
* If it was, retaste will happen automatically on close.
*/
retaste = (pp->acw == 0 && pp->error == 0 &&
!(pp->geom->flags & G_GEOM_WITHER));
g_spoil_event(arg, flag);
if (retaste)
g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL);
}
int
g_media_changed(struct g_provider *pp, int flag)
{
struct g_consumer *cp;
LIST_FOREACH(cp, &pp->consumers, consumers)
cp->flags |= G_CF_SPOILED;
return (g_post_event(g_media_changed_event, pp, flag, pp, NULL));
}
int
g_media_gone(struct g_provider *pp, int flag)
{
struct g_consumer *cp;
LIST_FOREACH(cp, &pp->consumers, consumers)
cp->flags |= G_CF_SPOILED;
return (g_post_event(g_spoil_event, pp, flag, pp, NULL));
}
int
g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len)
{
int error, i;
i = len;
error = g_io_getattr(attr, cp, &i, var);
if (error)
return (error);
if (i != len)
return (EINVAL);
return (0);
}
static int
g_get_device_prefix_len(const char *name)
{
int len;
if (strncmp(name, "ada", 3) == 0)
len = 3;
else if (strncmp(name, "ad", 2) == 0)
len = 2;
else
return (0);
if (name[len] < '0' || name[len] > '9')
return (0);
do {
len++;
} while (name[len] >= '0' && name[len] <= '9');
return (len);
}
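/*
 * Illustrative sketch only: with the prefix handling above,
 * g_compare_names("ad0", "ada0") returns 1, so a legacy "ad" device
 * name is treated as naming the same device as its "ada" counterpart.
 */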
int
g_compare_names(const char *namea, const char *nameb)
{
int deva, devb;
if (strcmp(namea, nameb) == 0)
return (1);
deva = g_get_device_prefix_len(namea);
if (deva == 0)
return (0);
devb = g_get_device_prefix_len(nameb);
if (devb == 0)
return (0);
if (strcmp(namea + deva, nameb + devb) == 0)
return (1);
return (0);
}
void
g_geom_add_alias(struct g_geom *gp, const char *alias)
{
struct g_geom_alias *gap;
gap = (struct g_geom_alias *)g_malloc(
sizeof(struct g_geom_alias) + strlen(alias) + 1, M_WAITOK);
strcpy((char *)(gap + 1), alias);
gap->ga_alias = (const char *)(gap + 1);
LIST_INSERT_HEAD(&gp->aliases, gap, ga_next);
}
#if defined(DIAGNOSTIC) || defined(DDB)
/*
* This function walks the mesh and returns a non-zero integer if it
* finds that the argument pointer is a GEOM object. The return value
* indicates which type of object it is believed to be. If the topology
* is not locked, this function is potentially dangerous, but we don't
* assert that the topology lock is held when called from the debugger.
*/
int
g_valid_obj(void const *ptr)
{
struct g_class *mp;
struct g_geom *gp;
struct g_consumer *cp;
struct g_provider *pp;
#ifdef KDB
if (kdb_active == 0)
#endif
g_topology_assert();
LIST_FOREACH(mp, &g_classes, class) {
if (ptr == mp)
return (1);
LIST_FOREACH(gp, &mp->geom, geom) {
if (ptr == gp)
return (2);
LIST_FOREACH(cp, &gp->consumer, consumer)
if (ptr == cp)
return (3);
LIST_FOREACH(pp, &gp->provider, provider)
if (ptr == pp)
return (4);
}
}
return(0);
}
#endif
#ifdef DDB
#define gprintf(...) do { \
db_printf("%*s", indent, ""); \
db_printf(__VA_ARGS__); \
} while (0)
#define gprintln(...) do { \
gprintf(__VA_ARGS__); \
db_printf("\n"); \
} while (0)
#define ADDFLAG(obj, flag, sflag) do { \
if ((obj)->flags & (flag)) { \
if (comma) \
strlcat(str, ",", size); \
strlcat(str, (sflag), size); \
comma = 1; \
} \
} while (0)
static char *
provider_flags_to_string(struct g_provider *pp, char *str, size_t size)
{
int comma = 0;
bzero(str, size);
if (pp->flags == 0) {
strlcpy(str, "NONE", size);
return (str);
}
ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER");
ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN");
return (str);
}
static char *
geom_flags_to_string(struct g_geom *gp, char *str, size_t size)
{
int comma = 0;
bzero(str, size);
if (gp->flags == 0) {
strlcpy(str, "NONE", size);
return (str);
}
ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER");
return (str);
}
static void
db_show_geom_consumer(int indent, struct g_consumer *cp)
{
if (indent == 0) {
gprintln("consumer: %p", cp);
gprintln(" class: %s (%p)", cp->geom->class->name,
cp->geom->class);
gprintln(" geom: %s (%p)", cp->geom->name, cp->geom);
if (cp->provider == NULL)
gprintln(" provider: none");
else {
gprintln(" provider: %s (%p)", cp->provider->name,
cp->provider);
}
gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace);
gprintln(" flags: 0x%04x", cp->flags);
gprintln(" nstart: %u", cp->nstart);
gprintln(" nend: %u", cp->nend);
} else {
gprintf("consumer: %p (%s), access=r%dw%de%d", cp,
cp->provider != NULL ? cp->provider->name : "none",
cp->acr, cp->acw, cp->ace);
if (cp->flags)
db_printf(", flags=0x%04x", cp->flags);
db_printf("\n");
}
}
static void
db_show_geom_provider(int indent, struct g_provider *pp)
{
struct g_consumer *cp;
char flags[64];
if (indent == 0) {
gprintln("provider: %s (%p)", pp->name, pp);
gprintln(" class: %s (%p)", pp->geom->class->name,
pp->geom->class);
gprintln(" geom: %s (%p)", pp->geom->name, pp->geom);
gprintln(" mediasize: %jd", (intmax_t)pp->mediasize);
gprintln(" sectorsize: %u", pp->sectorsize);
gprintln(" stripesize: %u", pp->stripesize);
gprintln(" stripeoffset: %u", pp->stripeoffset);
gprintln(" access: r%dw%de%d", pp->acr, pp->acw,
pp->ace);
gprintln(" flags: %s (0x%04x)",
provider_flags_to_string(pp, flags, sizeof(flags)),
pp->flags);
gprintln(" error: %d", pp->error);
gprintln(" nstart: %u", pp->nstart);
gprintln(" nend: %u", pp->nend);
if (LIST_EMPTY(&pp->consumers))
gprintln(" consumers: none");
} else {
gprintf("provider: %s (%p), access=r%dw%de%d",
pp->name, pp, pp->acr, pp->acw, pp->ace);
if (pp->flags != 0) {
db_printf(", flags=%s (0x%04x)",
provider_flags_to_string(pp, flags, sizeof(flags)),
pp->flags);
}
db_printf("\n");
}
if (!LIST_EMPTY(&pp->consumers)) {
LIST_FOREACH(cp, &pp->consumers, consumers) {
db_show_geom_consumer(indent + 2, cp);
if (db_pager_quit)
break;
}
}
}
static void
db_show_geom_geom(int indent, struct g_geom *gp)
{
struct g_provider *pp;
struct g_consumer *cp;
char flags[64];
if (indent == 0) {
gprintln("geom: %s (%p)", gp->name, gp);
gprintln(" class: %s (%p)", gp->class->name, gp->class);
gprintln(" flags: %s (0x%04x)",
geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags);
gprintln(" rank: %d", gp->rank);
if (LIST_EMPTY(&gp->provider))
gprintln(" providers: none");
if (LIST_EMPTY(&gp->consumer))
gprintln(" consumers: none");
} else {
gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank);
if (gp->flags != 0) {
db_printf(", flags=%s (0x%04x)",
geom_flags_to_string(gp, flags, sizeof(flags)),
gp->flags);
}
db_printf("\n");
}
if (!LIST_EMPTY(&gp->provider)) {
LIST_FOREACH(pp, &gp->provider, provider) {
db_show_geom_provider(indent + 2, pp);
if (db_pager_quit)
break;
}
}
if (!LIST_EMPTY(&gp->consumer)) {
LIST_FOREACH(cp, &gp->consumer, consumer) {
db_show_geom_consumer(indent + 2, cp);
if (db_pager_quit)
break;
}
}
}
static void
db_show_geom_class(struct g_class *mp)
{
struct g_geom *gp;
db_printf("class: %s (%p)\n", mp->name, mp);
LIST_FOREACH(gp, &mp->geom, geom) {
db_show_geom_geom(2, gp);
if (db_pager_quit)
break;
}
}
/*
* Print the GEOM topology or the given object.
*/
DB_SHOW_COMMAND(geom, db_show_geom)
{
struct g_class *mp;
if (!have_addr) {
/* No address given, print the entire topology. */
LIST_FOREACH(mp, &g_classes, class) {
db_show_geom_class(mp);
db_printf("\n");
if (db_pager_quit)
break;
}
} else {
switch (g_valid_obj((void *)addr)) {
case 1:
db_show_geom_class((struct g_class *)addr);
break;
case 2:
db_show_geom_geom(0, (struct g_geom *)addr);
break;
case 3:
db_show_geom_consumer(0, (struct g_consumer *)addr);
break;
case 4:
db_show_geom_provider(0, (struct g_provider *)addr);
break;
default:
db_printf("Not a GEOM object.\n");
break;
}
}
}
static void
db_print_bio_cmd(struct bio *bp)
{
db_printf(" cmd: ");
switch (bp->bio_cmd) {
case BIO_READ: db_printf("BIO_READ"); break;
case BIO_WRITE: db_printf("BIO_WRITE"); break;
case BIO_DELETE: db_printf("BIO_DELETE"); break;
case BIO_GETATTR: db_printf("BIO_GETATTR"); break;
case BIO_FLUSH: db_printf("BIO_FLUSH"); break;
case BIO_CMD0: db_printf("BIO_CMD0"); break;
case BIO_CMD1: db_printf("BIO_CMD1"); break;
case BIO_CMD2: db_printf("BIO_CMD2"); break;
case BIO_ZONE: db_printf("BIO_ZONE"); break;
default: db_printf("UNKNOWN"); break;
}
db_printf("\n");
}
static void
db_print_bio_flags(struct bio *bp)
{
int comma;
comma = 0;
db_printf(" flags: ");
if (bp->bio_flags & BIO_ERROR) {
db_printf("BIO_ERROR");
comma = 1;
}
if (bp->bio_flags & BIO_DONE) {
db_printf("%sBIO_DONE", (comma ? ", " : ""));
comma = 1;
}
if (bp->bio_flags & BIO_ONQUEUE)
db_printf("%sBIO_ONQUEUE", (comma ? ", " : ""));
db_printf("\n");
}
/*
* Print useful information in a BIO
*/
DB_SHOW_COMMAND(bio, db_show_bio)
{
struct bio *bp;
if (have_addr) {
bp = (struct bio *)addr;
db_printf("BIO %p\n", bp);
db_print_bio_cmd(bp);
db_print_bio_flags(bp);
db_printf(" cflags: 0x%hx\n", bp->bio_cflags);
db_printf(" pflags: 0x%hx\n", bp->bio_pflags);
db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset);
db_printf(" length: %jd\n", (intmax_t)bp->bio_length);
db_printf(" bcount: %ld\n", bp->bio_bcount);
db_printf(" resid: %ld\n", bp->bio_resid);
db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed);
db_printf(" children: %u\n", bp->bio_children);
db_printf(" inbed: %u\n", bp->bio_inbed);
db_printf(" error: %d\n", bp->bio_error);
db_printf(" parent: %p\n", bp->bio_parent);
db_printf(" driver1: %p\n", bp->bio_driver1);
db_printf(" driver2: %p\n", bp->bio_driver2);
db_printf(" caller1: %p\n", bp->bio_caller1);
db_printf(" caller2: %p\n", bp->bio_caller2);
db_printf(" bio_from: %p\n", bp->bio_from);
db_printf(" bio_to: %p\n", bp->bio_to);
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
db_printf(" bio_track_bp: %p\n", bp->bio_track_bp);
#endif
}
}
#undef gprintf
#undef gprintln
#undef ADDFLAG
#endif /* DDB */
Index: head/sys/geom/raid/g_raid.c
===================================================================
--- head/sys/geom/raid/g_raid.c (revision 327172)
+++ head/sys/geom/raid/g_raid.c (revision 327173)
@@ -1,2577 +1,2575 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid/g_raid.h>
#include "g_raid_md_if.h"
#include "g_raid_tr_if.h"
static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
int g_raid_enable = 1;
SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
&g_raid_enable, 0, "Enable on-disk metadata taste");
u_int g_raid_aggressive_spare = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
&g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
u_int g_raid_debug = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
"Debug level");
int g_raid_read_err_thresh = 10;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
&g_raid_read_err_thresh, 0,
"Number of read errors equated to disk failure");
u_int g_raid_start_timeout = 30;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
&g_raid_start_timeout, 0,
"Time to wait for all array components");
static u_int g_raid_clean_time = 5;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
&g_raid_clean_time, 0, "Mark volume as clean when idling");
static u_int g_raid_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
&g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid_name_format = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
&g_raid_name_format, 0, "Providers name format.");
static u_int g_raid_idle_threshold = 1000000;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
&g_raid_idle_threshold, 1000000,
"Time in microseconds to consider a volume idle.");
#define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \
G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \
rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \
G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \
} while (0)
LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
LIST_HEAD_INITIALIZER(g_raid_md_classes);
LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
LIST_HEAD_INITIALIZER(g_raid_tr_classes);
LIST_HEAD(, g_raid_volume) g_raid_volumes =
LIST_HEAD_INITIALIZER(g_raid_volumes);
static eventhandler_tag g_raid_post_sync = NULL;
static int g_raid_started = 0;
static int g_raid_shutdown = 0;
static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);
struct g_class g_raid_class = {
.name = G_RAID_CLASS_NAME,
.version = G_VERSION,
.ctlreq = g_raid_ctl,
.taste = g_raid_taste,
.destroy_geom = g_raid_destroy_geom,
.init = g_raid_init,
.fini = g_raid_fini
};
static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);
static const char *
g_raid_node_event2str(int event)
{
switch (event) {
case G_RAID_NODE_E_WAKE:
return ("WAKE");
case G_RAID_NODE_E_START:
return ("START");
default:
return ("INVALID");
}
}
const char *
g_raid_disk_state2str(int state)
{
switch (state) {
case G_RAID_DISK_S_NONE:
return ("NONE");
case G_RAID_DISK_S_OFFLINE:
return ("OFFLINE");
case G_RAID_DISK_S_DISABLED:
return ("DISABLED");
case G_RAID_DISK_S_FAILED:
return ("FAILED");
case G_RAID_DISK_S_STALE_FAILED:
return ("STALE_FAILED");
case G_RAID_DISK_S_SPARE:
return ("SPARE");
case G_RAID_DISK_S_STALE:
return ("STALE");
case G_RAID_DISK_S_ACTIVE:
return ("ACTIVE");
default:
return ("INVALID");
}
}
static const char *
g_raid_disk_event2str(int event)
{
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
return ("DISCONNECTED");
default:
return ("INVALID");
}
}
const char *
g_raid_subdisk_state2str(int state)
{
switch (state) {
case G_RAID_SUBDISK_S_NONE:
return ("NONE");
case G_RAID_SUBDISK_S_FAILED:
return ("FAILED");
case G_RAID_SUBDISK_S_NEW:
return ("NEW");
case G_RAID_SUBDISK_S_REBUILD:
return ("REBUILD");
case G_RAID_SUBDISK_S_UNINITIALIZED:
return ("UNINITIALIZED");
case G_RAID_SUBDISK_S_STALE:
return ("STALE");
case G_RAID_SUBDISK_S_RESYNC:
return ("RESYNC");
case G_RAID_SUBDISK_S_ACTIVE:
return ("ACTIVE");
default:
return ("INVALID");
}
}
static const char *
g_raid_subdisk_event2str(int event)
{
switch (event) {
case G_RAID_SUBDISK_E_NEW:
return ("NEW");
case G_RAID_SUBDISK_E_FAILED:
return ("FAILED");
case G_RAID_SUBDISK_E_DISCONNECTED:
return ("DISCONNECTED");
default:
return ("INVALID");
}
}
const char *
g_raid_volume_state2str(int state)
{
switch (state) {
case G_RAID_VOLUME_S_STARTING:
return ("STARTING");
case G_RAID_VOLUME_S_BROKEN:
return ("BROKEN");
case G_RAID_VOLUME_S_DEGRADED:
return ("DEGRADED");
case G_RAID_VOLUME_S_SUBOPTIMAL:
return ("SUBOPTIMAL");
case G_RAID_VOLUME_S_OPTIMAL:
return ("OPTIMAL");
case G_RAID_VOLUME_S_UNSUPPORTED:
return ("UNSUPPORTED");
case G_RAID_VOLUME_S_STOPPED:
return ("STOPPED");
default:
return ("INVALID");
}
}
static const char *
g_raid_volume_event2str(int event)
{
switch (event) {
case G_RAID_VOLUME_E_UP:
return ("UP");
case G_RAID_VOLUME_E_DOWN:
return ("DOWN");
case G_RAID_VOLUME_E_START:
return ("START");
case G_RAID_VOLUME_E_STARTMD:
return ("STARTMD");
default:
return ("INVALID");
}
}
const char *
g_raid_volume_level2str(int level, int qual)
{
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
return ("RAID0");
case G_RAID_VOLUME_RL_RAID1:
return ("RAID1");
case G_RAID_VOLUME_RL_RAID3:
if (qual == G_RAID_VOLUME_RLQ_R3P0)
return ("RAID3-P0");
if (qual == G_RAID_VOLUME_RLQ_R3PN)
return ("RAID3-PN");
return ("RAID3");
case G_RAID_VOLUME_RL_RAID4:
if (qual == G_RAID_VOLUME_RLQ_R4P0)
return ("RAID4-P0");
if (qual == G_RAID_VOLUME_RLQ_R4PN)
return ("RAID4-PN");
return ("RAID4");
case G_RAID_VOLUME_RL_RAID5:
if (qual == G_RAID_VOLUME_RLQ_R5RA)
return ("RAID5-RA");
if (qual == G_RAID_VOLUME_RLQ_R5RS)
return ("RAID5-RS");
if (qual == G_RAID_VOLUME_RLQ_R5LA)
return ("RAID5-LA");
if (qual == G_RAID_VOLUME_RLQ_R5LS)
return ("RAID5-LS");
return ("RAID5");
case G_RAID_VOLUME_RL_RAID6:
if (qual == G_RAID_VOLUME_RLQ_R6RA)
return ("RAID6-RA");
if (qual == G_RAID_VOLUME_RLQ_R6RS)
return ("RAID6-RS");
if (qual == G_RAID_VOLUME_RLQ_R6LA)
return ("RAID6-LA");
if (qual == G_RAID_VOLUME_RLQ_R6LS)
return ("RAID6-LS");
return ("RAID6");
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
return ("RAIDMDF-RA");
if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
return ("RAIDMDF-RS");
if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
return ("RAIDMDF-LA");
if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
return ("RAIDMDF-LS");
return ("RAIDMDF");
case G_RAID_VOLUME_RL_RAID1E:
if (qual == G_RAID_VOLUME_RLQ_R1EA)
return ("RAID1E-A");
if (qual == G_RAID_VOLUME_RLQ_R1EO)
return ("RAID1E-O");
return ("RAID1E");
case G_RAID_VOLUME_RL_SINGLE:
return ("SINGLE");
case G_RAID_VOLUME_RL_CONCAT:
return ("CONCAT");
case G_RAID_VOLUME_RL_RAID5E:
if (qual == G_RAID_VOLUME_RLQ_R5ERA)
return ("RAID5E-RA");
if (qual == G_RAID_VOLUME_RLQ_R5ERS)
return ("RAID5E-RS");
if (qual == G_RAID_VOLUME_RLQ_R5ELA)
return ("RAID5E-LA");
if (qual == G_RAID_VOLUME_RLQ_R5ELS)
return ("RAID5E-LS");
return ("RAID5E");
case G_RAID_VOLUME_RL_RAID5EE:
if (qual == G_RAID_VOLUME_RLQ_R5EERA)
return ("RAID5EE-RA");
if (qual == G_RAID_VOLUME_RLQ_R5EERS)
return ("RAID5EE-RS");
if (qual == G_RAID_VOLUME_RLQ_R5EELA)
return ("RAID5EE-LA");
if (qual == G_RAID_VOLUME_RLQ_R5EELS)
return ("RAID5EE-LS");
return ("RAID5EE");
case G_RAID_VOLUME_RL_RAID5R:
if (qual == G_RAID_VOLUME_RLQ_R5RRA)
return ("RAID5R-RA");
if (qual == G_RAID_VOLUME_RLQ_R5RRS)
return ("RAID5R-RS");
if (qual == G_RAID_VOLUME_RLQ_R5RLA)
return ("RAID5R-LA");
if (qual == G_RAID_VOLUME_RLQ_R5RLS)
return ("RAID5R-LS");
return ("RAID5E");
default:
return ("UNKNOWN");
}
}
int
g_raid_volume_str2level(const char *str, int *level, int *qual)
{
*level = G_RAID_VOLUME_RL_UNKNOWN;
*qual = G_RAID_VOLUME_RLQ_NONE;
if (strcasecmp(str, "RAID0") == 0)
*level = G_RAID_VOLUME_RL_RAID0;
else if (strcasecmp(str, "RAID1") == 0)
*level = G_RAID_VOLUME_RL_RAID1;
else if (strcasecmp(str, "RAID3-P0") == 0) {
*level = G_RAID_VOLUME_RL_RAID3;
*qual = G_RAID_VOLUME_RLQ_R3P0;
} else if (strcasecmp(str, "RAID3-PN") == 0 ||
strcasecmp(str, "RAID3") == 0) {
*level = G_RAID_VOLUME_RL_RAID3;
*qual = G_RAID_VOLUME_RLQ_R3PN;
} else if (strcasecmp(str, "RAID4-P0") == 0) {
*level = G_RAID_VOLUME_RL_RAID4;
*qual = G_RAID_VOLUME_RLQ_R4P0;
} else if (strcasecmp(str, "RAID4-PN") == 0 ||
strcasecmp(str, "RAID4") == 0) {
*level = G_RAID_VOLUME_RL_RAID4;
*qual = G_RAID_VOLUME_RLQ_R4PN;
} else if (strcasecmp(str, "RAID5-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5RA;
} else if (strcasecmp(str, "RAID5-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5RS;
} else if (strcasecmp(str, "RAID5") == 0 ||
strcasecmp(str, "RAID5-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5LA;
} else if (strcasecmp(str, "RAID5-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5;
*qual = G_RAID_VOLUME_RLQ_R5LS;
} else if (strcasecmp(str, "RAID6-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6RA;
} else if (strcasecmp(str, "RAID6-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6RS;
} else if (strcasecmp(str, "RAID6") == 0 ||
strcasecmp(str, "RAID6-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6LA;
} else if (strcasecmp(str, "RAID6-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID6;
*qual = G_RAID_VOLUME_RLQ_R6LS;
} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFRA;
} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFRS;
} else if (strcasecmp(str, "RAIDMDF") == 0 ||
strcasecmp(str, "RAIDMDF-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFLA;
} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAIDMDF;
*qual = G_RAID_VOLUME_RLQ_RMDFLS;
} else if (strcasecmp(str, "RAID10") == 0 ||
strcasecmp(str, "RAID1E") == 0 ||
strcasecmp(str, "RAID1E-A") == 0) {
*level = G_RAID_VOLUME_RL_RAID1E;
*qual = G_RAID_VOLUME_RLQ_R1EA;
} else if (strcasecmp(str, "RAID1E-O") == 0) {
*level = G_RAID_VOLUME_RL_RAID1E;
*qual = G_RAID_VOLUME_RLQ_R1EO;
} else if (strcasecmp(str, "SINGLE") == 0)
*level = G_RAID_VOLUME_RL_SINGLE;
else if (strcasecmp(str, "CONCAT") == 0)
*level = G_RAID_VOLUME_RL_CONCAT;
else if (strcasecmp(str, "RAID5E-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ERA;
} else if (strcasecmp(str, "RAID5E-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ERS;
} else if (strcasecmp(str, "RAID5E") == 0 ||
strcasecmp(str, "RAID5E-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ELA;
} else if (strcasecmp(str, "RAID5E-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5E;
*qual = G_RAID_VOLUME_RLQ_R5ELS;
} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EERA;
} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EERS;
} else if (strcasecmp(str, "RAID5EE") == 0 ||
strcasecmp(str, "RAID5EE-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EELA;
} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5EE;
*qual = G_RAID_VOLUME_RLQ_R5EELS;
} else if (strcasecmp(str, "RAID5R-RA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RRA;
} else if (strcasecmp(str, "RAID5R-RS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RRS;
} else if (strcasecmp(str, "RAID5R") == 0 ||
strcasecmp(str, "RAID5R-LA") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RLA;
} else if (strcasecmp(str, "RAID5R-LS") == 0) {
*level = G_RAID_VOLUME_RL_RAID5R;
*qual = G_RAID_VOLUME_RLQ_R5RLS;
} else
return (-1);
return (0);
}
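/*
 * Note that, as the parser above shows, a bare level name implies a default
 * qualifier: "RAID3" and "RAID4" map to the -PN variants, "RAID5", "RAID6",
 * "RAIDMDF", "RAID5E", "RAID5EE" and "RAID5R" map to their left-asymmetric
 * (-LA) variants, and "RAID10" is accepted as an alias for "RAID1E-A".
 */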
const char *
g_raid_get_diskname(struct g_raid_disk *disk)
{
if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
return ("[unknown]");
return (disk->d_consumer->provider->name);
}
void
g_raid_get_disk_info(struct g_raid_disk *disk)
{
struct g_consumer *cp = disk->d_consumer;
int error, len;
/* Read kernel dumping information. */
disk->d_kd.offset = 0;
disk->d_kd.length = OFF_MAX;
len = sizeof(disk->d_kd);
error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
if (error)
disk->d_kd.di.dumper = NULL;
if (disk->d_kd.di.dumper == NULL)
G_RAID_DEBUG1(2, disk->d_softc,
"Dumping not supported by %s: %d.",
cp->provider->name, error);
/* Read BIO_DELETE support. */
error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
if (error)
disk->d_candelete = 0;
if (!disk->d_candelete)
G_RAID_DEBUG1(2, disk->d_softc,
"BIO_DELETE not supported by %s: %d.",
cp->provider->name, error);
}
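/*
 * Both probes above are plain GEOM attribute requests: GEOM::kerneldump
 * asks the underlying provider for a kernel dump routine, and
 * GEOM::candelete asks whether it supports BIO_DELETE.  Either request may
 * fail, in which case the corresponding feature is simply treated as
 * unavailable for this disk.
 */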
void
g_raid_report_disk_state(struct g_raid_disk *disk)
{
struct g_raid_subdisk *sd;
int len, state;
uint32_t s;
if (disk->d_consumer == NULL)
return;
if (disk->d_state == G_RAID_DISK_S_DISABLED) {
s = G_STATE_ACTIVE; /* XXX */
} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
s = G_STATE_FAILED;
} else {
state = G_RAID_SUBDISK_S_ACTIVE;
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
if (sd->sd_state < state)
state = sd->sd_state;
}
if (state == G_RAID_SUBDISK_S_FAILED)
s = G_STATE_FAILED;
else if (state == G_RAID_SUBDISK_S_NEW ||
state == G_RAID_SUBDISK_S_REBUILD)
s = G_STATE_REBUILD;
else if (state == G_RAID_SUBDISK_S_STALE ||
state == G_RAID_SUBDISK_S_RESYNC)
s = G_STATE_RESYNC;
else
s = G_STATE_ACTIVE;
}
len = sizeof(s);
g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
g_raid_get_diskname(disk), s);
}
void
g_raid_change_disk_state(struct g_raid_disk *disk, int state)
{
G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
g_raid_get_diskname(disk),
g_raid_disk_state2str(disk->d_state),
g_raid_disk_state2str(state));
disk->d_state = state;
g_raid_report_disk_state(disk);
}
void
g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
{
G_RAID_DEBUG1(0, sd->sd_softc,
"Subdisk %s:%d-%s state changed from %s to %s.",
sd->sd_volume->v_name, sd->sd_pos,
sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
g_raid_subdisk_state2str(sd->sd_state),
g_raid_subdisk_state2str(state));
sd->sd_state = state;
if (sd->sd_disk)
g_raid_report_disk_state(sd->sd_disk);
}
void
g_raid_change_volume_state(struct g_raid_volume *vol, int state)
{
G_RAID_DEBUG1(0, vol->v_softc,
"Volume %s state changed from %s to %s.",
vol->v_name,
g_raid_volume_state2str(vol->v_state),
g_raid_volume_state2str(state));
vol->v_state = state;
}
/*
* --- Events handling functions ---
* Events in geom_raid are used to maintain the status of subdisks and
* volumes from a single thread, which simplifies locking.
*/
static void
g_raid_event_free(struct g_raid_event *ep)
{
free(ep, M_RAID);
}
int
g_raid_event_send(void *arg, int event, int flags)
{
struct g_raid_softc *sc;
struct g_raid_event *ep;
int error;
if ((flags & G_RAID_EVENT_VOLUME) != 0) {
sc = ((struct g_raid_volume *)arg)->v_softc;
} else if ((flags & G_RAID_EVENT_DISK) != 0) {
sc = ((struct g_raid_disk *)arg)->d_softc;
} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
sc = ((struct g_raid_subdisk *)arg)->sd_softc;
} else {
sc = arg;
}
ep = malloc(sizeof(*ep), M_RAID,
sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->e_tgt = arg;
ep->e_event = event;
ep->e_flags = flags;
ep->e_error = 0;
G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_queue_mtx);
wakeup(sc);
if ((flags & G_RAID_EVENT_WAIT) == 0)
return (0);
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
sx_xunlock(&sc->sc_lock);
while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
mtx_lock(&sc->sc_queue_mtx);
MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
hz * 5);
}
error = ep->e_error;
g_raid_event_free(ep);
sx_xlock(&sc->sc_lock);
return (error);
}
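/*
 * Typical asynchronous usage, as seen elsewhere in this file, looks like:
 *
 *	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
 *	    G_RAID_EVENT_DISK);
 *
 * Adding G_RAID_EVENT_WAIT makes the call synchronous: the caller must hold
 * sc_lock exclusively, the lock is dropped while sleeping, and the handler's
 * error code (or ECANCELED if the event was cancelled) is returned.
 */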
static void
g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
{
struct g_raid_event *ep, *tmpep;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
if (ep->e_tgt != tgt)
continue;
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
g_raid_event_free(ep);
else {
ep->e_error = ECANCELED;
wakeup(ep);
}
}
mtx_unlock(&sc->sc_queue_mtx);
}
static int
g_raid_event_check(struct g_raid_softc *sc, void *tgt)
{
struct g_raid_event *ep;
int res = 0;
sx_assert(&sc->sc_lock, SX_XLOCKED);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
if (ep->e_tgt != tgt)
continue;
res = 1;
break;
}
mtx_unlock(&sc->sc_queue_mtx);
return (res);
}
/*
* Return the number of disks in the given state.
* If state is equal to -1, count all connected disks.
*/
u_int
g_raid_ndisks(struct g_raid_softc *sc, int state)
{
struct g_raid_disk *disk;
u_int n;
sx_assert(&sc->sc_lock, SX_LOCKED);
n = 0;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_state == state || state == -1)
n++;
}
return (n);
}
/*
* Return the number of subdisks in the given state.
* If state is equal to -1, count all connected subdisks.
*/
u_int
g_raid_nsubdisks(struct g_raid_volume *vol, int state)
{
struct g_raid_subdisk *subdisk;
struct g_raid_softc *sc;
u_int i, n;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
n = 0;
for (i = 0; i < vol->v_disks_count; i++) {
subdisk = &vol->v_subdisks[i];
if ((state == -1 &&
subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
subdisk->sd_state == state)
n++;
}
return (n);
}
/*
* Return the first subdisk in the given state.
* If state is equal to -1, return the first connected subdisk.
*/
struct g_raid_subdisk *
g_raid_get_subdisk(struct g_raid_volume *vol, int state)
{
struct g_raid_subdisk *sd;
struct g_raid_softc *sc;
u_int i;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if ((state == -1 &&
sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
sd->sd_state == state)
return (sd);
}
return (NULL);
}
struct g_consumer *
g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
{
struct g_consumer *cp;
struct g_provider *pp;
g_topology_assert();
if (strncmp(name, "/dev/", 5) == 0)
name += 5;
pp = g_provider_by_name(name);
if (pp == NULL)
return (NULL);
cp = g_new_consumer(sc->sc_geom);
cp->flags |= G_CF_DIRECT_RECEIVE;
if (g_attach(cp, pp) != 0) {
g_destroy_consumer(cp);
return (NULL);
}
if (g_access(cp, 1, 1, 1) != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (NULL);
}
return (cp);
}
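/*
 * The g_access(cp, 1, 1, 1) above opens the provider for read, write and
 * exclusive access in one step; g_raid_kill_consumer() later drops whatever
 * access counts remain before detaching and destroying the consumer.
 */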
static u_int
g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
{
struct bio *bp;
u_int nreqs = 0;
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_from == cp)
nreqs++;
}
mtx_unlock(&sc->sc_queue_mtx);
return (nreqs);
}
u_int
g_raid_nopens(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
u_int opens;
opens = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_provider_open != 0)
opens++;
}
return (opens);
}
static int
g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
{
if (cp->index > 0) {
G_RAID_DEBUG1(2, sc,
"I/O requests for %s exist, can't destroy it now.",
cp->provider->name);
return (1);
}
if (g_raid_nrequests(sc, cp) > 0) {
G_RAID_DEBUG1(2, sc,
"I/O requests for %s in queue, can't destroy it now.",
cp->provider->name);
return (1);
}
return (0);
}
static void
g_raid_destroy_consumer(void *arg, int flags __unused)
{
struct g_consumer *cp;
g_topology_assert();
cp = arg;
G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
g_detach(cp);
g_destroy_consumer(cp);
}
void
g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
{
struct g_provider *pp;
int retaste_wait;
g_topology_assert_not();
g_topology_lock();
cp->private = NULL;
if (g_raid_consumer_is_busy(sc, cp))
goto out;
pp = cp->provider;
retaste_wait = 0;
if (cp->acw == 1) {
if ((pp->geom->flags & G_GEOM_WITHER) == 0)
retaste_wait = 1;
}
if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
if (retaste_wait) {
/*
* After the retaste event has been sent (inside g_access()), we can
* post an event to detach and destroy the consumer.
* A class that still has a consumer attached to the given provider
* will not receive a retaste event for that provider.
* This is how retaste events are ignored when closing consumers
* opened for write: the consumer is detached and destroyed after
* the retaste event has been sent.
*/
g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
goto out;
}
G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
g_detach(cp);
g_destroy_consumer(cp);
out:
g_topology_unlock();
}
static void
g_raid_orphan(struct g_consumer *cp)
{
struct g_raid_disk *disk;
g_topology_assert();
disk = cp->private;
if (disk == NULL)
return;
g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
G_RAID_EVENT_DISK);
}
static void
g_raid_clean(struct g_raid_volume *vol, int acw)
{
struct g_raid_softc *sc;
int timeout;
sc = vol->v_softc;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
// return;
if (!vol->v_dirty)
return;
if (vol->v_writes > 0)
return;
if (acw > 0 || (acw == -1 &&
vol->v_provider != NULL && vol->v_provider->acw > 0)) {
timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
if (!g_raid_shutdown && timeout > 0)
return;
}
vol->v_dirty = 0;
G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
vol->v_name);
g_raid_write_metadata(sc, vol, NULL, NULL);
}
static void
g_raid_dirty(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
sc = vol->v_softc;
g_topology_assert_not();
sx_assert(&sc->sc_lock, SX_XLOCKED);
// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
// return;
vol->v_dirty = 1;
G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
vol->v_name);
g_raid_write_metadata(sc, vol, NULL, NULL);
}
void
g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
{
- struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
int i;
vol = tr->tro_volume;
- sc = vol->v_softc;
/*
* Allocate all bios before sending any request, so we can return
* ENOMEM in a nice and clean way.
*/
bioq_init(&queue);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_caller1 = sd;
bioq_insert_tail(&queue, cbp);
}
while ((cbp = bioq_takefirst(&queue)) != NULL) {
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
while ((cbp = bioq_takefirst(&queue)) != NULL)
g_destroy_bio(cbp);
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
static void
g_raid_tr_kerneldump_common_done(struct bio *bp)
{
bp->bio_flags |= BIO_DONE;
}
int
g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct bio bp;
vol = tr->tro_volume;
sc = vol->v_softc;
g_reset_bio(&bp);
bp.bio_cmd = BIO_WRITE;
bp.bio_done = g_raid_tr_kerneldump_common_done;
bp.bio_attribute = NULL;
bp.bio_offset = offset;
bp.bio_length = length;
bp.bio_data = virtual;
bp.bio_to = vol->v_provider;
g_raid_start(&bp);
while (!(bp.bio_flags & BIO_DONE)) {
G_RAID_DEBUG1(4, sc, "Poll...");
g_raid_poll(sc);
DELAY(10);
}
return (bp.bio_error != 0 ? EIO : 0);
}
static int
g_raid_dump(void *arg,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
struct g_raid_volume *vol;
int error;
vol = (struct g_raid_volume *)arg;
G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
(long long unsigned)offset, (long long unsigned)length);
error = G_RAID_TR_KERNELDUMP(vol->v_tr,
virtual, physical, offset, length);
return (error);
}
static void
g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
{
struct g_kerneldump *gkd;
struct g_provider *pp;
struct g_raid_volume *vol;
gkd = (struct g_kerneldump*)bp->bio_data;
pp = bp->bio_to;
vol = pp->private;
g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
gkd->di.dumper = g_raid_dump;
gkd->di.priv = vol;
gkd->di.blocksize = vol->v_sectorsize;
gkd->di.maxiosize = DFLTPHYS;
gkd->di.mediaoffset = gkd->offset;
if ((gkd->offset + gkd->length) > vol->v_mediasize)
gkd->length = vol->v_mediasize - gkd->offset;
gkd->di.mediasize = gkd->length;
g_io_deliver(bp, 0);
}
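/*
 * The dumper registered here (g_raid_dump) feeds dump writes back through
 * the regular g_raid_start() path.  Because interrupts cannot be relied on
 * while dumping, g_raid_tr_kerneldump_common() polls the node with
 * g_raid_poll() until the bio is marked done, and g_raid_subdisk_iostart()
 * short-circuits to g_raid_subdisk_kerneldump() when "dumping" is set.
 */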
static void
g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
{
struct g_provider *pp;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
int *val;
int i;
val = (int *)bp->bio_data;
pp = bp->bio_to;
vol = pp->private;
*val = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
continue;
if (sd->sd_disk->d_candelete) {
*val = 1;
break;
}
}
g_io_deliver(bp, 0);
}
static void
g_raid_start(struct bio *bp)
{
struct g_raid_softc *sc;
sc = bp->bio_to->geom->softc;
/*
* If sc == NULL or there are no valid disks, provider's error
* should be set and g_raid_start() should not be called at all.
*/
// KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
// ("Provider's error should be set (error=%d)(mirror=%s).",
// bp->bio_to->error, bp->bio_to->name));
G_RAID_LOGREQ(3, bp, "Request received.");
switch (bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
case BIO_FLUSH:
break;
case BIO_GETATTR:
if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
g_raid_candelete(sc, bp);
else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
g_raid_kerneldump(sc, bp);
else
g_io_deliver(bp, EOPNOTSUPP);
return;
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
if (!dumping) {
G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
wakeup(sc);
}
}
static int
g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
{
/*
* 5 cases:
* (1) bp entirely below NO
* (2) bp entirely above NO
* (3) bp start below, but end in range YES
* (4) bp entirely within YES
* (5) bp starts within, ends above YES
*
* lock range 10-19 (offset 10 length 10)
* (1) 1-5: first if kicks it out
* (2) 30-35: second if kicks it out
* (3) 5-15: passes both ifs
* (4) 12-14: passes both ifs
* (5) 19-20: passes both
*/
off_t lend = lstart + len - 1;
off_t bstart = bp->bio_offset;
off_t bend = bp->bio_offset + bp->bio_length - 1;
if (bend < lstart)
return (0);
if (lend < bstart)
return (0);
return (1);
}
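/*
 * Worked example for the lock range 10-19 above: a bio with bio_offset 5
 * and bio_length 10 covers 5-14, so bend = 14 >= lstart = 10 and
 * lend = 19 >= bstart = 5, and the function returns 1 (case 3).  A bio
 * covering 1-5 is rejected by the first test (bend = 5 < lstart = 10),
 * and one covering 30-35 by the second (lend = 19 < bstart = 30).
 */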
static int
g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
{
struct g_raid_lock *lp;
sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
return (1);
}
return (0);
}
static void
g_raid_start_request(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = bp->bio_to->geom->softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
vol = bp->bio_to->private;
/*
* Check to see if this item is in a locked range. If so,
* queue it to our locked queue and return. We'll requeue
* it when the range is unlocked. Internal I/O for the
* rebuild/rescan/recovery process is excluded from this
* check so we can actually do the recovery.
*/
if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
g_raid_is_in_locked_range(vol, bp)) {
G_RAID_LOGREQ(3, bp, "Defer request.");
bioq_insert_tail(&vol->v_locked, bp);
return;
}
/*
* If we're actually going to do the write/delete, then
* update the idle stats for the volume.
*/
if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
if (!vol->v_dirty)
g_raid_dirty(vol);
vol->v_writes++;
}
/*
* Put the request onto the in-flight queue, so that new synchronization
* requests can be checked for collisions against it. Then tell
* the transformation layer to start the I/O.
*/
bioq_insert_tail(&vol->v_inflight, bp);
G_RAID_LOGREQ(4, bp, "Request started");
G_RAID_TR_IOSTART(vol->v_tr, bp);
}
static void
g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
{
off_t off, len;
struct bio *nbp;
struct g_raid_lock *lp;
vol->v_pending_lock = 0;
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (lp->l_pending) {
off = lp->l_offset;
len = lp->l_length;
lp->l_pending = 0;
TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
if (g_raid_bio_overlaps(nbp, off, len))
lp->l_pending++;
}
if (lp->l_pending) {
vol->v_pending_lock = 1;
G_RAID_DEBUG1(4, vol->v_softc,
"Deferred lock(%jd, %jd) has %d pending",
(intmax_t)off, (intmax_t)(off + len),
lp->l_pending);
continue;
}
G_RAID_DEBUG1(4, vol->v_softc,
"Deferred lock of %jd to %jd completed",
(intmax_t)off, (intmax_t)(off + len));
G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
}
}
}
void
g_raid_iodone(struct bio *bp, int error)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = bp->bio_to->geom->softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
vol = bp->bio_to->private;
G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
/* Update stats if we completed a write/delete. */
if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
vol->v_writes--;
vol->v_last_write = time_uptime;
}
bioq_remove(&vol->v_inflight, bp);
if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
g_raid_finish_with_locked_ranges(vol, bp);
getmicrouptime(&vol->v_last_done);
g_io_deliver(bp, error);
}
int
g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
struct bio *ignore, void *argp)
{
struct g_raid_softc *sc;
struct g_raid_lock *lp;
struct bio *bp;
sc = vol->v_softc;
lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
lp->l_offset = off;
lp->l_length = len;
lp->l_callback_arg = argp;
lp->l_pending = 0;
TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
lp->l_pending++;
}
/*
* If there are any writes that are pending, we return EBUSY. All
* callers will have to wait until all pending writes clear.
*/
if (lp->l_pending > 0) {
vol->v_pending_lock = 1;
G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
(intmax_t)off, (intmax_t)(off+len), lp->l_pending);
return (EBUSY);
}
G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
(intmax_t)off, (intmax_t)(off+len));
G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
return (0);
}
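/*
 * Note that EBUSY from g_raid_lock_range() is not a failure: the lock stays
 * queued on v_locks, and G_RAID_TR_LOCKED() is invoked later from
 * g_raid_finish_with_locked_ranges() once the overlapping in-flight
 * requests have drained.  Only when there is no conflicting I/O is the
 * callback made immediately and 0 returned.
 */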
int
g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
{
struct g_raid_lock *lp;
struct g_raid_softc *sc;
struct bio *bp;
sc = vol->v_softc;
LIST_FOREACH(lp, &vol->v_locks, l_next) {
if (lp->l_offset == off && lp->l_length == len) {
LIST_REMOVE(lp, l_next);
/* XXX
* Right now we just put them all back on the queue
* and hope for the best, since any requests that still
* fall within a locked range will simply be deferred
* again when the worker thread runs.
* XXX
*/
G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
(intmax_t)lp->l_offset,
(intmax_t)(lp->l_offset+lp->l_length));
mtx_lock(&sc->sc_queue_mtx);
while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
free(lp, M_RAID);
return (0);
}
}
return (EINVAL);
}
void
g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
{
struct g_consumer *cp;
struct g_raid_disk *disk, *tdisk;
bp->bio_caller1 = sd;
/*
* Make sure that the disk is present.  It is generally the job of the
* transformation layers not to send requests to absent disks, but it
* is better to be safe and report the situation than sorry.
*/
if (sd->sd_disk == NULL) {
G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
nodisk:
bp->bio_from = NULL;
bp->bio_to = NULL;
bp->bio_error = ENXIO;
g_raid_disk_done(bp);
return;
}
disk = sd->sd_disk;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_FAILED) {
G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
"wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
goto nodisk;
}
cp = disk->d_consumer;
bp->bio_from = cp;
bp->bio_to = cp->provider;
cp->index++;
/* Update the average disk loads. */
TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
if (tdisk->d_consumer == NULL)
tdisk->d_load = 0;
else
tdisk->d_load = (tdisk->d_consumer->index *
G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
}
disk->d_last_offset = bp->bio_offset + bp->bio_length;
if (dumping) {
G_RAID_LOGREQ(3, bp, "Sending dumping request.");
if (bp->bio_cmd == BIO_WRITE) {
bp->bio_error = g_raid_subdisk_kerneldump(sd,
bp->bio_data, 0, bp->bio_offset, bp->bio_length);
} else
bp->bio_error = EOPNOTSUPP;
g_raid_disk_done(bp);
} else {
bp->bio_done = g_raid_disk_done;
bp->bio_offset += sd->sd_offset;
G_RAID_LOGREQ(3, bp, "Sending request.");
g_io_request(bp, cp);
}
}
int
g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
if (sd->sd_disk == NULL)
return (ENXIO);
if (sd->sd_disk->d_kd.di.dumper == NULL)
return (EOPNOTSUPP);
return (dump_write(&sd->sd_disk->d_kd.di,
virtual, physical,
sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
length));
}
static void
g_raid_disk_done(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
sd = bp->bio_caller1;
sc = sd->sd_softc;
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
if (!dumping)
wakeup(sc);
}
static void
g_raid_disk_done_request(struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct g_raid_subdisk *sd;
struct g_raid_volume *vol;
g_topology_assert_not();
G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
sd = bp->bio_caller1;
sc = sd->sd_softc;
vol = sd->sd_volume;
if (bp->bio_from != NULL) {
bp->bio_from->index--;
disk = bp->bio_from->private;
if (disk == NULL)
g_raid_kill_consumer(sc, bp->bio_from);
}
bp->bio_offset -= sd->sd_offset;
G_RAID_TR_IODONE(vol->v_tr, sd, bp);
}
static void
g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
{
if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
else
ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
KASSERT(ep->e_error == 0,
("Error cannot be handled."));
g_raid_event_free(ep);
} else {
ep->e_flags |= G_RAID_EVENT_DONE;
G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
mtx_lock(&sc->sc_queue_mtx);
wakeup(ep);
mtx_unlock(&sc->sc_queue_mtx);
}
}
/*
* Worker thread.
*/
static void
g_raid_worker(void *arg)
{
struct g_raid_softc *sc;
struct g_raid_event *ep;
struct g_raid_volume *vol;
struct bio *bp;
struct timeval now, t;
int timeout, rv;
sc = arg;
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
thread_unlock(curthread);
sx_xlock(&sc->sc_lock);
for (;;) {
mtx_lock(&sc->sc_queue_mtx);
/*
* First take a look at events.
* It is important to handle events before any I/O requests.
*/
bp = NULL;
vol = NULL;
rv = 0;
ep = TAILQ_FIRST(&sc->sc_events);
if (ep != NULL)
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
;
else {
getmicrouptime(&now);
t = now;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (bioq_first(&vol->v_inflight) == NULL &&
vol->v_tr &&
timevalcmp(&vol->v_last_done, &t, < ))
t = vol->v_last_done;
}
timevalsub(&t, &now);
timeout = g_raid_idle_threshold +
t.tv_sec * 1000000 + t.tv_usec;
if (timeout > 0) {
/*
* Two steps to avoid overflows at HZ=1000
* and idle timeouts > 2.1s. Some rounding
* errors can occur, but they are < 1 tick,
* which is deemed to be close enough for
* this purpose.
*/
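/*
 * For example, at hz = 1000 micpertic is 1000, so a 3,000,000us
 * timeout becomes (3000000 + 999) / 1000 = 3000 ticks, presumably
 * avoiding the int overflow that a direct timeout * hz / 1000000
 * conversion would hit once timeout * hz exceeds INT_MAX
 * (roughly 2.1s worth of microseconds at hz = 1000).
 */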
int micpertic = 1000000 / hz;
timeout = (timeout + micpertic - 1) / micpertic;
sx_xunlock(&sc->sc_lock);
MSLEEP(rv, sc, &sc->sc_queue_mtx,
PRIBIO | PDROP, "-", timeout);
sx_xlock(&sc->sc_lock);
goto process;
} else
rv = EWOULDBLOCK;
}
mtx_unlock(&sc->sc_queue_mtx);
process:
if (ep != NULL) {
g_raid_handle_event(sc, ep);
} else if (bp != NULL) {
if (bp->bio_to != NULL &&
bp->bio_to->geom == sc->sc_geom)
g_raid_start_request(bp);
else
g_raid_disk_done_request(bp);
} else if (rv == EWOULDBLOCK) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
g_raid_clean(vol, -1);
if (bioq_first(&vol->v_inflight) == NULL &&
vol->v_tr) {
t.tv_sec = g_raid_idle_threshold / 1000000;
t.tv_usec = g_raid_idle_threshold % 1000000;
timevaladd(&t, &vol->v_last_done);
getmicrouptime(&now);
if (timevalcmp(&t, &now, <= )) {
G_RAID_TR_IDLE(vol->v_tr);
vol->v_last_done = now;
}
}
}
}
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
g_raid_destroy_node(sc, 1); /* May not return. */
}
}
static void
g_raid_poll(struct g_raid_softc *sc)
{
struct g_raid_event *ep;
struct bio *bp;
sx_xlock(&sc->sc_lock);
mtx_lock(&sc->sc_queue_mtx);
/*
* First take a look at events.
* It is important to handle events before any I/O requests.
*/
ep = TAILQ_FIRST(&sc->sc_events);
if (ep != NULL) {
TAILQ_REMOVE(&sc->sc_events, ep, e_next);
mtx_unlock(&sc->sc_queue_mtx);
g_raid_handle_event(sc, ep);
goto out;
}
bp = bioq_takefirst(&sc->sc_queue);
if (bp != NULL) {
mtx_unlock(&sc->sc_queue_mtx);
if (bp->bio_from == NULL ||
bp->bio_from->geom != sc->sc_geom)
g_raid_start_request(bp);
else
g_raid_disk_done_request(bp);
}
out:
sx_xunlock(&sc->sc_lock);
}
static void
g_raid_launch_provider(struct g_raid_volume *vol)
{
struct g_raid_disk *disk;
struct g_raid_subdisk *sd;
struct g_raid_softc *sc;
struct g_provider *pp;
char name[G_RAID_MAX_VOLUMENAME];
off_t off;
int i;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_LOCKED);
g_topology_lock();
/* Try to name provider with volume name. */
snprintf(name, sizeof(name), "raid/%s", vol->v_name);
if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
g_provider_by_name(name) != NULL) {
/* Otherwise use sequential volume number. */
snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
}
pp = g_new_providerf(sc->sc_geom, "%s", name);
pp->flags |= G_PF_DIRECT_RECEIVE;
if (vol->v_tr->tro_class->trc_accept_unmapped) {
pp->flags |= G_PF_ACCEPT_UNMAPPED;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
continue;
if ((sd->sd_disk->d_consumer->provider->flags &
G_PF_ACCEPT_UNMAPPED) == 0)
pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
}
}
pp->private = vol;
pp->mediasize = vol->v_mediasize;
pp->sectorsize = vol->v_sectorsize;
pp->stripesize = 0;
pp->stripeoffset = 0;
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL) {
pp->stripesize = disk->d_consumer->provider->stripesize;
off = disk->d_consumer->provider->stripeoffset;
pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
if (off > 0)
pp->stripeoffset %= off;
}
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
pp->stripesize *= (vol->v_disks_count - 1);
pp->stripeoffset *= (vol->v_disks_count - 1);
}
} else
pp->stripesize = vol->v_strip_size;
vol->v_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
pp->name, vol->v_name);
}
static void
g_raid_destroy_provider(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_provider *pp;
struct bio *bp, *tmp;
g_topology_assert_not();
sc = vol->v_softc;
pp = vol->v_provider;
KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
g_topology_lock();
g_error_provider(pp, ENXIO);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
if (bp->bio_to != pp)
continue;
bioq_remove(&sc->sc_queue, bp);
g_io_deliver(bp, ENXIO);
}
mtx_unlock(&sc->sc_queue_mtx);
G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
pp->name, vol->v_name);
g_wither_provider(pp, ENXIO);
g_topology_unlock();
vol->v_provider = NULL;
}
/*
* Update volume state.
*/
static int
g_raid_update_volume(struct g_raid_volume *vol, u_int event)
{
struct g_raid_softc *sc;
sc = vol->v_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
g_raid_volume_event2str(event),
vol->v_name);
switch (event) {
case G_RAID_VOLUME_E_DOWN:
if (vol->v_provider != NULL)
g_raid_destroy_provider(vol);
break;
case G_RAID_VOLUME_E_UP:
if (vol->v_provider == NULL)
g_raid_launch_provider(vol);
break;
case G_RAID_VOLUME_E_START:
if (vol->v_tr)
G_RAID_TR_START(vol->v_tr);
return (0);
default:
if (sc->sc_md)
G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
return (0);
}
/* Manage root mount release. */
if (vol->v_starting) {
vol->v_starting = 0;
G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
root_mount_rel(vol->v_rootmount);
vol->v_rootmount = NULL;
}
if (vol->v_stopping && vol->v_provider_open == 0)
g_raid_destroy_volume(vol);
return (0);
}
/*
* Update subdisk state.
*/
static int
g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
sc = sd->sd_softc;
vol = sd->sd_volume;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
g_raid_subdisk_event2str(event),
vol->v_name, sd->sd_pos,
sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
if (vol->v_tr)
G_RAID_TR_EVENT(vol->v_tr, sd, event);
return (0);
}
/*
* Update disk state.
*/
static int
g_raid_update_disk(struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = disk->d_softc;
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
g_raid_disk_event2str(event),
g_raid_get_diskname(disk));
if (sc->sc_md)
G_RAID_MD_EVENT(sc->sc_md, disk, event);
return (0);
}
/*
* Node event.
*/
static int
g_raid_update_node(struct g_raid_softc *sc, u_int event)
{
sx_assert(&sc->sc_lock, SX_XLOCKED);
G_RAID_DEBUG1(2, sc, "Event %s for the array.",
g_raid_node_event2str(event));
if (event == G_RAID_NODE_E_WAKE)
return (0);
if (sc->sc_md)
G_RAID_MD_EVENT(sc->sc_md, NULL, event);
return (0);
}
static int
g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
int dcw, opens, error = 0;
g_topology_assert();
sc = pp->geom->softc;
vol = pp->private;
KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
acr, acw, ace);
dcw = pp->acw + acw;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
/* Deny new opens while dying. */
if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
error = ENXIO;
goto out;
}
/* Deny write opens for read-only volumes. */
if (vol->v_read_only && acw > 0) {
error = EROFS;
goto out;
}
if (dcw == 0)
g_raid_clean(vol, dcw);
vol->v_provider_open += acr + acw + ace;
/* Handle delayed node destruction. */
if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
vol->v_provider_open == 0) {
/* Count open volumes. */
opens = g_raid_nopens(sc);
if (opens == 0) {
sc->sc_stopping = G_RAID_DESTROY_HARD;
/* Wake up worker to make it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
}
/* Handle open volume destruction. */
if (vol->v_stopping && vol->v_provider_open == 0)
g_raid_destroy_volume(vol);
out:
sx_xunlock(&sc->sc_lock);
g_topology_lock();
return (error);
}
struct g_raid_softc *
g_raid_create_node(struct g_class *mp,
const char *name, struct g_raid_md_object *md)
{
struct g_raid_softc *sc;
struct g_geom *gp;
int error;
g_topology_assert();
G_RAID_DEBUG(1, "Creating array %s.", name);
gp = g_new_geomf(mp, "%s", name);
sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
gp->start = g_raid_start;
gp->orphan = g_raid_orphan;
gp->access = g_raid_access;
gp->dumpconf = g_raid_dumpconf;
sc->sc_md = md;
sc->sc_geom = gp;
sc->sc_flags = 0;
TAILQ_INIT(&sc->sc_volumes);
TAILQ_INIT(&sc->sc_disks);
sx_init(&sc->sc_lock, "graid:lock");
mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
TAILQ_INIT(&sc->sc_events);
bioq_init(&sc->sc_queue);
gp->softc = sc;
error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
"g_raid %s", name);
if (error != 0) {
G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
mtx_destroy(&sc->sc_queue_mtx);
sx_destroy(&sc->sc_lock);
g_destroy_geom(sc->sc_geom);
free(sc, M_RAID);
return (NULL);
}
G_RAID_DEBUG1(0, sc, "Array %s created.", name);
return (sc);
}
struct g_raid_volume *
g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
{
struct g_raid_volume *vol, *vol1;
int i;
G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
vol->v_softc = sc;
strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
vol->v_state = G_RAID_VOLUME_S_STARTING;
vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
vol->v_rotate_parity = 1;
bioq_init(&vol->v_inflight);
bioq_init(&vol->v_locked);
LIST_INIT(&vol->v_locks);
for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
vol->v_subdisks[i].sd_softc = sc;
vol->v_subdisks[i].sd_volume = vol;
vol->v_subdisks[i].sd_pos = i;
vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
}
/* Find free ID for this volume. */
g_topology_lock();
vol1 = vol;
if (id >= 0) {
LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
if (vol1->v_global_id == id)
break;
}
}
if (vol1 != NULL) {
for (id = 0; ; id++) {
LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
if (vol1->v_global_id == id)
break;
}
if (vol1 == NULL)
break;
}
}
vol->v_global_id = id;
LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
g_topology_unlock();
/* Delay root mounting. */
vol->v_rootmount = root_mount_hold("GRAID");
G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
vol->v_starting = 1;
TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
return (vol);
}
struct g_raid_disk *
g_raid_create_disk(struct g_raid_softc *sc)
{
struct g_raid_disk *disk;
G_RAID_DEBUG1(1, sc, "Creating disk.");
disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
disk->d_softc = sc;
disk->d_state = G_RAID_DISK_S_NONE;
TAILQ_INIT(&disk->d_subdisks);
TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
return (disk);
}
int
g_raid_start_volume(struct g_raid_volume *vol)
{
struct g_raid_tr_class *class;
struct g_raid_tr_object *obj;
int status;
G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
if (!class->trc_enable)
continue;
G_RAID_DEBUG1(2, vol->v_softc,
"Tasting volume %s for %s transformation.",
vol->v_name, class->name);
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->tro_class = class;
obj->tro_volume = vol;
status = G_RAID_TR_TASTE(obj, vol);
if (status != G_RAID_TR_TASTE_FAIL)
break;
kobj_delete((kobj_t)obj, M_RAID);
}
if (class == NULL) {
G_RAID_DEBUG1(0, vol->v_softc,
"No transformation module found for %s.",
vol->v_name);
vol->v_tr = NULL;
g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
G_RAID_EVENT_VOLUME);
return (-1);
}
G_RAID_DEBUG1(2, vol->v_softc,
"Transformation module %s chosen for %s.",
class->name, vol->v_name);
vol->v_tr = obj;
return (0);
}
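/*
 * Transformation classes are tasted in list order; g_raid_tr_modevent()
 * below keeps g_raid_tr_classes sorted in ascending trc_priority order,
 * so classes with lower priority values get the first chance to claim
 * the volume.
 */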
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
struct g_raid_volume *vol, *tmpv;
struct g_raid_disk *disk, *tmpd;
int error = 0;
sc->sc_stopping = G_RAID_DESTROY_HARD;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
if (g_raid_destroy_volume(vol))
error = EBUSY;
}
if (error)
return (error);
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
if (g_raid_destroy_disk(disk))
error = EBUSY;
}
if (error)
return (error);
if (sc->sc_md) {
G_RAID_MD_FREE(sc->sc_md);
kobj_delete((kobj_t)sc->sc_md, M_RAID);
sc->sc_md = NULL;
}
if (sc->sc_geom != NULL) {
G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
g_topology_lock();
sc->sc_geom->softc = NULL;
g_wither_geom(sc->sc_geom, ENXIO);
g_topology_unlock();
sc->sc_geom = NULL;
} else
G_RAID_DEBUG(1, "Array destroyed.");
if (worker) {
g_raid_event_cancel(sc, sc);
mtx_destroy(&sc->sc_queue_mtx);
sx_xunlock(&sc->sc_lock);
sx_destroy(&sc->sc_lock);
wakeup(&sc->sc_stopping);
free(sc, M_RAID);
curthread->td_pflags &= ~TDP_GEOM;
G_RAID_DEBUG(1, "Thread exiting.");
kproc_exit(0);
} else {
/* Wake up worker to make it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
return (0);
}
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_disk *disk;
int i;
sc = vol->v_softc;
G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
vol->v_stopping = 1;
if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
if (vol->v_tr) {
G_RAID_TR_STOP(vol->v_tr);
return (EBUSY);
} else
vol->v_state = G_RAID_VOLUME_S_STOPPED;
}
if (g_raid_event_check(sc, vol) != 0)
return (EBUSY);
if (vol->v_provider != NULL)
return (EBUSY);
if (vol->v_provider_open != 0)
return (EBUSY);
if (vol->v_tr) {
G_RAID_TR_FREE(vol->v_tr);
kobj_delete((kobj_t)vol->v_tr, M_RAID);
vol->v_tr = NULL;
}
if (vol->v_rootmount)
root_mount_rel(vol->v_rootmount);
g_topology_lock();
LIST_REMOVE(vol, v_global_next);
g_topology_unlock();
TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
g_raid_event_cancel(sc, &vol->v_subdisks[i]);
disk = vol->v_subdisks[i].sd_disk;
if (disk == NULL)
continue;
TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
}
G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
if (sc->sc_md)
G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
g_raid_event_cancel(sc, vol);
free(vol, M_RAID);
if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
/* Wake up worker to let it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
}
return (0);
}
int
g_raid_destroy_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd, *tmp;
sc = disk->d_softc;
G_RAID_DEBUG1(2, sc, "Destroying disk.");
if (disk->d_consumer) {
g_raid_kill_consumer(sc, disk->d_consumer);
disk->d_consumer = NULL;
}
TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
G_RAID_EVENT_SUBDISK);
TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
sd->sd_disk = NULL;
}
TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
if (sc->sc_md)
G_RAID_MD_FREE_DISK(sc->sc_md, disk);
g_raid_event_cancel(sc, disk);
free(disk, M_RAID);
return (0);
}
int
g_raid_destroy(struct g_raid_softc *sc, int how)
{
int error, opens;
g_topology_assert_not();
if (sc == NULL)
return (ENXIO);
sx_assert(&sc->sc_lock, SX_XLOCKED);
/* Count open volumes. */
opens = g_raid_nopens(sc);
/* React on some opened volumes. */
if (opens > 0) {
switch (how) {
case G_RAID_DESTROY_SOFT:
G_RAID_DEBUG1(1, sc,
"%d volumes are still open.",
opens);
sx_xunlock(&sc->sc_lock);
return (EBUSY);
case G_RAID_DESTROY_DELAYED:
G_RAID_DEBUG1(1, sc,
"Array will be destroyed on last close.");
sc->sc_stopping = G_RAID_DESTROY_DELAYED;
sx_xunlock(&sc->sc_lock);
return (EBUSY);
case G_RAID_DESTROY_HARD:
G_RAID_DEBUG1(1, sc,
"%d volumes are still open.",
opens);
}
}
/* Mark node for destruction. */
sc->sc_stopping = G_RAID_DESTROY_HARD;
/* Wake up worker to let it self-destruct. */
g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
/* Sleep until node destroyed. */
error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
PRIBIO | PDROP, "r:destroy", hz * 3);
return (error == EWOULDBLOCK ? EBUSY : 0);
}
static void
g_raid_taste_orphan(struct g_consumer *cp)
{
KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
cp->provider->name));
}
static struct g_geom *
g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
struct g_consumer *cp;
struct g_geom *gp, *geom;
struct g_raid_md_class *class;
struct g_raid_md_object *obj;
int status;
g_topology_assert();
g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
if (!g_raid_enable)
return (NULL);
G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
geom = NULL;
status = G_RAID_MD_TASTE_FAIL;
gp = g_new_geomf(mp, "raid:taste");
/*
* This orphan function should never be called.
*/
gp->orphan = g_raid_taste_orphan;
cp = g_new_consumer(gp);
cp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
if (g_access(cp, 1, 0, 0) != 0)
goto ofail;
LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
if (!class->mdc_enable)
continue;
G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
pp->name, class->name);
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->mdo_class = class;
status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
if (status != G_RAID_MD_TASTE_NEW)
kobj_delete((kobj_t)obj, M_RAID);
if (status != G_RAID_MD_TASTE_FAIL)
break;
}
if (status == G_RAID_MD_TASTE_FAIL)
(void)g_access(cp, -1, 0, 0);
ofail:
g_detach(cp);
g_destroy_consumer(cp);
g_destroy_geom(gp);
G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
return (geom);
}
int
g_raid_create_node_format(const char *format, struct gctl_req *req,
struct g_geom **gp)
{
struct g_raid_md_class *class;
struct g_raid_md_object *obj;
int status;
G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
if (strcasecmp(class->name, format) == 0)
break;
}
if (class == NULL) {
G_RAID_DEBUG(1, "No support for %s metadata.", format);
return (G_RAID_MD_TASTE_FAIL);
}
obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
M_WAITOK);
obj->mdo_class = class;
status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
if (status != G_RAID_MD_TASTE_NEW)
kobj_delete((kobj_t)obj, M_RAID);
return (status);
}
static int
g_raid_destroy_geom(struct gctl_req *req __unused,
struct g_class *mp __unused, struct g_geom *gp)
{
struct g_raid_softc *sc;
int error;
g_topology_unlock();
sc = gp->softc;
sx_xlock(&sc->sc_lock);
g_cancel_event(sc);
error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
g_topology_lock();
return (error);
}
void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return;
if (sc->sc_md)
G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
}
void
g_raid_fail_disk(struct g_raid_softc *sc,
struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{
if (disk == NULL)
disk = sd->sd_disk;
if (disk == NULL) {
G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
return;
}
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
"wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
return;
}
if (sc->sc_md)
G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
}
static void
g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
struct g_consumer *cp, struct g_provider *pp)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
int i, s;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return;
if (pp != NULL) {
vol = pp->private;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
sc->sc_md->mdo_class->name,
g_raid_volume_level2str(vol->v_raid_level,
vol->v_raid_level_qualifier));
sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
vol->v_name);
sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
g_raid_volume_level2str(vol->v_raid_level,
vol->v_raid_level_qualifier));
sbuf_printf(sb,
"%s<Transformation>%s</Transformation>\n", indent,
vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
vol->v_disks_count);
sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
vol->v_strip_size);
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid_volume_state2str(vol->v_state));
sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
vol->v_dirty ? "Yes" : "No");
sbuf_printf(sb, "%s<Subdisks>", indent);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk != NULL &&
sd->sd_disk->d_consumer != NULL) {
sbuf_printf(sb, "%s ",
g_raid_get_diskname(sd->sd_disk));
} else {
sbuf_printf(sb, "NONE ");
}
sbuf_printf(sb, "(%s",
g_raid_subdisk_state2str(sd->sd_state));
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
sbuf_printf(sb, " %d%%",
(int)(sd->sd_rebuild_pos * 100 /
sd->sd_size));
}
sbuf_printf(sb, ")");
if (i + 1 < vol->v_disks_count)
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, "</Subdisks>\n");
sx_xunlock(&sc->sc_lock);
g_topology_lock();
} else if (cp != NULL) {
disk = cp->private;
if (disk == NULL)
return;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
sbuf_printf(sb, "%s<State>%s", indent,
g_raid_disk_state2str(disk->d_state));
if (!TAILQ_EMPTY(&disk->d_subdisks)) {
sbuf_printf(sb, " (");
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
sbuf_printf(sb, "%s",
g_raid_subdisk_state2str(sd->sd_state));
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
sbuf_printf(sb, " %d%%",
(int)(sd->sd_rebuild_pos * 100 /
sd->sd_size));
}
if (TAILQ_NEXT(sd, sd_next))
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, ")");
}
sbuf_printf(sb, "</State>\n");
sbuf_printf(sb, "%s<Subdisks>", indent);
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
sbuf_printf(sb, "r%d(%s):%d@%ju",
sd->sd_volume->v_global_id,
sd->sd_volume->v_name,
sd->sd_pos, sd->sd_offset);
if (TAILQ_NEXT(sd, sd_next))
sbuf_printf(sb, ", ");
}
sbuf_printf(sb, "</Subdisks>\n");
sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
disk->d_read_errs);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
} else {
g_topology_unlock();
sx_xlock(&sc->sc_lock);
if (sc->sc_md) {
sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
sc->sc_md->mdo_class->name);
}
if (!TAILQ_EMPTY(&sc->sc_volumes)) {
s = 0xff;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_state < s)
s = vol->v_state;
}
sbuf_printf(sb, "%s<State>%s</State>\n", indent,
g_raid_volume_state2str(s));
}
sx_xunlock(&sc->sc_lock);
g_topology_lock();
}
}
static void
g_raid_shutdown_post_sync(void *arg, int howto)
{
struct g_class *mp;
struct g_geom *gp, *gp2;
struct g_raid_softc *sc;
struct g_raid_volume *vol;
mp = arg;
g_topology_lock();
g_raid_shutdown = 1;
LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
if ((sc = gp->softc) == NULL)
continue;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
g_raid_clean(vol, -1);
g_cancel_event(sc);
g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
g_topology_lock();
}
g_topology_unlock();
}
static void
g_raid_init(struct g_class *mp)
{
g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
if (g_raid_post_sync == NULL)
G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
g_raid_started = 1;
}
static void
g_raid_fini(struct g_class *mp)
{
if (g_raid_post_sync != NULL)
EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
g_raid_started = 0;
}
int
g_raid_md_modevent(module_t mod, int type, void *arg)
{
struct g_raid_md_class *class, *c, *nc;
int error;
error = 0;
class = arg;
switch (type) {
case MOD_LOAD:
c = LIST_FIRST(&g_raid_md_classes);
if (c == NULL || c->mdc_priority > class->mdc_priority)
LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
else {
while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
nc->mdc_priority < class->mdc_priority)
c = nc;
LIST_INSERT_AFTER(c, class, mdc_list);
}
if (g_raid_started)
g_retaste(&g_raid_class);
break;
case MOD_UNLOAD:
LIST_REMOVE(class, mdc_list);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
int
g_raid_tr_modevent(module_t mod, int type, void *arg)
{
struct g_raid_tr_class *class, *c, *nc;
int error;
error = 0;
class = arg;
switch (type) {
case MOD_LOAD:
c = LIST_FIRST(&g_raid_tr_classes);
if (c == NULL || c->trc_priority > class->trc_priority)
LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
else {
while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
nc->trc_priority < class->trc_priority)
c = nc;
LIST_INSERT_AFTER(c, class, trc_list);
}
break;
case MOD_UNLOAD:
LIST_REMOVE(class, trc_list);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
* to reduce module priority, allowing submodules to register themselves first.
*/
static moduledata_t g_raid_mod = {
"g_raid",
g_modevent,
&g_raid_class
};
DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
MODULE_VERSION(geom_raid, 0);
Index: head/sys/geom/raid/md_ddf.c
===================================================================
--- head/sys/geom/raid/md_ddf.c (revision 327172)
+++ head/sys/geom/raid/md_ddf.c (revision 327173)
@@ -1,3097 +1,3087 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "geom/raid/md_ddf.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata");
#define DDF_MAX_DISKS_HARD 128
#define DDF_MAX_DISKS 16
#define DDF_MAX_VDISKS 7
#define DDF_MAX_PARTITIONS 1
#define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */
struct ddf_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_pd_record *pdr;
struct ddf_vd_record *vdr;
void *cr;
struct ddf_pdd_record *pdd;
struct ddf_bbm_log *bbm;
};
struct ddf_vol_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD];
};
struct g_raid_md_ddf_perdisk {
struct ddf_meta pd_meta;
};
struct g_raid_md_ddf_pervolume {
struct ddf_vol_meta pv_meta;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
struct g_raid_md_ddf_object {
struct g_raid_md_object mdio_base;
u_int mdio_bigendian;
struct ddf_meta mdio_meta;
int mdio_starting;
struct callout mdio_start_co; /* STARTING state timer. */
int mdio_started;
struct root_hold_token *mdio_rootmount; /* Root mount delay token. */
};
static g_raid_md_create_req_t g_raid_md_create_req_ddf;
static g_raid_md_taste_t g_raid_md_taste_ddf;
static g_raid_md_event_t g_raid_md_event_ddf;
static g_raid_md_volume_event_t g_raid_md_volume_event_ddf;
static g_raid_md_ctl_t g_raid_md_ctl_ddf;
static g_raid_md_write_t g_raid_md_write_ddf;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf;
static g_raid_md_free_disk_t g_raid_md_free_disk_ddf;
static g_raid_md_free_volume_t g_raid_md_free_volume_ddf;
static g_raid_md_free_t g_raid_md_free_ddf;
static kobj_method_t g_raid_md_ddf_methods[] = {
KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_ddf_class = {
"DDF",
g_raid_md_ddf_methods,
sizeof(struct g_raid_md_ddf_object),
.mdc_enable = 1,
.mdc_priority = 100
};
#define GET8(m, f) ((m)->f)
#define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f))
#define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f))
#define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f))
#define GET8D(m, f) (f)
#define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f))
#define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f))
#define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f))
#define GET8P(m, f) (*(f))
#define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f))
#define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f))
#define GET64P(m, f) ((m)->bigendian ? be64dec(f) : le64dec(f))
#define SET8P(m, f, v) \
(*(f) = (v))
#define SET16P(m, f, v) \
do { \
if ((m)->bigendian) \
be16enc((f), (v)); \
else \
le16enc((f), (v)); \
} while (0)
#define SET32P(m, f, v) \
do { \
if ((m)->bigendian) \
be32enc((f), (v)); \
else \
le32enc((f), (v)); \
} while (0)
#define SET64P(m, f, v) \
do { \
if ((m)->bigendian) \
be64enc((f), (v)); \
else \
le64enc((f), (v)); \
} while (0)
#define SET8(m, f, v) SET8P((m), &((m)->f), (v))
#define SET16(m, f, v) SET16P((m), &((m)->f), (v))
#define SET32(m, f, v) SET32P((m), &((m)->f), (v))
#define SET64(m, f, v) SET64P((m), &((m)->f), (v))
#define SET8D(m, f, v) SET8P((m), &(f), (v))
#define SET16D(m, f, v) SET16P((m), &(f), (v))
#define SET32D(m, f, v) SET32P((m), &(f), (v))
#define SET64D(m, f, v) SET64P((m), &(f), (v))
#define GETCRNUM(m) (GET32((m), hdr->cr_length) / \
GET16((m), hdr->Configuration_Record_Length))
#define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
#define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
static int
isff(uint8_t *buf, int size)
{
int i;
for (i = 0; i < size; i++)
if (buf[i] != 0xff)
return (0);
return (1);
}
static void
print_guid(uint8_t *buf)
{
int i, ascii;
ascii = 1;
for (i = 0; i < 24; i++) {
if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) {
ascii = 0;
break;
}
}
if (ascii) {
printf("'%.24s'", buf);
} else {
for (i = 0; i < 24; i++)
printf("%02x", buf[i]);
}
}
static void
g_raid_md_ddf_print(struct ddf_meta *meta)
{
struct ddf_vdc_record *vdc;
struct ddf_vuc_record *vuc;
struct ddf_sa_record *sa;
uint64_t *val2;
uint32_t val;
int i, j, k, num, num2;
if (g_raid_debug < 1)
return;
printf("********* DDF Metadata *********\n");
printf("**** Header ****\n");
printf("DDF_Header_GUID ");
print_guid(meta->hdr->DDF_Header_GUID);
printf("\n");
printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]);
printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number));
printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp));
printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag));
printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag));
printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping));
printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA));
printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA));
printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length));
printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA));
printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries));
printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries));
printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions));
printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length));
printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries));
printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length));
printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length));
printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length));
printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length));
printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length));
printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length));
printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length));
printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length));
printf("**** Controller Data ****\n");
printf("Controller_GUID ");
print_guid(meta->cdr->Controller_GUID);
printf("\n");
printf("Controller_Type 0x%04x%04x 0x%04x%04x\n",
GET16(meta, cdr->Controller_Type.Vendor_ID),
GET16(meta, cdr->Controller_Type.Device_ID),
GET16(meta, cdr->Controller_Type.SubVendor_ID),
GET16(meta, cdr->Controller_Type.SubDevice_ID));
printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]);
printf("**** Physical Disk Records ****\n");
printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs));
printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported));
for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) {
if (isff(meta->pdr->entry[j].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff)
continue;
printf("PD_GUID ");
print_guid(meta->pdr->entry[j].PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdr->entry[j].PD_Reference));
printf("PD_Type 0x%04x\n",
GET16(meta, pdr->entry[j].PD_Type));
printf("PD_State 0x%04x\n",
GET16(meta, pdr->entry[j].PD_State));
printf("Configured_Size %ju\n",
GET64(meta, pdr->entry[j].Configured_Size));
printf("Block_Size %u\n",
GET16(meta, pdr->entry[j].Block_Size));
}
printf("**** Virtual Disk Records ****\n");
printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs));
printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported));
for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) {
if (isff(meta->vdr->entry[j].VD_GUID, 24))
continue;
printf("VD_GUID ");
print_guid(meta->vdr->entry[j].VD_GUID);
printf("\n");
printf("VD_Number 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Number));
printf("VD_Type 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Type));
printf("VD_State 0x%02x\n",
GET8(meta, vdr->entry[j].VD_State));
printf("Init_State 0x%02x\n",
GET8(meta, vdr->entry[j].Init_State));
printf("Drive_Failures_Remaining %u\n",
GET8(meta, vdr->entry[j].Drive_Failures_Remaining));
printf("VD_Name '%.16s'\n",
(char *)&meta->vdr->entry[j].VD_Name);
}
printf("**** Configuration Records ****\n");
num = GETCRNUM(meta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(meta, j);
val = GET32D(meta, vdc->Signature);
switch (val) {
case DDF_VDCR_SIGNATURE:
printf("** Virtual Disk Configuration **\n");
printf("VD_GUID ");
print_guid(vdc->VD_GUID);
printf("\n");
printf("Timestamp 0x%08x\n",
GET32D(meta, vdc->Timestamp));
printf("Sequence_Number 0x%08x\n",
GET32D(meta, vdc->Sequence_Number));
printf("Primary_Element_Count %u\n",
GET16D(meta, vdc->Primary_Element_Count));
printf("Stripe_Size %u\n",
GET8D(meta, vdc->Stripe_Size));
printf("Primary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Primary_RAID_Level));
printf("RLQ 0x%02x\n",
GET8D(meta, vdc->RLQ));
printf("Secondary_Element_Count %u\n",
GET8D(meta, vdc->Secondary_Element_Count));
printf("Secondary_Element_Seq %u\n",
GET8D(meta, vdc->Secondary_Element_Seq));
printf("Secondary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Secondary_RAID_Level));
printf("Block_Count %ju\n",
GET64D(meta, vdc->Block_Count));
printf("VD_Size %ju\n",
GET64D(meta, vdc->VD_Size));
printf("Block_Size %u\n",
GET16D(meta, vdc->Block_Size));
printf("Rotate_Parity_count %u\n",
GET8D(meta, vdc->Rotate_Parity_count));
printf("Associated_Spare_Disks");
for (i = 0; i < 8; i++) {
if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff)
printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i]));
}
printf("\n");
printf("Cache_Flags %016jx\n",
GET64D(meta, vdc->Cache_Flags));
printf("BG_Rate %u\n",
GET8D(meta, vdc->BG_Rate));
printf("MDF_Parity_Disks %u\n",
GET8D(meta, vdc->MDF_Parity_Disks));
printf("MDF_Parity_Generator_Polynomial 0x%04x\n",
GET16D(meta, vdc->MDF_Parity_Generator_Polynomial));
printf("MDF_Constant_Generation_Method 0x%02x\n",
GET8D(meta, vdc->MDF_Constant_Generation_Method));
printf("Physical_Disks ");
num2 = GET16D(meta, vdc->Primary_Element_Count);
val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]);
for (i = 0; i < num2; i++)
printf(" 0x%08x @ %ju",
GET32D(meta, vdc->Physical_Disk_Sequence[i]),
GET64P(meta, val2 + i));
printf("\n");
break;
case DDF_VUCR_SIGNATURE:
printf("** Vendor Unique Configuration **\n");
vuc = (struct ddf_vuc_record *)vdc;
printf("VD_GUID ");
print_guid(vuc->VD_GUID);
printf("\n");
break;
case DDF_SA_SIGNATURE:
printf("** Spare Assignment Configuration **\n");
sa = (struct ddf_sa_record *)vdc;
printf("Timestamp 0x%08x\n",
GET32D(meta, sa->Timestamp));
printf("Spare_Type 0x%02x\n",
GET8D(meta, sa->Spare_Type));
printf("Populated_SAEs %u\n",
GET16D(meta, sa->Populated_SAEs));
printf("MAX_SAE_Supported %u\n",
GET16D(meta, sa->MAX_SAE_Supported));
for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) {
if (isff(sa->entry[i].VD_GUID, 24))
continue;
printf("VD_GUID ");
for (k = 0; k < 24; k++)
printf("%02x", sa->entry[i].VD_GUID[k]);
printf("\n");
printf("Secondary_Element %u\n",
GET16D(meta, sa->entry[i].Secondary_Element));
}
break;
case 0x00000000:
case 0xFFFFFFFF:
break;
default:
printf("Unknown configuration signature %08x\n", val);
break;
}
}
printf("**** Physical Disk Data ****\n");
printf("PD_GUID ");
print_guid(meta->pdd->PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdd->PD_Reference));
printf("Forced_Ref_Flag 0x%02x\n",
GET8(meta, pdd->Forced_Ref_Flag));
printf("Forced_PD_GUID_Flag 0x%02x\n",
GET8(meta, pdd->Forced_PD_GUID_Flag));
}
static int
ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference)
{
int i;
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0)
return (i);
} else if (PD_Reference != 0xffffffff) {
if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference)
return (i);
} else
if (isff(meta->pdr->entry[i].PD_GUID, 24))
return (i);
}
if (GUID == NULL && PD_Reference == 0xffffffff) {
if (i >= GET16(meta, pdr->Max_PDE_Supported))
return (-1);
SET16(meta, pdr->Populated_PDEs, i + 1);
return (i);
}
return (-1);
}
static int
ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID)
{
int i;
for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0)
return (i);
} else
if (isff(meta->vdr->entry[i].VD_GUID, 24))
return (i);
}
if (GUID == NULL) {
if (i >= GET16(meta, vdr->Max_VDE_Supported))
return (-1);
SET16(meta, vdr->Populated_VDEs, i + 1);
return (i);
}
return (-1);
}
static struct ddf_vdc_record *
ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GUID != NULL) {
if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE &&
memcmp(vdc->VD_GUID, GUID, 24) == 0)
return (vdc);
} else
if (GET32D(meta, vdc->Signature) == 0xffffffff ||
GET32D(meta, vdc->Signature) == 0)
return (vdc);
}
return (NULL);
}
static int
ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num, cnt;
cnt = 0;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0)
cnt++;
}
return (cnt);
}
static int
ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference,
int *bvdp, int *posp)
{
int i, bvd, pos;
i = 0;
for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) {
if (vmeta->bvdc[bvd] == NULL) {
i += GET16(vmeta, vdc->Primary_Element_Count); // XXX
continue;
}
for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count);
pos++, i++) {
if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) ==
PD_Reference) {
if (bvdp != NULL)
*bvdp = bvd;
if (posp != NULL)
*posp = pos;
return (i);
}
}
}
return (-1);
}
static struct ddf_sa_record *
ddf_meta_find_sa(struct ddf_meta *meta, int create)
{
struct ddf_sa_record *sa;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE)
return (sa);
}
if (create) {
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == 0xffffffff ||
GET32D(meta, sa->Signature) == 0)
return (sa);
}
}
return (NULL);
}
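/*
* Initialize fresh per-disk DDF metadata.  Section sizes are either scaled
* from the provided sample (to cope with a differing sector size) or built
* from scratch with default limits; the sections are then laid out back to
* back after the header, and the anchor is placed at the last LBA of the
* provider.
*/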
static void
ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *meta;
struct ddf_pd_entry *pde;
off_t anchorlba;
u_int ss, pos, size;
int len, error;
char serial_buffer[24];
if (sample->hdr == NULL)
sample = NULL;
mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
ss = disk->d_consumer->provider->sectorsize;
anchorlba = disk->d_consumer->provider->mediasize / ss - 1;
meta->sectorsize = ss;
meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian;
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
/* Header */
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memset(meta->hdr, 0xff, ss);
if (sample) {
memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header));
if (ss != sample->sectorsize) {
SET32(meta, hdr->WorkSpace_Length,
howmany(GET32(sample, hdr->WorkSpace_Length) *
sample->sectorsize, ss));
SET16(meta, hdr->Configuration_Record_Length,
howmany(GET16(sample,
hdr->Configuration_Record_Length) *
sample->sectorsize, ss));
SET32(meta, hdr->cd_length,
howmany(GET32(sample, hdr->cd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdr_length,
howmany(GET32(sample, hdr->pdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->vdr_length,
howmany(GET32(sample, hdr->vdr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->cr_length,
howmany(GET32(sample, hdr->cr_length) *
sample->sectorsize, ss));
SET32(meta, hdr->pdd_length,
howmany(GET32(sample, hdr->pdd_length) *
sample->sectorsize, ss));
SET32(meta, hdr->bbmlog_length,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Diagnostic_Space,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
SET32(meta, hdr->Vendor_Specific_Logs,
howmany(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize, ss));
}
} else {
SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE);
snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x",
(u_int)(ts.tv_sec - DECADE), arc4random());
memcpy(meta->hdr->DDF_rev, "02.00.00", 8);
SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE));
SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss);
SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1);
SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS);
SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS);
SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS);
SET16(meta, hdr->Configuration_Record_Length,
howmany(sizeof(struct ddf_vdc_record) + (4 + 8) *
GET16(meta, hdr->Max_Primary_Element_Entries), ss));
SET32(meta, hdr->cd_length,
howmany(sizeof(struct ddf_cd_record), ss));
SET32(meta, hdr->pdr_length,
howmany(sizeof(struct ddf_pd_record) +
sizeof(struct ddf_pd_entry) * GET16(meta,
hdr->Max_PD_Entries), ss));
SET32(meta, hdr->vdr_length,
howmany(sizeof(struct ddf_vd_record) +
sizeof(struct ddf_vd_entry) *
GET16(meta, hdr->Max_VD_Entries), ss));
SET32(meta, hdr->cr_length,
GET16(meta, hdr->Configuration_Record_Length) *
(GET16(meta, hdr->Max_Partitions) + 1));
SET32(meta, hdr->pdd_length,
howmany(sizeof(struct ddf_pdd_record), ss));
SET32(meta, hdr->bbmlog_length, 0);
SET32(meta, hdr->Diagnostic_Space_Length, 0);
SET32(meta, hdr->Vendor_Specific_Logs_Length, 0);
}
pos = 1;
SET32(meta, hdr->cd_section, pos);
pos += GET32(meta, hdr->cd_length);
SET32(meta, hdr->pdr_section, pos);
pos += GET32(meta, hdr->pdr_length);
SET32(meta, hdr->vdr_section, pos);
pos += GET32(meta, hdr->vdr_length);
SET32(meta, hdr->cr_section, pos);
pos += GET32(meta, hdr->cr_length);
SET32(meta, hdr->pdd_section, pos);
pos += GET32(meta, hdr->pdd_length);
SET32(meta, hdr->bbmlog_section,
GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->bbmlog_length);
SET32(meta, hdr->Diagnostic_Space,
GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->Diagnostic_Space_Length);
SET32(meta, hdr->Vendor_Specific_Logs,
GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff);
pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1);
SET64(meta, hdr->Primary_Header_LBA,
anchorlba - pos);
SET64(meta, hdr->Secondary_Header_LBA,
0xffffffffffffffffULL);
SET64(meta, hdr->WorkSpace_LBA,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
/* Controller Data */
size = GET32(meta, hdr->cd_length) * ss;
meta->cdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cdr, 0xff, size);
SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE);
memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24);
memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16);
/* Physical Drive Records. */
size = GET32(meta, hdr->pdr_length) * ss;
meta->pdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdr, 0xff, size);
SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE);
SET16(meta, pdr->Populated_PDEs, 1);
SET16(meta, pdr->Max_PDE_Supported,
GET16(meta, hdr->Max_PD_Entries));
pde = &meta->pdr->entry[0];
len = sizeof(serial_buffer);
error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer);
if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20)
snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer);
else
snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xffff);
SET32D(meta, pde->PD_Reference, arc4random());
SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE);
SET16D(meta, pde->PD_State, 0);
SET64D(meta, pde->Configured_Size,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
SET16D(meta, pde->Block_Size, ss);
/* Virtual Drive Records. */
size = GET32(meta, hdr->vdr_length) * ss;
meta->vdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdr, 0xff, size);
SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE);
SET32(meta, vdr->Populated_VDEs, 0);
SET16(meta, vdr->Max_VDE_Supported,
GET16(meta, hdr->Max_VD_Entries));
/* Configuration Records. */
size = GET32(meta, hdr->cr_length) * ss;
meta->cr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cr, 0xff, size);
/* Physical Disk Data. */
size = GET32(meta, hdr->pdd_length) * ss;
meta->pdd = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdd, 0xff, size);
SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE);
memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24);
SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference));
SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF);
SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID);
/* Bad Block Management Log. */
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
meta->bbm = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->bbm, 0xff, size);
SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE);
SET32(meta, bbm->Entry_Count, 0);
SET32(meta, bbm->Spare_Block_Count, 0);
}
}
static void
ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src)
{
- struct ddf_header *hdr;
u_int ss;
- hdr = src->hdr;
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss);
dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss);
dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss);
dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss);
if (src->bbm != NULL) {
dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss);
}
}
static void
ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src)
{
struct ddf_pd_entry *pde, *spde;
int i, j;
for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) {
spde = &src->pdr->entry[i];
if (isff(spde->PD_GUID, 24))
continue;
j = ddf_meta_find_pd(meta, NULL,
GET32(src, pdr->entry[i].PD_Reference));
if (j < 0) {
j = ddf_meta_find_pd(meta, NULL, 0xffffffff);
pde = &meta->pdr->entry[j];
memcpy(pde, spde, sizeof(*pde));
} else {
pde = &meta->pdr->entry[j];
SET16D(meta, pde->PD_State,
GET16D(meta, pde->PD_State) |
GET16D(src, pde->PD_State));
}
}
}
static void
ddf_meta_free(struct ddf_meta *meta)
{
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->pdr != NULL) {
free(meta->pdr, M_MD_DDF);
meta->pdr = NULL;
}
if (meta->vdr != NULL) {
free(meta->vdr, M_MD_DDF);
meta->vdr = NULL;
}
if (meta->cr != NULL) {
free(meta->cr, M_MD_DDF);
meta->cr = NULL;
}
if (meta->pdd != NULL) {
free(meta->pdd, M_MD_DDF);
meta->pdd = NULL;
}
if (meta->bbm != NULL) {
free(meta->bbm, M_MD_DDF);
meta->bbm = NULL;
}
}
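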
static void
ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
- struct ddf_header *hdr;
u_int ss, size;
- hdr = sample->hdr;
meta->bigendian = sample->bigendian;
ss = meta->sectorsize = sample->sectorsize;
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, sample->hdr, ss);
meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss);
meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry));
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xf);
size = GET16(sample, hdr->Configuration_Record_Length) * ss;
meta->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdc, 0xff, size);
SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE);
memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24);
SET32(meta, vdc->Sequence_Number, 0);
}
static void
ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src,
uint8_t *GUID, int started)
{
- struct ddf_header *hdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
int vnew, bvnew, bvd, size;
u_int ss;
- hdr = src->hdr;
vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)];
vdc = ddf_meta_find_vdc(src, GUID);
if (GET8D(src, vdc->Secondary_Element_Count) == 1)
bvd = 0;
else
bvd = GET8D(src, vdc->Secondary_Element_Seq);
size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize;
if (dst->vdc == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, vdc->Sequence_Number))) > 0))
vnew = 1;
else
vnew = 0;
if (dst->bvdc[bvd] == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, bvdc[bvd]->Sequence_Number))) > 0))
bvnew = 1;
else
bvnew = 0;
if (vnew) {
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
if (dst->hdr != NULL)
free(dst->hdr, M_MD_DDF);
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
if (dst->cdr != NULL)
free(dst->cdr, M_MD_DDF);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
if (dst->vde != NULL)
free(dst->vde, M_MD_DDF);
dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry));
if (dst->vdc != NULL)
free(dst->vdc, M_MD_DDF);
dst->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->vdc, vdc, size);
}
if (bvnew) {
if (dst->bvdc[bvd] != NULL)
free(dst->bvdc[bvd], M_MD_DDF);
dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->bvdc[bvd], vdc, size);
}
}
static void
ddf_vol_meta_free(struct ddf_vol_meta *meta)
{
int i;
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->vde != NULL) {
free(meta->vde, M_MD_DDF);
meta->vde = NULL;
}
if (meta->vdc != NULL) {
free(meta->vdc, M_MD_DDF);
meta->vdc = NULL;
}
for (i = 0; i < DDF_MAX_DISKS_HARD; i++) {
if (meta->bvdc[i] != NULL) {
free(meta->bvdc[i], M_MD_DDF);
meta->bvdc[i] = NULL;
}
}
}
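/*
* Find the largest extent of this disk not claimed by any virtual disk
* configuration record.  Start from the whole configured size, carve out
* the range used by every VDC referencing this disk, and return the
* biggest remaining gap via *off and *size (in sectors).
*/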
static int
ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size)
{
struct ddf_vdc_record *vdc;
off_t beg[32], end[32], beg1, end1;
uint64_t *offp;
int i, j, n, num, pos;
uint32_t ref;
*off = 0;
*size = 0;
ref = GET32(meta, pdd->PD_Reference);
pos = ddf_meta_find_pd(meta, NULL, ref);
beg[0] = 0;
end[0] = GET64(meta, pdr->entry[pos].Configured_Size);
n = 1;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++)
if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref)
break;
if (pos == GET16D(meta, vdc->Primary_Element_Count))
continue;
offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[
GET16(meta, hdr->Max_Primary_Element_Entries)]);
beg1 = GET64P(meta, offp + pos);
end1 = beg1 + GET64D(meta, vdc->Block_Count);
for (j = 0; j < n; j++) {
if (beg[j] >= end1 || end[j] <= beg1 )
continue;
if (beg[j] < beg1 && end[j] > end1) {
beg[n] = end1;
end[n] = end[j];
end[j] = beg1;
n++;
} else if (beg[j] < beg1)
end[j] = beg1;
else
beg[j] = end1;
}
}
for (j = 0; j < n; j++) {
if (end[j] - beg[j] > *size) {
*off = beg[j];
*size = end[j] - beg[j];
}
}
return ((*size > 0) ? 1 : 0);
}
static void
ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf)
{
const char *b;
int i;
b = meta->vdr->entry[num].VD_Name;
for (i = 15; i >= 0; i--)
if (b[i] != 0x20)
break;
memcpy(buf, b, i + 1);
buf[i + 1] = 0;
}
static void
ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf)
{
int len;
len = min(strlen(buf), 16);
memset(meta->vde->VD_Name, 0x20, 16);
memcpy(meta->vde->VD_Name, buf, len);
}
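/*
* Read and validate DDF metadata from a provider: fetch the anchor header
* from the last sector, determine endianness from its signature, then read
* the primary header and all sections it describes, falling back to the
* secondary header if the primary copy is unreadable or fails its checks.
*/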
static int
ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_header *ahdr, *hdr;
char *abuf, *buf;
off_t plba, slba, lba;
int error, len, i;
u_int ss;
uint32_t val;
ddf_meta_free(meta);
pp = cp->provider;
ss = meta->sectorsize = pp->sectorsize;
/* Read anchor block. */
abuf = g_read_data(cp, pp->mediasize - ss, ss, &error);
if (abuf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (error);
}
ahdr = (struct ddf_header *)abuf;
/* Check if this is a DDF RAID struct. */
if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 1;
else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 0;
else {
G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if (ahdr->Header_Type != DDF_HEADER_ANCHOR) {
G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name);
error = EINVAL;
goto done;
}
meta->hdr = ahdr;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
meta->hdr = NULL;
if (crc32(ahdr, ss) != val) {
G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name);
error = EINVAL;
goto done;
}
if ((plba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
if (slba != -1 && (slba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
lba = plba;
doread:
error = 0;
ddf_meta_free(meta);
/* Read header block. */
buf = g_read_data(cp, lba * ss, ss, &error);
if (buf == NULL) {
readerror:
G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).",
(lba == plba) ? "primary" : "secondary", pp->name, error);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name);
goto done;
}
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, buf, ss);
g_free(buf);
hdr = meta->hdr;
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
if (hdr->Signature != ahdr->Signature ||
crc32(meta->hdr, ss) != val ||
memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) ||
GET64(meta, hdr->Primary_Header_LBA) != plba ||
GET64(meta, hdr->Secondary_Header_LBA) != slba) {
hdrerror:
G_RAID_DEBUG(1, "DDF %s metadata check failed on %s",
(lba == plba) ? "primary" : "secondary", pp->name);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) ||
(lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY))
goto hdrerror;
len = 1;
len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length));
len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length));
len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length));
len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length));
len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length));
if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->bbmlog_length));
if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length));
if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length));
if ((plba + len) * ss >= pp->mediasize)
goto hdrerror;
if (slba != -1 && (slba + len) * ss >= pp->mediasize)
goto hdrerror;
/* Workaround for Adaptec implementation. */
if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) {
SET16(meta, hdr->Max_Primary_Element_Entries,
min(GET16(meta, hdr->Max_PD_Entries),
(GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12));
}
if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS ||
GET32(meta, hdr->pdr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->vdr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->cr_length) * ss >= MAXPHYS ||
GET32(meta, hdr->pdd_length) * ss >= MAXPHYS ||
GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
goto hdrerror;
}
/* Read controller data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
GET32(meta, hdr->cd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss);
g_free(buf);
if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE)
goto hdrerror;
/* Read physical disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
GET32(meta, hdr->pdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss);
g_free(buf);
if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE)
goto hdrerror;
/*
* Workaround for reading metadata corrupted due to graid bug.
* XXX: Remove this before we have disks above 128PB. :)
*/
if (meta->bigendian) {
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (isff(meta->pdr->entry[i].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[i].PD_Reference) ==
0xffffffff)
continue;
if (GET64(meta, pdr->entry[i].Configured_Size) >=
(1ULL << 48)) {
SET16(meta, pdr->entry[i].PD_State,
GET16(meta, pdr->entry[i].PD_State) &
~DDF_PDE_FAILED);
SET64(meta, pdr->entry[i].Configured_Size,
GET64(meta, pdr->entry[i].Configured_Size) &
((1ULL << 48) - 1));
}
}
}
/* Read virtual disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
GET32(meta, hdr->vdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss);
g_free(buf);
if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE)
goto hdrerror;
/* Read configuration records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
GET32(meta, hdr->cr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss);
g_free(buf);
/* Read physical disk data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
GET32(meta, hdr->pdd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss);
g_free(buf);
if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE)
goto hdrerror;
i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference));
if (i < 0)
goto hdrerror;
/* Read BBM Log. */
if (GET32(meta, hdr->bbmlog_section) != 0xffffffff &&
GET32(meta, hdr->bbmlog_length) != 0) {
buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss,
GET32(meta, hdr->bbmlog_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss);
g_free(buf);
if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE)
goto hdrerror;
}
done:
g_free(abuf);
if (error != 0)
ddf_meta_free(meta);
return (error);
}
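/*
* Write DDF metadata to a provider.  The header goes out three times: as
* the anchor in the last sector, then as the primary and (if present)
* secondary copies, each of the latter followed by the metadata sections.
* All CRCs are recomputed just before writing.
*/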
static int
ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_vdc_record *vdc;
off_t alba, plba, slba, lba;
u_int ss, size;
int error, i, num;
pp = cp->provider;
ss = pp->sectorsize;
lba = alba = pp->mediasize / ss - 1;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
next:
SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR :
(lba == plba) ? DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY);
SET32(meta, hdr->CRC, 0xffffffff);
SET32(meta, hdr->CRC, crc32(meta->hdr, ss));
error = g_write_data(cp, lba * ss, meta->hdr, ss);
if (error != 0) {
err:
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
if (lba != alba)
goto done;
}
if (lba == alba) {
lba = plba;
goto next;
}
size = GET32(meta, hdr->cd_length) * ss;
SET32(meta, cdr->CRC, 0xffffffff);
SET32(meta, cdr->CRC, crc32(meta->cdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
meta->cdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdr_length) * ss;
SET32(meta, pdr->CRC, 0xffffffff);
SET32(meta, pdr->CRC, crc32(meta->pdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
meta->pdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->vdr_length) * ss;
SET32(meta, vdr->CRC, 0xffffffff);
SET32(meta, vdr->CRC, crc32(meta->vdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
meta->vdr, size);
if (error != 0)
goto err;
size = GET16(meta, hdr->Configuration_Record_Length) * ss;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
SET32D(meta, vdc->CRC, 0xffffffff);
SET32D(meta, vdc->CRC, crc32(vdc, size));
}
error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
meta->cr, size * num);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdd_length) * ss;
SET32(meta, pdd->CRC, 0xffffffff);
SET32(meta, pdd->CRC, crc32(meta->pdd, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
meta->pdd, size);
if (error != 0)
goto err;
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
SET32(meta, bbm->CRC, 0xffffffff);
SET32(meta, bbm->CRC, crc32(meta->bbm, size));
error = g_write_data(cp,
(lba + GET32(meta, hdr->bbmlog_section)) * ss,
meta->bbm, size);
if (error != 0)
goto err;
}
done:
if (lba == plba && slba != -1) {
lba = slba;
goto next;
}
return (error);
}
static int
ddf_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error;
pp = cp->provider;
buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO);
error = g_write_data(cp, pp->mediasize - pp->sectorsize,
buf, pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_DDF);
return (error);
}
static struct g_raid_volume *
g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID)
{
struct g_raid_volume *vol;
struct g_raid_md_ddf_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0)
break;
}
return (vol);
}
static struct g_raid_disk *
g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id)
{
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct ddf_meta *meta;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
if (GUID != NULL) {
if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0)
break;
} else {
if (GET32(meta, pdd->PD_Reference) == id)
break;
}
}
return (disk);
}
static int
g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
- struct g_raid_md_ddf_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
- pv = vol->v_md_data;
if (vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_ddf_purge_disks(struct g_raid_softc *sc)
{
#if 0
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_ddf_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_ddf_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_DDF);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left - erase and delete disk. */
if (pd->pd_subdisks == 0) {
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
#endif
return (0);
}
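/*
* Check whether the given RAID level/qualifier/disk count combination can
* be expressed in DDF metadata; the "force" flag relaxes some of the
* minimum disk count requirements.
*/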
static int
g_raid_md_ddf_supported(int level, int qual, int disks, int force)
{
if (disks > DDF_MAX_DISKS_HARD)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (qual == G_RAID_VOLUME_RLQ_R1SM) {
if (!force && disks != 2)
return (0);
} else if (qual == G_RAID_VOLUME_RLQ_R1MM) {
if (!force && disks != 3)
return (0);
} else
return (0);
break;
case G_RAID_VOLUME_RL_RAID3:
if (qual != G_RAID_VOLUME_RLQ_R3P0 &&
qual != G_RAID_VOLUME_RLQ_R3PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID4:
if (qual != G_RAID_VOLUME_RLQ_R4P0 &&
qual != G_RAID_VOLUME_RLQ_R4PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (qual != G_RAID_VOLUME_RLQ_R5RA &&
qual != G_RAID_VOLUME_RLQ_R5RS &&
qual != G_RAID_VOLUME_RLQ_R5LA &&
qual != G_RAID_VOLUME_RLQ_R5LS)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID6:
if (qual != G_RAID_VOLUME_RLQ_R6RA &&
qual != G_RAID_VOLUME_RLQ_R6RS &&
qual != G_RAID_VOLUME_RLQ_R6LA &&
qual != G_RAID_VOLUME_RLQ_R6LS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual != G_RAID_VOLUME_RLQ_RMDFRA &&
qual != G_RAID_VOLUME_RLQ_RMDFRS &&
qual != G_RAID_VOLUME_RLQ_RMDFLA &&
qual != G_RAID_VOLUME_RLQ_RMDFLS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (qual != G_RAID_VOLUME_RLQ_R1EA &&
qual != G_RAID_VOLUME_RLQ_R1EO)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5E:
if (qual != G_RAID_VOLUME_RLQ_R5ERA &&
qual != G_RAID_VOLUME_RLQ_R5ERS &&
qual != G_RAID_VOLUME_RLQ_R5ELA &&
qual != G_RAID_VOLUME_RLQ_R5ELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5EE:
if (qual != G_RAID_VOLUME_RLQ_R5EERA &&
qual != G_RAID_VOLUME_RLQ_R5EERS &&
qual != G_RAID_VOLUME_RLQ_R5EELA &&
qual != G_RAID_VOLUME_RLQ_R5EELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5R:
if (qual != G_RAID_VOLUME_RLQ_R5RRA &&
qual != G_RAID_VOLUME_RLQ_R5RRS &&
qual != G_RAID_VOLUME_RLQ_R5RLA &&
qual != G_RAID_VOLUME_RLQ_R5RLS)
return (0);
if (disks < 3)
return (0);
break;
default:
return (0);
}
return (1);
}
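/*
* Place a disk into a volume.  If the disk is listed in the volume's
* configuration it takes its recorded position; otherwise, once the volume
* has started, try to reuse it as a replacement ("resurrection") provided
* it has a free partition slot and a large enough unused extent.  The
* resulting subdisk state reflects the PD/VD state flags from metadata.
*/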
static int
g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
struct ddf_meta *pdmeta, *gmeta;
struct ddf_vdc_record *vdc1;
struct ddf_sa_record *sa;
off_t size, eoff = 0, esize = 0;
uint64_t *val2;
int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos;
int i, resurrection = 0;
uint32_t reference;
sc = disk->d_softc;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
reference = GET32(&pd->pd_meta, pdd->PD_Reference);
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
gmeta = &mdi->mdio_meta;
/* Find disk position in metadata by its reference. */
disk_pos = ddf_meta_find_disk(vmeta, reference,
&md_disk_bvd, &md_disk_pos);
md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference);
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc,
"Disk %s is not a present part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* Failed stale disk is useless for us. */
if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If the disk has some metadata for this volume, erase it. */
if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL)
SET32D(pdmeta, vdc1->Signature, 0xffffffff);
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started - try to get use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
G_RAID_DEBUG1(1, sc, "No free partitions on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
eoff *= pd->pd_meta.sectorsize;
esize *= pd->pd_meta.sectorsize;
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
esize < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk), esize, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size;
md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX
md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX
} else {
nofit:
if (disk->d_state == G_RAID_DISK_S_NONE)
g_raid_change_disk_state(disk,
G_RAID_DISK_S_STALE);
return (0);
}
/*
* If spare is committable, delete spare record.
* Otherwise, mark it active and leave it there.
*/
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa != NULL) {
if ((GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_REVERTIBLE) == 0) {
SET32D(&pd->pd_meta, sa->Signature, 0xffffffff);
} else {
SET8D(&pd->pd_meta, sa->Spare_Type,
GET8D(&pd->pd_meta, sa->Spare_Type) |
DDF_SAR_TYPE_ACTIVE);
}
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = eoff;
sd->sd_size = esize;
} else if (pdmeta->cr != NULL &&
(vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) {
val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512;
sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512;
}
if (resurrection) {
/* Stale disk, almost same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) &
(DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
sd->sd_rebuild_pos = 0;
} else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 ||
(GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) !=
DDF_VDE_INIT_FULL) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
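/*
* Scan started volumes for missing or failed subdisks and try to refill
* them with any suitable disk (spare or freshly connected) that still has
* free configuration record slots, writing updated metadata after every
* successful placement.
*/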
static void
g_raid_md_ddf_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) <
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
update = g_raid_md_ddf_start_disk(disk, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_ddf(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
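/*
* Start a volume: translate its virtual disk configuration record into
* GEOM RAID volume parameters (RAID level, strip size, disk count, size,
* per-subdisk offsets), attach all disks discovered so far, and announce
* the volume.
*/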
static void
g_raid_md_ddf_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
- struct ddf_vdc_record *vdc;
uint64_t *val2;
int i, j, bvd;
sc = vol->v_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
- vdc = vmeta->vdc;
vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level);
vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ);
if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 &&
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
GET8(vmeta, vdc->Secondary_RAID_Level) == 0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
vol->v_sectorsize = GET16(vmeta, vdc->Block_Size);
if (vol->v_sectorsize == 0xffff)
vol->v_sectorsize = vmeta->sectorsize;
vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size);
vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) *
GET8(vmeta, vdc->Secondary_Element_Count);
vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks);
vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial);
vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method);
if (GET8(vmeta, vdc->Rotate_Parity_count) > 31)
vol->v_rotate_parity = 1;
else
vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count);
vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize;
for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) {
if (j == GET16(vmeta, vdc->Primary_Element_Count)) {
j = 0;
bvd++;
}
sd = &vol->v_subdisks[i];
if (vmeta->bvdc[bvd] == NULL) {
sd->sd_offset = 0;
sd->sd_size = GET64(vmeta, vdc->Block_Count) *
vol->v_sectorsize;
continue;
}
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize;
sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) *
vol->v_sectorsize;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL)
g_raid_md_ddf_start_disk(disk, vol);
}
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_ddf_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_ddf_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
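/*
* Handle a newly arrived disk: merge its metadata into the node-wide copy,
* create or update the volumes described by its configuration records
* (arming a start timer for volumes that are still incomplete), mark pure
* spares as such, and start any volume for which all member disks are now
* present.
*/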
static void
g_raid_md_ddf_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct g_raid_volume *vol;
struct ddf_meta *pdmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_vd_entry *vde;
int i, j, k, num, have, need, cnt, spare;
uint32_t val;
char buf[17];
sc = disk->d_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
spare = -1;
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, pdmeta);
else
ddf_meta_update(&mdi->mdio_meta, pdmeta);
num = GETCRNUM(pdmeta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(pdmeta, j);
val = GET32D(pdmeta, vdc->Signature);
if (val == DDF_SA_SIGNATURE && spare == -1)
spare = 1;
if (val != DDF_VDCR_SIGNATURE)
continue;
spare = 0;
k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID);
if (k < 0)
continue;
vde = &pdmeta->vdr->entry[k];
/* Look for volume with matching ID. */
vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID);
if (vol == NULL) {
ddf_meta_get_name(pdmeta, k, buf);
vol = g_raid_create_volume(sc, buf,
GET16D(pdmeta, vde->VD_Number));
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_ddf_go, vol);
mdi->mdio_starting++;
} else
pv = vol->v_md_data;
/* If we haven't started yet, check metadata freshness. */
vmeta = &pv->pv_meta;
ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started);
}
if (spare == 1) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_ddf_refill(sc);
}
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL)
continue;
if (pv->pv_started) {
if (g_raid_md_ddf_start_disk(disk, vol))
g_raid_md_write_ddf(md, vol, NULL, NULL);
continue;
}
/* If we have collected all the needed disks, start the array. */
need = 0;
have = 0;
for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) {
if (vmeta->bvdc[k] == NULL) {
need += GET16(vmeta, vdc->Primary_Element_Count);
continue;
}
cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count);
need += cnt;
for (i = 0; i < cnt; i++) {
val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]);
if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL)
have++;
}
}
G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks",
vol->v_name, have, need);
if (have == need)
g_raid_md_ddf_start(vol);
}
}
static int
g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct gctl_req *req, struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
struct g_raid_md_ddf_object *mdi, *mdi1;
char name[16];
const char *fmtopt;
int be = 1;
mdi = (struct g_raid_md_ddf_object *)md;
fmtopt = gctl_get_asciiparam(req, "fmtopt");
if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0)
be = 1;
else if (strcasecmp(fmtopt, "LE") == 0)
be = 0;
else {
gctl_error(req, "Incorrect fmtopt argument.");
return (G_RAID_MD_TASTE_FAIL);
}
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi1->mdio_bigendian != be)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
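/*
* Taste a provider: read and validate its DDF metadata, find or create a
* matching GEOM RAID node of the same endianness, attach a consumer, and
* feed the disk into the node.
*/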
static int
g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct ddf_meta meta;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct g_geom *geom;
int error, result, be;
char name[16];
G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name);
mdi = (struct g_raid_md_ddf_object *)md;
pp = cp->provider;
/* Read metadata from device. */
g_topology_unlock();
bzero(&meta, sizeof(meta));
error = ddf_meta_read(cp, &meta);
g_topology_lock();
if (error != 0)
return (G_RAID_MD_TASTE_FAIL);
be = meta.bigendian;
/* Metadata valid. Print it. */
g_raid_md_ddf_print(&meta);
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi->mdio_bigendian != be)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
pd->pd_meta = meta;
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_ddf_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_ddf_pervolume *pv;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_ddf_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_ddf(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD];
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_sa_record *sa;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize;
intmax_t *sizearg, *striparg;
int i, numdisks, len, level, qual;
int error;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_ddf_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
gctl_error(req, "No free partitions "
"on disk '%s'.",
diskname);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
ddf_meta_unused_range(&pd->pd_meta,
&offs[i], &esize);
offs[i] *= pp->sectorsize;
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
ddf_meta_create(disk, &mdi->mdio_meta);
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, GET64(&pd->pd_meta,
pdr->entry[0].Configured_Size) * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta);
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID4 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else if (level == G_RAID_VOLUME_RL_RAID5R) {
vol->v_mediasize = size * (numdisks - 1);
vol->v_rotate_parity = 1024;
} else if (level == G_RAID_VOLUME_RL_RAID6 ||
level == G_RAID_VOLUME_RL_RAID5E ||
level == G_RAID_VOLUME_RL_RAID5EE)
vol->v_mediasize = size * (numdisks - 2);
else if (level == G_RAID_VOLUME_RL_RAIDMDF) {
if (numdisks < 5)
vol->v_mdf_pdisks = 2;
else
vol->v_mdf_pdisks = 3;
vol->v_mdf_polynomial = 0x11d;
vol->v_mdf_method = 0x00;
vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks);
} else { /* RAID1E */
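/* RAID1E: two copies of every strip are spread over all disks, so the usable size is half the combined size, rounded down to a whole strip. */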
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = offs[i];
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (tmp != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_ddf_purge_disks(sc);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, "/dev/", 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_ddf(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being deleted and destroy it. */
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
ddf_meta_create(disk, &mdi->mdio_meta);
sa = ddf_meta_find_sa(&pd->pd_meta, 1);
if (sa != NULL) {
SET32D(&pd->pd_meta, sa->Signature,
DDF_SA_SIGNATURE);
SET8D(&pd->pd_meta, sa->Spare_Type, 0);
SET16D(&pd->pd_meta, sa->Populated_SAEs, 0);
SET16D(&pd->pd_meta, sa->MAX_SAE_Supported,
(GET16(&pd->pd_meta, hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize -
sizeof(struct ddf_sa_record)) /
sizeof(struct ddf_sa_entry));
}
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
g_raid_md_ddf_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *gmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_sa_record *sa;
uint64_t *val2;
int i, j, pos, bvd, size;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
gmeta = &mdi->mdio_meta;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/*
* Clear disk flags so that only the ones really needed get set again.
* Do it only if no volumes are in the starting state now, as those may
* still be updating disk statuses and we could wipe flags that are
* still in use.
*/
if (mdi->mdio_starting == 0) {
for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) &
~(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE));
if ((GET16(gmeta, pdr->entry[i].PD_State) &
DDF_PDE_PFA) == 0)
SET16(gmeta, pdr->entry[i].PD_State, 0);
}
}
/* Generate/update new per-volume metadata. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
if (vol->v_stopping || !pv->pv_started)
continue;
vmeta = &pv->pv_meta;
SET32(vmeta, vdc->Sequence_Number,
GET32(vmeta, vdc->Sequence_Number) + 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0)
SET16(vmeta, vdc->Primary_Element_Count, 2);
else
SET16(vmeta, vdc->Primary_Element_Count,
vol->v_disks_count);
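/* The DDF Stripe_Size field holds log2 of the strip size in blocks. */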
SET8(vmeta, vdc->Stripe_Size,
ffs(vol->v_strip_size / vol->v_sectorsize) - 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0) {
SET8(vmeta, vdc->Primary_RAID_Level,
DDF_VDCR_RAID1);
SET8(vmeta, vdc->RLQ, 0);
SET8(vmeta, vdc->Secondary_Element_Count,
vol->v_disks_count / 2);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
} else {
SET8(vmeta, vdc->Primary_RAID_Level,
vol->v_raid_level);
SET8(vmeta, vdc->RLQ,
vol->v_raid_level_qualifier);
SET8(vmeta, vdc->Secondary_Element_Count, 1);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
}
SET8(vmeta, vdc->Secondary_Element_Seq, 0);
SET64(vmeta, vdc->Block_Count, 0);
SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize);
SET16(vmeta, vdc->Block_Size, vol->v_sectorsize);
SET8(vmeta, vdc->Rotate_Parity_count,
fls(vol->v_rotate_parity) - 1);
SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks);
SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial,
vol->v_mdf_polynomial);
SET8(vmeta, vdc->MDF_Constant_Generation_Method,
vol->v_mdf_method);
SET16(vmeta, vde->VD_Number, vol->v_global_id);
if (vol->v_state <= G_RAID_VOLUME_S_BROKEN)
SET8(vmeta, vde->VD_State, DDF_VDE_FAILED);
else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED);
else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL)
SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL);
else
SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL);
if (vol->v_dirty ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0)
SET8(vmeta, vde->VD_State,
GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY);
SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX
ddf_meta_put_name(vmeta, vol->v_name);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
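/* Map the flat subdisk index onto (BVD number, position within that BVD). */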
bvd = i / GET16(vmeta, vdc->Primary_Element_Count);
pos = i % GET16(vmeta, vdc->Primary_Element_Count);
disk = sd->sd_disk;
if (disk != NULL) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (vmeta->bvdc[bvd] == NULL) {
size = GET16(vmeta,
hdr->Configuration_Record_Length) *
vmeta->sectorsize;
vmeta->bvdc[bvd] = malloc(size,
M_MD_DDF, M_WAITOK);
memset(vmeta->bvdc[bvd], 0xff, size);
}
memcpy(vmeta->bvdc[bvd], vmeta->vdc,
sizeof(struct ddf_vdc_record));
SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd);
SET64(vmeta, bvdc[bvd]->Block_Count,
sd->sd_size / vol->v_sectorsize);
SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos],
GET32(&pd->pd_meta, pdd->PD_Reference));
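/* Per-disk starting LBAs live in the 64-bit array that follows Physical_Disk_Sequence[] within the VDC record. */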
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
SET64P(vmeta, val2 + pos,
sd->sd_offset / vol->v_sectorsize);
}
if (vmeta->bvdc[bvd] == NULL)
continue;
j = ddf_meta_find_pd(gmeta, NULL,
GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]));
if (j < 0)
continue;
SET16(gmeta, pdr->entry[j].PD_Type,
GET16(gmeta, pdr->entry[j].PD_Type) |
DDF_PDE_PARTICIPATING);
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_MISSING));
else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD)
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_REBUILD);
else
SET16(gmeta, pdr->entry[j].PD_State,
GET16(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_ONLINE);
}
}
/* Mark spare and failed disks as such. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
i = ddf_meta_find_pd(gmeta, NULL,
GET32(&pd->pd_meta, pdd->PD_Reference));
if (i < 0)
continue;
if (disk->d_state == G_RAID_DISK_S_FAILED) {
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
}
if (disk->d_state != G_RAID_DISK_S_SPARE)
continue;
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa == NULL ||
(GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_DEDICATED) == 0) {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_GLOBAL_SPARE);
} else {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_CONFIG_SPARE);
}
SET16(gmeta, pdr->entry[i].PD_State,
GET16(gmeta, pdr->entry[i].PD_State) |
DDF_PDE_ONLINE);
}
/* Remove disks without "participating" flag (unused). */
for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
if ((GET16(gmeta, pdr->entry[i].PD_Type) &
(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 ||
g_raid_md_ddf_get_disk(sc,
NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL)
j = i;
else
memset(&gmeta->pdr->entry[i], 0xff,
sizeof(struct ddf_pd_entry));
}
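/* j now holds the index of the last surviving PD entry (-1 if none survived). */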
SET16(gmeta, pdr->Populated_PDEs, j + 1);
/* Update per-disk metadata and write them. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_SPARE)
continue;
/* Update PDR. */
memcpy(pd->pd_meta.pdr, gmeta->pdr,
GET32(&pd->pd_meta, hdr->pdr_length) *
pd->pd_meta.sectorsize);
/* Update VDR. */
SET16(&pd->pd_meta, vdr->Populated_VDEs, 0);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
i = ddf_meta_find_vd(&pd->pd_meta,
pv->pv_meta.vde->VD_GUID);
if (i < 0)
i = ddf_meta_find_vd(&pd->pd_meta, NULL);
if (i >= 0)
memcpy(&pd->pd_meta.vdr->entry[i],
pv->pv_meta.vde,
sizeof(struct ddf_vd_entry));
}
/* Update VDC. */
if (mdi->mdio_starting == 0) {
/* Remove all VDCs; the ones still needed are restored below. */
j = GETCRNUM(&pd->pd_meta);
for (i = 0; i < j; i++) {
vdc = GETVDCPTR(&pd->pd_meta, i);
if (GET32D(&pd->pd_meta, vdc->Signature) !=
DDF_VDCR_SIGNATURE)
continue;
SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff);
}
}
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
vol = sd->sd_volume;
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
vmeta = &pv->pv_meta;
vdc = ddf_meta_find_vdc(&pd->pd_meta,
vmeta->vde->VD_GUID);
if (vdc == NULL)
vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL);
if (vdc != NULL) {
bvd = sd->sd_pos / GET16(vmeta,
vdc->Primary_Element_Count);
memcpy(vdc, vmeta->bvdc[bvd],
GET16(&pd->pd_meta,
hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize);
}
}
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(disk));
g_raid_md_ddf_print(&pd->pd_meta);
ddf_meta_write(disk->d_consumer, &pd->pd_meta);
}
return (0);
}
static int
g_raid_md_fail_disk_ddf(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_subdisk *sd;
int i;
sc = md->mdo_softc;
pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(tdisk));
i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference));
SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA);
if (tdisk->d_consumer != NULL)
ddf_meta_write(tdisk->d_consumer, &pd->pd_meta);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, tdisk);
g_raid_md_ddf_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_ddf_perdisk *pd;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
ddf_meta_free(&pd->pd_meta);
free(pd, M_MD_DDF);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_ddf_object *mdi;
struct g_raid_md_ddf_pervolume *pv;
mdi = (struct g_raid_md_ddf_object *)md;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
ddf_vol_meta_free(&pv->pv_meta);
if (!pv->pv_started) {
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_DDF);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_ddf(struct g_raid_md_object *md)
{
struct g_raid_md_ddf_object *mdi;
mdi = (struct g_raid_md_ddf_object *)md;
if (!mdi->mdio_started) {
mdi->mdio_started = 0;
callout_stop(&mdi->mdio_start_co);
G_RAID_DEBUG1(1, md->mdo_softc,
"root_mount_rel %p", mdi->mdio_rootmount);
root_mount_rel(mdi->mdio_rootmount);
mdi->mdio_rootmount = NULL;
}
ddf_meta_free(&mdi->mdio_meta);
return (0);
}
G_RAID_MD_DECLARE(ddf, "DDF");
Index: head/sys/geom/raid/md_promise.c
===================================================================
--- head/sys/geom/raid/md_promise.c (revision 327172)
+++ head/sys/geom/raid/md_promise.c (revision 327173)
@@ -1,2008 +1,2007 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
#define PROMISE_MAX_DISKS 8
#define PROMISE_MAX_SUBDISKS 2
#define PROMISE_META_OFFSET 14
struct promise_raid_disk {
uint8_t flags; /* Subdisk status. */
#define PROMISE_F_VALID 0x01
#define PROMISE_F_ONLINE 0x02
#define PROMISE_F_ASSIGNED 0x04
#define PROMISE_F_SPARE 0x08
#define PROMISE_F_DUPLICATE 0x10
#define PROMISE_F_REDIR 0x20
#define PROMISE_F_DOWN 0x40
#define PROMISE_F_READY 0x80
uint8_t number; /* Position in a volume. */
uint8_t channel; /* ATA channel number. */
uint8_t device; /* ATA device number. */
uint64_t id __packed; /* Subdisk ID. */
} __packed;
struct promise_raid_conf {
char promise_id[24];
#define PROMISE_MAGIC "Promise Technology, Inc."
#define FREEBSD_MAGIC "FreeBSD ATA driver RAID "
uint32_t dummy_0;
uint64_t magic_0;
#define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \
((uint64_t)(x.device != 0) << 56))
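/* PROMISE_MAGIC0() packs the ATA channel into bits 48-55 and a device-present flag into bit 56 of magic_0. */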
uint16_t magic_1;
uint32_t magic_2;
uint8_t filler1[470];
uint32_t integrity;
#define PROMISE_I_VALID 0x00000080
struct promise_raid_disk disk; /* This subdisk info. */
uint32_t disk_offset; /* Subdisk offset. */
uint32_t disk_sectors; /* Subdisk size. */
uint32_t disk_rebuild; /* Rebuild position. */
uint16_t generation; /* Generation number. */
uint8_t status; /* Volume status. */
#define PROMISE_S_VALID 0x01
#define PROMISE_S_ONLINE 0x02
#define PROMISE_S_INITED 0x04
#define PROMISE_S_READY 0x08
#define PROMISE_S_DEGRADED 0x10
#define PROMISE_S_MARKED 0x20
#define PROMISE_S_MIGRATING 0x40
#define PROMISE_S_FUNCTIONAL 0x80
uint8_t type; /* Volume type. */
#define PROMISE_T_RAID0 0x00
#define PROMISE_T_RAID1 0x01
#define PROMISE_T_RAID3 0x02
#define PROMISE_T_RAID5 0x04
#define PROMISE_T_SPAN 0x08
#define PROMISE_T_JBOD 0x10
uint8_t total_disks; /* Disks in this volume. */
uint8_t stripe_shift; /* Strip size. */
uint8_t array_width; /* Number of RAID0 stripes. */
uint8_t array_number; /* Global volume number. */
uint32_t total_sectors; /* Volume size. */
uint16_t cylinders; /* Volume geometry: C. */
uint8_t heads; /* Volume geometry: H. */
uint8_t sectors; /* Volume geometry: S. */
uint64_t volume_id __packed; /* Volume ID. */
struct promise_raid_disk disks[PROMISE_MAX_DISKS];
/* Subdisks in this volume. */
char name[32]; /* Volume label. */
uint32_t filler2[8];
uint32_t magic_3; /* Something related to rebuild. */
uint64_t rebuild_lba64; /* Per-volume rebuild position. */
uint32_t magic_4;
uint32_t magic_5;
uint32_t total_sectors_high;
uint8_t magic_6;
uint8_t sector_size;
uint16_t magic_7;
uint32_t magic_8[31];
uint32_t backup_time;
uint16_t magic_9;
uint32_t disk_offset_high;
uint32_t disk_sectors_high;
uint32_t disk_rebuild_high;
uint16_t magic_10;
uint32_t magic_11[3];
uint32_t filler3[284];
uint32_t checksum;
} __packed;
struct g_raid_md_promise_perdisk {
int pd_updated;
int pd_subdisks;
struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS];
};
struct g_raid_md_promise_pervolume {
struct promise_raid_conf *pv_meta;
uint64_t pv_id;
uint16_t pv_generation;
int pv_disks_present;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
static g_raid_md_create_t g_raid_md_create_promise;
static g_raid_md_taste_t g_raid_md_taste_promise;
static g_raid_md_event_t g_raid_md_event_promise;
static g_raid_md_volume_event_t g_raid_md_volume_event_promise;
static g_raid_md_ctl_t g_raid_md_ctl_promise;
static g_raid_md_write_t g_raid_md_write_promise;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise;
static g_raid_md_free_disk_t g_raid_md_free_disk_promise;
static g_raid_md_free_volume_t g_raid_md_free_volume_promise;
static g_raid_md_free_t g_raid_md_free_promise;
static kobj_method_t g_raid_md_promise_methods[] = {
KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_promise_class = {
"Promise",
g_raid_md_promise_methods,
sizeof(struct g_raid_md_object),
.mdc_enable = 1,
.mdc_priority = 100
};
static void
g_raid_md_promise_print(struct promise_raid_conf *meta)
{
int i;
if (g_raid_debug < 1)
return;
printf("********* ATA Promise Metadata *********\n");
printf("promise_id <%.24s>\n", meta->promise_id);
printf("disk %02x %02x %02x %02x %016jx\n",
meta->disk.flags, meta->disk.number, meta->disk.channel,
meta->disk.device, meta->disk.id);
printf("disk_offset %u\n", meta->disk_offset);
printf("disk_sectors %u\n", meta->disk_sectors);
printf("disk_rebuild %u\n", meta->disk_rebuild);
printf("generation %u\n", meta->generation);
printf("status 0x%02x\n", meta->status);
printf("type %u\n", meta->type);
printf("total_disks %u\n", meta->total_disks);
printf("stripe_shift %u\n", meta->stripe_shift);
printf("array_width %u\n", meta->array_width);
printf("array_number %u\n", meta->array_number);
printf("total_sectors %u\n", meta->total_sectors);
printf("cylinders %u\n", meta->cylinders);
printf("heads %u\n", meta->heads);
printf("sectors %u\n", meta->sectors);
printf("volume_id 0x%016jx\n", meta->volume_id);
printf("disks:\n");
for (i = 0; i < PROMISE_MAX_DISKS; i++ ) {
printf(" %02x %02x %02x %02x %016jx\n",
meta->disks[i].flags, meta->disks[i].number,
meta->disks[i].channel, meta->disks[i].device,
meta->disks[i].id);
}
printf("name <%.32s>\n", meta->name);
printf("magic_3 0x%08x\n", meta->magic_3);
printf("rebuild_lba64 %ju\n", meta->rebuild_lba64);
printf("magic_4 0x%08x\n", meta->magic_4);
printf("magic_5 0x%08x\n", meta->magic_5);
printf("total_sectors_high 0x%08x\n", meta->total_sectors_high);
printf("sector_size %u\n", meta->sector_size);
printf("backup_time %d\n", meta->backup_time);
printf("disk_offset_high 0x%08x\n", meta->disk_offset_high);
printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high);
printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high);
printf("=================================================\n");
}
static struct promise_raid_conf *
promise_meta_copy(struct promise_raid_conf *meta)
{
struct promise_raid_conf *nmeta;
nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
memcpy(nmeta, meta, sizeof(*nmeta));
return (nmeta);
}
static int
promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
{
int pos;
for (pos = 0; pos < meta->total_disks; pos++) {
if (meta->disks[pos].id == id)
return (pos);
}
return (-1);
}
static int
promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
off_t sectors, off_t *off, off_t *size)
{
off_t coff, csize, tmp;
int i, j;
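/* Exclude the trailing 131072 sectors from the usable range; Promise keeps its metadata and reserved area at the end of the disk. */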
sectors -= 131072;
*off = 0;
*size = 0;
coff = 0;
csize = sectors;
i = 0;
while (1) {
for (j = 0; j < nsd; j++) {
tmp = ((off_t)metaarr[j]->disk_offset_high << 32) +
metaarr[j]->disk_offset;
if (tmp >= coff)
csize = MIN(csize, tmp - coff);
}
if (csize > *size) {
*off = coff;
*size = csize;
}
if (i >= nsd)
break;
coff = ((off_t)metaarr[i]->disk_offset_high << 32) +
metaarr[i]->disk_offset +
((off_t)metaarr[i]->disk_sectors_high << 32) +
metaarr[i]->disk_sectors;
csize = sectors - coff;
i++;
}
return ((*size > 0) ? 1 : 0);
}
static int
promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
{
int disk_pos, width;
if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
width = vol->v_disks_count / 2;
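/* Promise metadata and GEOM order RAID1E (RAID0+1) members differently; transpose the 2 x width member grid to convert between the two layouts. */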
disk_pos = (md_disk_pos / width) +
(md_disk_pos % width) * width;
} else
disk_pos = md_disk_pos;
return (disk_pos);
}
static void
promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
{
int i;
strncpy(buf, meta->name, 32);
buf[32] = 0;
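/* Trim trailing spaces and other padding characters (anything <= 0x20). */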
for (i = 31; i >= 0; i--) {
if (buf[i] > 0x20)
break;
buf[i] = 0;
}
}
static void
promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
{
memset(meta->name, 0x20, 32);
memcpy(meta->name, buf, MIN(strlen(buf), 32));
}
static int
promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
int error, i, subdisks;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisks = 0;
if (pp->sectorsize * 4 > MAXPHYS) {
G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name);
return (subdisks);
}
next:
/* Read metadata block. */
buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisks * PROMISE_META_OFFSET),
pp->sectorsize * 4, &error);
if (buf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (subdisks);
}
meta = (struct promise_raid_conf *)buf;
/* Check if this is a Promise RAID struct. */
if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
if (subdisks == 0)
G_RAID_DEBUG(1,
"Promise signature check failed on %s", pp->name);
g_free(buf);
return (subdisks);
}
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
g_free(buf);
/* Check metadata checksum. */
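/* The checksum is the 32-bit sum of the first 511 words of the block; the stored checksum occupies the final word. */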
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
if (checksum != meta->checksum) {
G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if ((meta->integrity & PROMISE_I_VALID) == 0) {
G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
free(meta, M_MD_PROMISE);
return (subdisks);
}
if (meta->total_disks > PROMISE_MAX_DISKS) {
G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
pp->name, meta->total_disks);
free(meta, M_MD_PROMISE);
return (subdisks);
}
/* Remove filler garbage from fields used in newer metadata. */
if (meta->disk_offset_high == 0x8b8c8d8e &&
meta->disk_sectors_high == 0x8788898a &&
meta->disk_rebuild_high == 0x83848586) {
meta->disk_offset_high = 0;
meta->disk_sectors_high = 0;
if (meta->disk_rebuild == UINT32_MAX)
meta->disk_rebuild_high = UINT32_MAX;
else
meta->disk_rebuild_high = 0;
if (meta->total_sectors_high == 0x15161718) {
meta->total_sectors_high = 0;
meta->backup_time = 0;
if (meta->rebuild_lba64 == 0x2122232425262728)
meta->rebuild_lba64 = UINT64_MAX;
}
}
if (meta->sector_size < 1 || meta->sector_size > 8)
meta->sector_size = 1;
/* Save this part and look for next. */
*metaarr = meta;
metaarr++;
subdisks++;
if (subdisks < PROMISE_MAX_SUBDISKS)
goto next;
return (subdisks);
}
static int
promise_meta_write(struct g_consumer *cp,
struct promise_raid_conf **metaarr, int nsd)
{
struct g_provider *pp;
struct promise_raid_conf *meta;
char *buf;
off_t off, size;
int error, i, subdisk, fake;
uint32_t checksum, *ptr;
pp = cp->provider;
subdisk = 0;
fake = 0;
next:
buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
meta = NULL;
if (subdisk < nsd) {
meta = metaarr[subdisk];
} else if (!fake && promise_meta_unused_range(metaarr, nsd,
cp->provider->mediasize / cp->provider->sectorsize,
&off, &size)) {
/* Optionally add record for unused space. */
meta = (struct promise_raid_conf *)buf;
memcpy(&meta->promise_id[0], PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
meta->disk_offset_high = off >> 32;
meta->disk_offset = (uint32_t)off;
meta->disk_sectors_high = size >> 32;
meta->disk_sectors = (uint32_t)size;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
fake = 1;
}
if (meta != NULL) {
/* Recalculate the checksum in case the metadata was changed. */
meta->checksum = 0;
for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
checksum += *ptr++;
meta->checksum = checksum;
memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta)));
}
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, pp->sectorsize * 4);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_PROMISE);
subdisk++;
if (subdisk < PROMISE_MAX_SUBDISKS)
goto next;
return (error);
}
static int
promise_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error, subdisk;
pp = cp->provider;
buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO);
for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) {
error = g_write_data(cp, pp->mediasize - pp->sectorsize *
(63 - subdisk * PROMISE_META_OFFSET),
buf, 4 * pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
}
free(buf, M_MD_PROMISE);
return (error);
}
static int
promise_meta_write_spare(struct g_consumer *cp)
{
struct promise_raid_conf *meta;
off_t tmp;
int error;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID;
meta->disk.number = 0xff;
arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072;
meta->disk_sectors_high = tmp >> 32;
meta->disk_sectors = (uint32_t)tmp;
meta->disk_rebuild_high = UINT32_MAX;
meta->disk_rebuild = UINT32_MAX;
error = promise_meta_write(cp, &meta, 1);
free(meta, M_MD_PROMISE);
return (error);
}
static struct g_raid_volume *
g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id)
{
struct g_raid_volume *vol;
struct g_raid_md_promise_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (pv->pv_id == id)
break;
}
return (vol);
}
static int
g_raid_md_promise_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
struct g_raid_md_promise_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_purge_disks(struct g_raid_softc *sc)
{
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_promise_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_promise_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_PROMISE);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left, erase and delete the disk. */
if (pd->pd_subdisks == 0) {
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
}
static int
g_raid_md_promise_supported(int level, int qual, int disks, int force)
{
if (disks > PROMISE_MAX_DISKS)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (!force && (disks != 2))
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (disks < 2)
return (0);
if (disks % 2 != 0)
return (0);
if (!force && (disks != 4))
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (disks < 3)
return (0);
if (qual != G_RAID_VOLUME_RLQ_R5LA)
return (0);
break;
default:
return (0);
}
if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
return (1);
}
static int
g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t eoff, esize, size;
int disk_pos, md_disk_pos, i, resurrection = 0;
sc = disk->d_softc;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
pv = vol->v_md_data;
meta = pv->pv_meta;
if (sdn >= 0) {
/* Find disk position in metadata by its serial. */
md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
/* For RAID0+1 we need to translate order. */
disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
} else {
md_disk_pos = -1;
disk_pos = -1;
}
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* A failed, stale disk is useless to us. */
if (sdn >= 0 &&
pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If we were given specific metadata subdisk - erase it. */
if (sdn >= 0) {
free(pd->pd_meta[sdn], M_MD_PROMISE);
for (i = sdn; i < pd->pd_subdisks - 1; i++)
pd->pd_meta[i] = pd->pd_meta[i + 1];
pd->pd_meta[pd->pd_subdisks - 1] = NULL;
pd->pd_subdisks--;
}
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started, try to make use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks,
disk->d_consumer->provider->mediasize /
disk->d_consumer->provider->sectorsize,
&eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
(off_t)esize * 512 < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk),
(off_t)esize * 512, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size / 512;
/* For RAID0+1 we need to translate order. */
md_disk_pos = promise_meta_translate_disk(vol, disk_pos);
} else {
nofit:
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk,
G_RAID_DISK_S_SPARE);
}
return (0);
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = (off_t)eoff * 512;
sd->sd_size = (off_t)esize * 512;
} else {
sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high
<< 32) + pd->pd_meta[sdn]->disk_offset) * 512;
sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high
<< 32) + pd->pd_meta[sdn]->disk_sectors) * 512;
}
if (resurrection) {
/* Stale disk, almost the same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
if (pd->pd_meta[sdn]->generation != meta->generation)
sd->sd_rebuild_pos = 0;
else {
sd->sd_rebuild_pos =
(((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) +
pd->pd_meta[sdn]->disk_rebuild) * 512;
}
} else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (pd->pd_meta[sdn]->generation != meta->generation ||
(meta->status & PROMISE_S_MARKED)) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
static void
g_raid_md_promise_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) {
update =
g_raid_md_promise_start_disk(disk, -1, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_promise(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
static void
g_raid_md_promise_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
u_int i;
sc = vol->v_softc;
md = sc->sc_md;
pv = vol->v_md_data;
meta = pv->pv_meta;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
if (meta->type == PROMISE_T_RAID0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
else if (meta->type == PROMISE_T_RAID1) {
if (meta->array_width == 1)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
else
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
} else if (meta->type == PROMISE_T_RAID3)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID3;
else if (meta->type == PROMISE_T_RAID5) {
vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
} else if (meta->type == PROMISE_T_SPAN)
vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT;
else if (meta->type == PROMISE_T_JBOD)
vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE;
else
vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ
vol->v_disks_count = meta->total_disks;
vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ
if (meta->total_sectors_high < 256) /* If value looks sane. */
vol->v_mediasize +=
((off_t)meta->total_sectors_high << 32) * 512; //ZZZ
vol->v_sectorsize = 512 * meta->sector_size;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
sd->sd_offset = (((off_t)meta->disk_offset_high << 32) +
meta->disk_offset) * 512;
sd->sd_size = (((off_t)meta->disk_sectors_high << 32) +
meta->disk_sectors) * 512;
}
g_raid_start_volume(vol);
/* Make all disks found so far take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i]->volume_id == meta->volume_id)
g_raid_md_promise_start_disk(disk, i, vol);
}
}
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_promise_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_promise_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
static void
g_raid_md_promise_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct promise_raid_conf *pdmeta;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_raid_volume *vol;
int i;
char buf[33];
sc = disk->d_softc;
md = sc->sc_md;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (pd->pd_subdisks == 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_promise_refill(sc);
return;
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL) {
promise_meta_get_name(pdmeta, buf);
vol = g_raid_create_volume(sc, buf, pdmeta->array_number);
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
pv->pv_id = pdmeta->volume_id;
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_promise_go, vol);
} else
pv = vol->v_md_data;
/* If we haven't started yet - check metadata freshness. */
if (pv->pv_meta == NULL || !pv->pv_started) {
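/* The generation difference below is evaluated as a signed 16-bit value so that counter wrap-around compares correctly. */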
if (pv->pv_meta == NULL ||
((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) {
G_RAID_DEBUG1(1, sc, "Newer disk");
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = promise_meta_copy(pdmeta);
pv->pv_generation = pv->pv_meta->generation;
pv->pv_disks_present = 1;
} else if (pdmeta->generation == pv->pv_generation) {
pv->pv_disks_present++;
G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
pv->pv_disks_present,
pv->pv_meta->total_disks);
} else {
G_RAID_DEBUG1(1, sc, "Older disk");
}
}
}
for (i = 0; i < pd->pd_subdisks; i++) {
pdmeta = pd->pd_meta[i];
/* Look for volume with matching ID. */
vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id);
if (vol == NULL)
continue;
pv = vol->v_md_data;
if (pv->pv_started) {
if (g_raid_md_promise_start_disk(disk, i, vol))
g_raid_md_write_promise(md, vol, NULL, NULL);
} else {
/* If we collected all needed disks - start array. */
if (pv->pv_disks_present == pv->pv_meta->total_disks)
g_raid_md_promise_start(vol);
}
}
}
static int
g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
sc = g_raid_create_node(mp, "Promise", md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
static int
g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
- struct promise_raid_conf *meta, *metaarr[4];
+ struct promise_raid_conf *metaarr[4];
struct g_raid_md_promise_perdisk *pd;
struct g_geom *geom;
int i, j, result, len, subdisks;
char name[16];
uint16_t vendor;
G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
pp = cp->provider;
/* Read metadata from device. */
- meta = NULL;
g_topology_unlock();
vendor = 0xffff;
len = sizeof(vendor);
if (pp->geom->rank == 1)
g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
subdisks = promise_meta_read(cp, metaarr);
g_topology_lock();
if (subdisks == 0) {
if (g_raid_aggressive_spare) {
if (vendor == 0x105a || vendor == 0x1002) {
G_RAID_DEBUG(1,
"No Promise metadata, forcing spare.");
goto search;
} else {
G_RAID_DEBUG(1,
"Promise/ATI vendor mismatch "
"0x%04x != 0x105a/0x1002",
vendor);
}
}
return (G_RAID_MD_TASTE_FAIL);
}
/* Metadata valid. Print it. */
for (i = 0; i < subdisks; i++)
g_raid_md_promise_print(metaarr[i]);
/* Purge meaningless (empty/spare) records. */
for (i = 0; i < subdisks; ) {
if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
i++;
continue;
}
free(metaarr[i], M_MD_PROMISE);
for (j = i; j < subdisks - 1; j++)
metaarr[j] = metaarr[j + 1];
metaarr[subdisks - 1] = NULL;
subdisks--;
}
search:
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* No matching node found -- create one. */
result = G_RAID_MD_TASTE_NEW;
snprintf(name, sizeof(name), "Promise");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
/* There is no return after this point, so we close the passed consumer. */
g_access(cp, -1, 0, 0);
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
pd->pd_subdisks = subdisks;
for (i = 0; i < subdisks; i++)
pd->pd_meta[i] = metaarr[i];
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_promise_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_promise_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_promise(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS];
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip;
intmax_t *sizearg, *striparg;
int numdisks, i, len, level, qual;
int error;
sc = md->mdo_softc;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (strcasecmp(levelname, "RAID5") == 0)
levelname = "RAID5-LA";
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_promise_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) {
gctl_error(req, "Disk '%s' already "
"used by %d volumes.",
diskname, pd->pd_subdisks);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
promise_meta_unused_range(pd->pd_meta,
pd->pd_subdisks,
pp->mediasize / pp->sectorsize,
&offs[i], &esize);
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO);
arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0);
pv->pv_generation = 0;
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else { /* RAID1E */
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = (off_t)offs[i] * 512;
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_promise(md, vol, NULL, NULL);
/* Pick up any STALE/SPARE disks to refill the array if needed. */
g_raid_md_promise_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy the specified volume. If it was the last one, destroy the whole node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (verb != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_promise_purge_disks(sc);
g_raid_md_write_promise(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
promise_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, "/dev/", 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_promise(md, NULL, disk);
continue;
}
/* Erase metadata on the disk being deleted and destroy it. */
promise_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_promise_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_promise_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
promise_meta_write_spare(cp);
g_raid_md_promise_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_md_promise_pervolume *pv;
struct promise_raid_conf *meta;
off_t rebuild_lba64;
int i, j, pos, rebuild;
sc = md->mdo_softc;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/* Generate new per-volume metadata for affected volumes. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
/* Skip volumes not related to specified targets. */
if (tvol != NULL && vol != tvol)
continue;
if (tsd != NULL && vol != tsd->sd_volume)
continue;
if (tdisk != NULL) {
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_disk == tdisk)
break;
}
if (i >= vol->v_disks_count)
continue;
}
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
pv->pv_generation++;
meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO);
if (pv->pv_meta != NULL)
memcpy(meta, pv->pv_meta, sizeof(*meta));
memcpy(meta->promise_id, PROMISE_MAGIC,
sizeof(PROMISE_MAGIC) - 1);
meta->dummy_0 = 0x00020000;
meta->integrity = PROMISE_I_VALID;
meta->generation = pv->pv_generation;
meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE |
PROMISE_S_INITED | PROMISE_S_READY;
if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
meta->status |= PROMISE_S_DEGRADED;
if (vol->v_dirty)
meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 ||
vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE)
meta->type = PROMISE_T_RAID0;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->type = PROMISE_T_RAID1;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3)
meta->type = PROMISE_T_RAID3;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
meta->type = PROMISE_T_RAID5;
else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT)
meta->type = PROMISE_T_SPAN;
else
meta->type = PROMISE_T_JBOD;
meta->total_disks = vol->v_disks_count;
meta->stripe_shift = ffs(vol->v_strip_size / 1024);
meta->array_width = vol->v_disks_count;
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
meta->array_width /= 2;
meta->array_number = vol->v_global_id;
meta->total_sectors = vol->v_mediasize / 512;
meta->total_sectors_high = (vol->v_mediasize / 512) >> 32;
meta->sector_size = vol->v_sectorsize / 512;
meta->cylinders = meta->total_sectors / (255 * 63) - 1;
meta->heads = 254;
meta->sectors = 63;
meta->volume_id = pv->pv_id;
rebuild_lba64 = UINT64_MAX;
rebuild = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
meta->disks[pos].flags = PROMISE_F_VALID |
PROMISE_F_ASSIGNED;
if (sd->sd_state == G_RAID_SUBDISK_S_NONE) {
meta->disks[pos].flags |= 0;
} else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) {
meta->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
} else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) {
meta->disks[pos].flags |=
PROMISE_F_ONLINE | PROMISE_F_REDIR;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
rebuild = 1;
} else {
meta->disks[pos].flags |= PROMISE_F_ONLINE;
if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) {
meta->status |= PROMISE_S_MARKED;
if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
rebuild_lba64 = MIN(rebuild_lba64,
sd->sd_rebuild_pos / 512);
} else
rebuild_lba64 = 0;
}
}
if (pv->pv_meta != NULL) {
meta->disks[pos].id = pv->pv_meta->disks[pos].id;
} else {
meta->disks[pos].number = i * 2;
arc4rand(&meta->disks[pos].id,
sizeof(meta->disks[pos].id), 0);
}
}
promise_meta_put_name(meta, vol->v_name);
/* Try to mimic AMD BIOS rebuild/resync behavior. */
if (rebuild_lba64 != UINT64_MAX) {
if (rebuild)
meta->magic_3 = 0x03040010UL; /* Rebuild? */
else
meta->magic_3 = 0x03040008UL; /* Resync? */
/* Translate from per-disk to per-volume LBA. */
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
rebuild_lba64 *= meta->array_width;
} else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) {
rebuild_lba64 *= meta->array_width - 1;
} else
rebuild_lba64 = 0;
} else
meta->magic_3 = 0x03000000UL;
meta->rebuild_lba64 = rebuild_lba64;
meta->magic_4 = 0x04010101UL;
/* Replace per-volume metadata with new. */
if (pv->pv_meta != NULL)
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = meta;
/* Copy new metadata to the disks, adding or replacing old. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
disk = sd->sd_disk;
if (disk == NULL)
continue;
/* For RAID0+1 we need to translate order. */
pos = promise_meta_translate_disk(vol, i);
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (j = 0; j < pd->pd_subdisks; j++) {
if (pd->pd_meta[j]->volume_id == meta->volume_id)
break;
}
if (j == pd->pd_subdisks)
pd->pd_subdisks++;
if (pd->pd_meta[j] != NULL)
free(pd->pd_meta[j], M_MD_PROMISE);
pd->pd_meta[j] = promise_meta_copy(meta);
pd->pd_meta[j]->disk = meta->disks[pos];
pd->pd_meta[j]->disk.number = pos;
pd->pd_meta[j]->disk_offset_high =
(sd->sd_offset / 512) >> 32;
pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
pd->pd_meta[j]->disk_sectors_high =
(sd->sd_size / 512) >> 32;
pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high =
(sd->sd_rebuild_pos / 512) >> 32;
pd->pd_meta[j]->disk_rebuild =
sd->sd_rebuild_pos / 512;
} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) {
pd->pd_meta[j]->disk_rebuild_high = 0;
pd->pd_meta[j]->disk_rebuild = 0;
} else {
pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX;
pd->pd_meta[j]->disk_rebuild = UINT32_MAX;
}
pd->pd_updated = 1;
}
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE)
continue;
if (!pd->pd_updated)
continue;
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(disk));
for (i = 0; i < pd->pd_subdisks; i++)
g_raid_md_promise_print(pd->pd_meta[i]);
promise_meta_write(disk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
pd->pd_updated = 0;
}
return (0);
}
static int
g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_promise_perdisk *pd;
struct g_raid_subdisk *sd;
int i, pos;
sc = md->mdo_softc;
pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
/* We can't fail a disk that is not currently part of the array. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent its later resurrection as STALE.
*/
if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
G_RAID_DEBUG(1, "Writing Promise metadata to %s",
g_raid_get_diskname(tdisk));
for (i = 0; i < pd->pd_subdisks; i++) {
pd->pd_meta[i]->disk.flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
pos = pd->pd_meta[i]->disk.number;
if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
pd->pd_meta[i]->disks[pos].flags |=
PROMISE_F_DOWN | PROMISE_F_REDIR;
}
g_raid_md_promise_print(pd->pd_meta[i]);
}
if (tdisk->d_consumer != NULL)
promise_meta_write(tdisk->d_consumer,
pd->pd_meta, pd->pd_subdisks);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_promise(md, NULL, NULL, tdisk);
g_raid_md_promise_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_promise(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_promise_perdisk *pd;
int i;
pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
for (i = 0; i < pd->pd_subdisks; i++) {
if (pd->pd_meta[i] != NULL) {
free(pd->pd_meta[i], M_MD_PROMISE);
pd->pd_meta[i] = NULL;
}
}
free(pd, M_MD_PROMISE);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_promise(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_promise_pervolume *pv;
pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data;
if (pv && pv->pv_meta != NULL) {
free(pv->pv_meta, M_MD_PROMISE);
pv->pv_meta = NULL;
}
if (pv && !pv->pv_started) {
pv->pv_started = 1;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_PROMISE);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_promise(struct g_raid_md_object *md)
{
return (0);
}
G_RAID_MD_DECLARE(promise, "Promise");
Index: head/sys/geom/raid/tr_raid5.c
===================================================================
--- head/sys/geom/raid/tr_raid5.c (revision 327172)
+++ head/sys/geom/raid/tr_raid5.c (revision 327173)
@@ -1,423 +1,421 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
static MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data");
#define TR_RAID5_NONE 0
#define TR_RAID5_REBUILD 1
#define TR_RAID5_RESYNC 2
#define TR_RAID5_F_DOING_SOME 0x1
#define TR_RAID5_F_LOCKED 0x2
#define TR_RAID5_F_ABORT 0x4
struct g_raid_tr_raid5_object {
struct g_raid_tr_object trso_base;
int trso_starting;
int trso_stopping;
int trso_type;
int trso_recover_slabs; /* slabs before rest */
int trso_fair_io;
int trso_meta_update;
int trso_flags;
struct g_raid_subdisk *trso_failed_sd; /* like per volume */
void *trso_buffer; /* Buffer space */
struct bio trso_bio;
};
static g_raid_tr_taste_t g_raid_tr_taste_raid5;
static g_raid_tr_event_t g_raid_tr_event_raid5;
static g_raid_tr_start_t g_raid_tr_start_raid5;
static g_raid_tr_stop_t g_raid_tr_stop_raid5;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid5;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid5;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5;
static g_raid_tr_locked_t g_raid_tr_locked_raid5;
static g_raid_tr_free_t g_raid_tr_free_raid5;
static kobj_method_t g_raid_tr_raid5_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid5),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid5),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid5),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid5),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid5),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid5),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5),
KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid5),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid5),
{ 0, 0 }
};
static struct g_raid_tr_class g_raid_tr_raid5_class = {
"RAID5",
g_raid_tr_raid5_methods,
sizeof(struct g_raid_tr_raid5_object),
.trc_enable = 1,
.trc_priority = 100
};
static int
g_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
struct g_raid_tr_raid5_object *trs;
u_int qual;
trs = (struct g_raid_tr_raid5_object *)tr;
qual = tr->tro_volume->v_raid_level_qualifier;
if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 &&
(qual == G_RAID_VOLUME_RLQ_R4P0 ||
qual == G_RAID_VOLUME_RLQ_R4PN)) {
/* RAID4 */
} else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 ||
tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) &&
(qual == G_RAID_VOLUME_RLQ_R5RA ||
qual == G_RAID_VOLUME_RLQ_R5RS ||
qual == G_RAID_VOLUME_RLQ_R5LA ||
qual == G_RAID_VOLUME_RLQ_R5LS)) {
/* RAID5/5E/5EE/5R/6/MDF */
} else
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
static int
g_raid_tr_update_state_raid5(struct g_raid_volume *vol,
struct g_raid_subdisk *sd)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_softc *sc;
u_int s;
int na, ns, nu;
sc = vol->v_softc;
trs = (struct g_raid_tr_raid5_object *)vol->v_tr;
if (trs->trso_stopping &&
(trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0)
s = G_RAID_VOLUME_S_STOPPED;
else if (trs->trso_starting)
s = G_RAID_VOLUME_S_STARTING;
else {
na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
nu = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
if (na == vol->v_disks_count)
s = G_RAID_VOLUME_S_OPTIMAL;
else if (na + ns == vol->v_disks_count ||
na + ns + nu == vol->v_disks_count /* XXX: Temporary. */)
s = G_RAID_VOLUME_S_SUBOPTIMAL;
else if (na == vol->v_disks_count - 1 ||
na + ns + nu == vol->v_disks_count)
s = G_RAID_VOLUME_S_DEGRADED;
else
s = G_RAID_VOLUME_S_BROKEN;
}
if (s != vol->v_state) {
g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
G_RAID_EVENT_VOLUME);
g_raid_change_volume_state(vol, s);
if (!trs->trso_starting && !trs->trso_stopping)
g_raid_write_metadata(sc, vol, NULL, NULL);
}
return (0);
}
static int
g_raid_tr_event_raid5(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd, u_int event)
{
g_raid_tr_update_state_raid5(tr->tro_volume, sd);
return (0);
}
static int
g_raid_tr_start_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_volume *vol;
trs = (struct g_raid_tr_raid5_object *)tr;
trs->trso_starting = 0;
vol = tr->tro_volume;
vol->v_read_only = 1;
g_raid_tr_update_state_raid5(vol, NULL);
return (0);
}
static int
g_raid_tr_stop_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
struct g_raid_volume *vol;
trs = (struct g_raid_tr_raid5_object *)tr;
vol = tr->tro_volume;
trs->trso_starting = 0;
trs->trso_stopping = 1;
g_raid_tr_update_state_raid5(vol, NULL);
return (0);
}
static void
g_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
char *addr;
off_t offset, start, length, nstripe, remain;
int no, pno, ddisks, pdisks, protate, pleft;
u_int strip_size, lvl, qual;
vol = tr->tro_volume;
addr = bp->bio_data;
strip_size = vol->v_strip_size;
lvl = tr->tro_volume->v_raid_level;
qual = tr->tro_volume->v_raid_level_qualifier;
protate = tr->tro_volume->v_rotate_parity;
/* Stripe number. */
nstripe = bp->bio_offset / strip_size;
/* Start position in stripe. */
start = bp->bio_offset % strip_size;
/* Number of data and parity disks. */
if (lvl == G_RAID_VOLUME_RL_RAIDMDF)
pdisks = tr->tro_volume->v_mdf_pdisks;
else if (lvl == G_RAID_VOLUME_RL_RAID5EE ||
lvl == G_RAID_VOLUME_RL_RAID6)
pdisks = 2;
else
pdisks = 1;
ddisks = vol->v_disks_count - pdisks;
/* Parity disk number. */
if (lvl == G_RAID_VOLUME_RL_RAID4) {
if (qual == 0) /* P0 */
pno = 0;
else /* PN */
pno = ddisks;
pleft = -1;
} else {
pno = (nstripe / (ddisks * protate)) % vol->v_disks_count;
pleft = protate - (nstripe / ddisks) % protate;
if (qual >= 2) { /* PN/Left */
pno = ddisks - pno;
if (pno < 0)
pno += vol->v_disks_count;
}
}
/* Data disk number. */
no = nstripe % ddisks;
if (lvl == G_RAID_VOLUME_RL_RAID4) {
if (qual == 0)
no += pdisks;
} else if (qual & 1) { /* Continuation/Symmetric */
no = (pno + pdisks + no) % vol->v_disks_count;
} else if (no >= pno) /* Restart/Asymmetric */
no += pdisks;
else
no += imax(0, pno + pdisks - vol->v_disks_count);
/* Stripe start position in disk. */
offset = (nstripe / ddisks) * strip_size;
/* Length of data to operate. */
remain = bp->bio_length;
bioq_init(&queue);
do {
length = MIN(strip_size - start, remain);
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_offset = offset + start;
cbp->bio_data = addr;
cbp->bio_length = length;
cbp->bio_caller1 = &vol->v_subdisks[no];
bioq_insert_tail(&queue, cbp);
no++;
if (lvl == G_RAID_VOLUME_RL_RAID4) {
no %= vol->v_disks_count;
if (no == pno)
no = (no + pdisks) % vol->v_disks_count;
} else if (qual & 1) { /* Continuation/Symmetric */
no %= vol->v_disks_count;
if (no == pno) {
if ((--pleft) <= 0) {
pleft += protate;
if (qual < 2) /* P0/Right */
pno++;
else /* PN/Left */
pno += vol->v_disks_count - 1;
pno %= vol->v_disks_count;
}
no = (pno + pdisks) % vol->v_disks_count;
offset += strip_size;
}
} else { /* Restart/Asymmetric */
if (no == pno)
no += pdisks;
if (no >= vol->v_disks_count) {
no -= vol->v_disks_count;
if ((--pleft) <= 0) {
pleft += protate;
if (qual < 2) /* P0/Right */
pno++;
else /* PN/Left */
pno += vol->v_disks_count - 1;
pno %= vol->v_disks_count;
}
if (no == pno)
no += pdisks;
else
no += imax(0, pno + pdisks - vol->v_disks_count);
offset += strip_size;
}
}
remain -= length;
addr += length;
start = 0;
} while (remain > 0);
while ((cbp = bioq_takefirst(&queue)) != NULL) {
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
while ((cbp = bioq_takefirst(&queue)) != NULL)
g_destroy_bio(cbp);
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
static void
g_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
- struct g_raid_tr_raid5_object *trs;
vol = tr->tro_volume;
- trs = (struct g_raid_tr_raid5_object *)tr;
if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) {
g_raid_iodone(bp, EIO);
return;
}
switch (bp->bio_cmd) {
case BIO_READ:
g_raid_tr_iostart_raid5_read(tr, bp);
break;
case BIO_WRITE:
case BIO_DELETE:
case BIO_FLUSH:
g_raid_iodone(bp, ENODEV);
break;
default:
KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
bp->bio_cmd, vol->v_name));
break;
}
}
static void
g_raid_tr_iodone_raid5(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd, struct bio *bp)
{
struct bio *pbp;
pbp = bp->bio_parent;
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
pbp->bio_inbed++;
g_destroy_bio(bp);
if (pbp->bio_children == pbp->bio_inbed) {
pbp->bio_completed = pbp->bio_length;
g_raid_iodone(pbp, pbp->bio_error);
}
}
static int
g_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
return (ENODEV);
}
static int
g_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp)
{
struct bio *bp;
struct g_raid_subdisk *sd;
bp = (struct bio *)argp;
sd = (struct g_raid_subdisk *)bp->bio_caller1;
g_raid_subdisk_iostart(sd, bp);
return (0);
}
static int
g_raid_tr_free_raid5(struct g_raid_tr_object *tr)
{
struct g_raid_tr_raid5_object *trs;
trs = (struct g_raid_tr_raid5_object *)tr;
if (trs->trso_buffer != NULL) {
free(trs->trso_buffer, M_TR_RAID5);
trs->trso_buffer = NULL;
}
return (0);
}
G_RAID_TR_DECLARE(raid5, "RAID5");
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c (revision 327172)
+++ head/sys/kern/kern_synch.c (revision 327173)
@@ -1,576 +1,572 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <machine/cpu.h>
static void synch_setup(void *dummy);
SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
NULL);
int hogticks;
static uint8_t pause_wchan[MAXCPU];
static struct callout loadav_callout;
struct loadavg averunnable =
{ {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
/*
* Constants for averages over 1, 5, and 15 minutes
* when sampling at 5 second intervals.
*/
static fixpt_t cexp[3] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, "");
static void loadav(void *arg);
SDT_PROVIDER_DECLARE(sched);
SDT_PROBE_DEFINE(sched, , , preempt);
static void
sleepinit(void *unused)
{
hogticks = (hz / 10) * 2; /* Default only. */
init_sleepqueues();
}
/*
* vmem tries to lock the sleepq mutexes when free'ing kva, so make sure
* it is available.
*/
SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0);
/*
* General sleep call. Suspends the current thread until a wakeup is
* performed on the specified identifier. The thread will then be made
* runnable with the specified priority. Sleeps at most sbt units of time
* (0 means no timeout). If pri includes the PCATCH flag, let signals
* interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal becomes pending, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal (return EINTR).
*
* The lock argument is unlocked before the caller is suspended, and
* re-locked before _sleep() returns. If priority includes the PDROP
* flag the lock is not re-locked before returning.
*/
int
_sleep(void *ident, struct lock_object *lock, int priority,
const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
struct thread *td;
- struct proc *p;
struct lock_class *class;
uintptr_t lock_state;
int catch, pri, rval, sleepq_flags;
WITNESS_SAVE_DECL(lock_witness);
td = curthread;
- p = td->td_proc;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0, wmesg);
#endif
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Sleeping on \"%s\"", wmesg);
KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
("sleeping without a lock"));
KASSERT(ident != NULL, ("_sleep: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
if (lock != NULL)
class = LOCK_CLASS(lock);
else
class = NULL;
if (SCHEDULER_STOPPED_TD(td)) {
if (lock != NULL && priority & PDROP)
class->lc_unlock(lock);
return (0);
}
catch = priority & PCATCH;
pri = priority & PRIMASK;
KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep"));
if ((uint8_t *)ident >= &pause_wchan[0] &&
(uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
sleepq_flags = SLEEPQ_PAUSE;
else
sleepq_flags = SLEEPQ_SLEEP;
if (catch)
sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
sleepq_lock(ident);
CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
- td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+ td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
if (lock == &Giant.lock_object)
mtx_assert(&Giant, MA_OWNED);
DROP_GIANT();
if (lock != NULL && lock != &Giant.lock_object &&
!(class->lc_flags & LC_SLEEPABLE)) {
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
} else
/* GCC needs to follow the Yellow Brick Road */
lock_state = -1;
/*
* We put ourselves on the sleep queue and start our timeout
* before calling thread_suspend_check, as we could stop there,
* and a wakeup or a SIGCONT (or both) could occur while we were
* stopped without resuming us. Thus, we must be ready for sleep
* when cursig() is called. If the wakeup happens while we're
* stopped, then td will no longer be on a sleep queue upon
* return from cursig().
*/
sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
if (sbt != 0)
sleepq_set_timeout_sbt(ident, sbt, pr, flags);
if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
sleepq_release(ident);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
sleepq_lock(ident);
}
if (sbt != 0 && catch)
rval = sleepq_timedwait_sig(ident, pri);
else if (sbt != 0)
rval = sleepq_timedwait(ident, pri);
else if (catch)
rval = sleepq_wait_sig(ident, pri);
else {
sleepq_wait(ident, pri);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0, wmesg);
#endif
PICKUP_GIANT();
if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
class->lc_lock(lock, lock_state);
WITNESS_RESTORE(lock, lock_witness);
}
return (rval);
}
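A minimal sketch (not part of this change) of how the sleep primitives documented above are commonly used: msleep() is the mutex-aware wrapper around _sleep(), and wakeup() broadcasts on the same wait channel. All names here (example_mtx, example_ready, example_wait, example_post) are hypothetical.
/*
 * Hypothetical consumer/producer pair: the consumer sleeps on &example_ready
 * while holding example_mtx; msleep() drops the mutex for the duration of
 * the sleep and re-locks it before returning.  PCATCH lets signals interrupt
 * the wait; the hz / 2 timeout makes msleep() return EWOULDBLOCK if nothing
 * arrives in time.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
static struct mtx example_mtx;		/* hypothetical driver lock */
static int example_ready;		/* hypothetical condition */
MTX_SYSINIT(example_mtx, &example_mtx, "example", MTX_DEF);
static int
example_wait(void)
{
	int error;
	mtx_lock(&example_mtx);
	while (!example_ready) {
		error = msleep(&example_ready, &example_mtx, PCATCH,
		    "exwait", hz / 2);
		if (error != 0) {	/* EWOULDBLOCK, EINTR or ERESTART */
			mtx_unlock(&example_mtx);
			return (error);
		}
	}
	example_ready = 0;
	mtx_unlock(&example_mtx);
	return (0);
}
static void
example_post(void)
{
	mtx_lock(&example_mtx);
	example_ready = 1;
	wakeup(&example_ready);		/* wakes every sleeper on the channel */
	mtx_unlock(&example_mtx);
}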
int
msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
sbintime_t sbt, sbintime_t pr, int flags)
{
struct thread *td;
- struct proc *p;
int rval;
WITNESS_SAVE_DECL(mtx);
td = curthread;
- p = td->td_proc;
KASSERT(mtx != NULL, ("sleeping without a mutex"));
KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running"));
if (SCHEDULER_STOPPED_TD(td))
return (0);
sleepq_lock(ident);
CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
- td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+ td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
DROP_GIANT();
mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
WITNESS_SAVE(&mtx->lock_object, mtx);
mtx_unlock_spin(mtx);
/*
* We put ourselves on the sleep queue and start our timeout.
*/
sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
if (sbt != 0)
sleepq_set_timeout_sbt(ident, sbt, pr, flags);
/*
* Can't call ktrace with any spin locks held so it can lock the
* ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
* any spin lock. Thus, we have to drop the sleepq spin lock while
* we handle those requests. This is safe since we have placed our
* thread on the sleep queue already.
*/
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW)) {
sleepq_release(ident);
ktrcsw(1, 0, wmesg);
sleepq_lock(ident);
}
#endif
#ifdef WITNESS
sleepq_release(ident);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
wmesg);
sleepq_lock(ident);
#endif
if (sbt != 0)
rval = sleepq_timedwait(ident, 0);
else {
sleepq_wait(ident, 0);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0, wmesg);
#endif
PICKUP_GIANT();
mtx_lock_spin(mtx);
WITNESS_RESTORE(&mtx->lock_object, mtx);
return (rval);
}
/*
* pause() delays the calling thread by the given number of system ticks.
* During cold bootup, pause() uses the DELAY() function instead of
* the tsleep() function to do the waiting. The "timo" argument must be
* greater than or equal to zero. A "timo" value of zero is equivalent
* to a "timo" value of one.
*/
int
pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
KASSERT(sbt >= 0, ("pause: timeout must be >= 0"));
/* silently convert invalid timeouts */
if (sbt == 0)
sbt = tick_sbt;
if ((cold && curthread == &thread0) || kdb_active ||
SCHEDULER_STOPPED()) {
/*
* We delay one second at a time to avoid overflowing the
* system specific DELAY() function(s):
*/
while (sbt >= SBT_1S) {
DELAY(1000000);
sbt -= SBT_1S;
}
/* Do the delay remainder, if any */
sbt = howmany(sbt, SBT_1US);
if (sbt > 0)
DELAY(sbt);
return (0);
}
return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags));
}
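A short sketch, assuming a hypothetical polling callback, of the usual pause() pattern for waits that have no wait channel; pause() is a thin wrapper around pause_sbt() above, so it also works during early boot by falling back to DELAY().
/*
 * Hypothetical attach-time poll loop: sleep ~100 ms between checks instead
 * of spinning.  example_wait_for_ready() and ready_fn are illustrative only.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
static int
example_wait_for_ready(int (*ready_fn)(void))
{
	int tries;
	for (tries = 0; tries < 50; tries++) {
		if (ready_fn())
			return (0);
		pause("exrdy", hz / 10);	/* ~100 ms per iteration */
	}
	return (1);			/* caller treats nonzero as timeout */
}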
/*
* Make all threads sleeping on the specified identifier runnable.
*/
void
wakeup(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper) {
KASSERT(ident != &proc0,
("wakeup and wakeup_swapper and proc0"));
kick_proc0();
}
}
/*
* Make a thread sleeping on the specified identifier runnable.
* May wake more than one thread if a target thread is currently
* swapped out.
*/
void
wakeup_one(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper)
kick_proc0();
}
static void
kdb_switch(void)
{
thread_unlock(curthread);
kdb_backtrace();
kdb_reenter();
panic("%s: did not reenter debugger", __func__);
}
/*
* The machine independent parts of context switching.
*/
void
mi_switch(int flags, struct thread *newtd)
{
uint64_t runtime, new_switchtime;
struct thread *td;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
mtx_assert(&Giant, MA_NOTOWNED);
#endif
KASSERT(td->td_critnest == 1 || panicstr,
("mi_switch: switch in a critical section"));
KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
("mi_switch: switch must be voluntary or involuntary"));
KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
/*
* Don't perform context switches from the debugger.
*/
if (kdb_active)
kdb_switch();
if (SCHEDULER_STOPPED_TD(td))
return;
if (flags & SW_VOL) {
td->td_ru.ru_nvcsw++;
td->td_swvoltick = ticks;
} else {
td->td_ru.ru_nivcsw++;
td->td_swinvoltick = ticks;
}
#ifdef SCHED_STATS
SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
/*
* Compute the amount of time during which the current
* thread was running, and add that to its total so far.
*/
new_switchtime = cpu_ticks();
runtime = new_switchtime - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, new_switchtime);
td->td_generation++; /* bump preempt-detect counter */
VM_CNT_INC(v_swtch);
PCPU_SET(switchticks, ticks);
CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
#ifdef KDTRACE_HOOKS
if ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 &&
(flags & SW_TYPE_MASK) == SWT_NEEDRESCHED))
SDT_PROBE0(sched, , , preempt);
#endif
sched_switch(td, newtd, flags);
CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
/*
* If the last thread was exiting, finish cleaning it up.
*/
if ((td = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(td);
}
}
/*
* Change thread state to be runnable, placing it on the run queue if
* it is in memory. If it is swapped out, return true so our caller
* will know to awaken the swapper.
*/
int
setrunnable(struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
switch (td->td_state) {
case TDS_RUNNING:
case TDS_RUNQ:
return (0);
case TDS_INHIBITED:
/*
* If we are only inhibited because we are swapped out
* then arrange to swap in this process. Otherwise just return.
*/
if (td->td_inhibitors != TDI_SWAPPED)
return (0);
/* FALLTHROUGH */
case TDS_CAN_RUN:
break;
default:
printf("state is 0x%x", td->td_state);
panic("setrunnable(2)");
}
if ((td->td_flags & TDF_INMEM) == 0) {
if ((td->td_flags & TDF_SWAPINREQ) == 0) {
td->td_flags |= TDF_SWAPINREQ;
return (1);
}
} else
sched_wakeup(td);
return (0);
}
/*
* Compute a tenex style load average of a quantity on
* 1, 5 and 15 minute intervals.
*/
static void
loadav(void *arg)
{
int i, nrun;
struct loadavg *avg;
nrun = sched_load();
avg = &averunnable;
for (i = 0; i < 3; i++)
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
/*
* Schedule the next update to occur after 5 seconds, but add a
* random variation to avoid synchronisation with processes that
* run at regular intervals.
*/
callout_reset_sbt(&loadav_callout,
SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US,
loadav, NULL, C_DIRECT_EXEC | C_PREL(32));
}
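For illustration only, a small userland program (hypothetical, not kernel code) that reproduces the decay constants and the fixed-point update used by loadav() above: each decay factor is exp(-interval / period) for a 5-second sampling interval and 1/5/15-minute periods, and the update is avg = avg * cexp + nrun * (1 - cexp). FSHIFT and FSCALE are assumed to match their <sys/param.h> values.
/* Compile with: cc loadav_demo.c -lm */
#include <math.h>
#include <stdio.h>
#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)
int
main(void)
{
	const double period[3] = { 60.0, 300.0, 900.0 };
	long cexp[3], ldavg[3] = { 0, 0, 0 };
	int i, step, nrun = 2;		/* pretend 2 runnable threads */
	/* Same constants as the kernel table: exp(-1/12), exp(-1/60), exp(-1/180). */
	for (i = 0; i < 3; i++)
		cexp[i] = (long)(exp(-5.0 / period[i]) * FSCALE);
	/* A few 5-second steps; ldavg converges toward nrun * FSCALE. */
	for (step = 0; step < 12; step++)
		for (i = 0; i < 3; i++)
			ldavg[i] = (cexp[i] * ldavg[i] +
			    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	for (i = 0; i < 3; i++)
		printf("%2.0f min: %.2f\n", period[i] / 60.0,
		    (double)ldavg[i] / FSCALE);
	return (0);
}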
/* ARGSUSED */
static void
synch_setup(void *dummy)
{
callout_init(&loadav_callout, 1);
/* Kick off timeout driven events by calling first time. */
loadav(NULL);
}
int
should_yield(void)
{
return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks);
}
void
maybe_yield(void)
{
if (should_yield())
kern_yield(PRI_USER);
}
void
kern_yield(int prio)
{
struct thread *td;
td = curthread;
DROP_GIANT();
thread_lock(td);
if (prio == PRI_USER)
prio = td->td_user_pri;
if (prio >= 0)
sched_prio(td, prio);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
PICKUP_GIANT();
}
/*
* General purpose yield system call.
*/
int
sys_yield(struct thread *td, struct yield_args *uap)
{
thread_lock(td);
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
td->td_retval[0] = 0;
return (0);
}
Index: head/sys/kern/link_elf.c
===================================================================
--- head/sys/kern/link_elf.c (revision 327172)
+++ head/sys/kern/link_elf.c (revision 327173)
@@ -1,1660 +1,1652 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998-2000 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_gdb.h"
#include <sys/param.h>
#include <sys/systm.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mount.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/linker.h>
#include <machine/elf.h>
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#ifdef SPARSE_MAPPING
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#endif
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/link_elf.h>
#ifdef DDB_CTF
#include <sys/zlib.h>
#endif
#include "linker_if.h"
#define MAXSEGS 4
typedef struct elf_file {
struct linker_file lf; /* Common fields */
int preloaded; /* Was file pre-loaded */
caddr_t address; /* Relocation address */
#ifdef SPARSE_MAPPING
vm_object_t object; /* VM object to hold file pages */
#endif
Elf_Dyn *dynamic; /* Symbol table etc. */
Elf_Hashelt nbuckets; /* DT_HASH info */
Elf_Hashelt nchains;
const Elf_Hashelt *buckets;
const Elf_Hashelt *chains;
caddr_t hash;
caddr_t strtab; /* DT_STRTAB */
int strsz; /* DT_STRSZ */
const Elf_Sym *symtab; /* DT_SYMTAB */
Elf_Addr *got; /* DT_PLTGOT */
const Elf_Rel *pltrel; /* DT_JMPREL */
int pltrelsize; /* DT_PLTRELSZ */
const Elf_Rela *pltrela; /* DT_JMPREL */
int pltrelasize; /* DT_PLTRELSZ */
const Elf_Rel *rel; /* DT_REL */
int relsize; /* DT_RELSZ */
const Elf_Rela *rela; /* DT_RELA */
int relasize; /* DT_RELASZ */
caddr_t modptr;
const Elf_Sym *ddbsymtab; /* The symbol table we are using */
long ddbsymcnt; /* Number of symbols */
caddr_t ddbstrtab; /* String table */
long ddbstrcnt; /* number of bytes in string table */
caddr_t symbase; /* malloc'ed symbol base */
caddr_t strbase; /* malloc'ed string base */
caddr_t ctftab; /* CTF table */
long ctfcnt; /* number of bytes in CTF table */
caddr_t ctfoff; /* CTF offset table */
caddr_t typoff; /* Type offset table */
long typlen; /* Number of type entries. */
Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */
Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */
Elf_Addr pcpu_base; /* Relocated pcpu set address. */
#ifdef VIMAGE
Elf_Addr vnet_start; /* Pre-relocation vnet set start. */
Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */
Elf_Addr vnet_base; /* Relocated vnet set address. */
#endif
#ifdef GDB
struct link_map gdb; /* hooks for gdb */
#endif
} *elf_file_t;
struct elf_set {
Elf_Addr es_start;
Elf_Addr es_stop;
Elf_Addr es_base;
TAILQ_ENTRY(elf_set) es_link;
};
TAILQ_HEAD(elf_set_head, elf_set);
#include <kern/kern_ctf.c>
static int link_elf_link_common_finish(linker_file_t);
static int link_elf_link_preload(linker_class_t cls,
const char *, linker_file_t *);
static int link_elf_link_preload_finish(linker_file_t);
static int link_elf_load_file(linker_class_t, const char *,
linker_file_t *);
static int link_elf_lookup_symbol(linker_file_t, const char *,
c_linker_sym_t *);
static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
linker_symval_t *);
static int link_elf_search_symbol(linker_file_t, caddr_t,
c_linker_sym_t *, long *);
static void link_elf_unload_file(linker_file_t);
static void link_elf_unload_preload(linker_file_t);
static int link_elf_lookup_set(linker_file_t, const char *,
void ***, void ***, int *);
static int link_elf_each_function_name(linker_file_t,
int (*)(const char *, void *), void *);
static int link_elf_each_function_nameval(linker_file_t,
linker_function_nameval_callback_t, void *);
static void link_elf_reloc_local(linker_file_t);
static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
static long link_elf_strtab_get(linker_file_t, caddr_t *);
static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *);
static kobj_method_t link_elf_methods[] = {
KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
KOBJMETHOD(linker_unload, link_elf_unload_file),
KOBJMETHOD(linker_load_file, link_elf_load_file),
KOBJMETHOD(linker_link_preload, link_elf_link_preload),
KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
{ 0, 0 }
};
static struct linker_class link_elf_class = {
#if ELF_TARG_CLASS == ELFCLASS32
"elf32",
#else
"elf64",
#endif
link_elf_methods, sizeof(struct elf_file)
};
static int parse_dynamic(elf_file_t);
static int relocate_file(elf_file_t);
static int link_elf_preload_parse_symbols(elf_file_t);
static struct elf_set_head set_pcpu_list;
#ifdef VIMAGE
static struct elf_set_head set_vnet_list;
#endif
static void
elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base)
{
struct elf_set *set, *iter;
set = malloc(sizeof(*set), M_LINKER, M_WAITOK);
set->es_start = start;
set->es_stop = stop;
set->es_base = base;
TAILQ_FOREACH(iter, list, es_link) {
KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) ||
(set->es_start > iter->es_start && set->es_stop > iter->es_stop),
("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx",
(uintmax_t)set->es_start, (uintmax_t)set->es_stop,
(uintmax_t)iter->es_start, (uintmax_t)iter->es_stop));
if (iter->es_start > set->es_start) {
TAILQ_INSERT_BEFORE(iter, set, es_link);
break;
}
}
if (iter == NULL)
TAILQ_INSERT_TAIL(list, set, es_link);
}
static int
elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base)
{
struct elf_set *set;
TAILQ_FOREACH(set, list, es_link) {
if (addr < set->es_start)
return (0);
if (addr < set->es_stop) {
*start = set->es_start;
*base = set->es_base;
return (1);
}
}
return (0);
}
static void
elf_set_delete(struct elf_set_head *list, Elf_Addr start)
{
struct elf_set *set;
TAILQ_FOREACH(set, list, es_link) {
if (start < set->es_start)
break;
if (start == set->es_start) {
TAILQ_REMOVE(list, set, es_link);
free(set, M_LINKER);
return;
}
}
KASSERT(0, ("deleting unknown linker set (start = 0x%jx)",
(uintmax_t)start));
}
#ifdef GDB
static void r_debug_state(struct r_debug *, struct link_map *);
/*
* A list of loaded modules for GDB to use for loading symbols.
*/
struct r_debug r_debug;
#define GDB_STATE(s) do { \
r_debug.r_state = s; r_debug_state(NULL, NULL); \
} while (0)
/*
* Function for the debugger to set a breakpoint on to gain control.
*/
static void
r_debug_state(struct r_debug *dummy_one __unused,
struct link_map *dummy_two __unused)
{
}
static void
link_elf_add_gdb(struct link_map *l)
{
struct link_map *prev;
l->l_next = NULL;
if (r_debug.r_map == NULL) {
/* Add first. */
l->l_prev = NULL;
r_debug.r_map = l;
} else {
/* Append to list. */
for (prev = r_debug.r_map;
prev->l_next != NULL;
prev = prev->l_next)
;
l->l_prev = prev;
prev->l_next = l;
}
}
static void
link_elf_delete_gdb(struct link_map *l)
{
if (l->l_prev == NULL) {
/* Remove first. */
if ((r_debug.r_map = l->l_next) != NULL)
l->l_next->l_prev = NULL;
} else {
/* Remove any but first. */
if ((l->l_prev->l_next = l->l_next) != NULL)
l->l_next->l_prev = l->l_prev;
}
}
#endif /* GDB */
/*
* The kernel symbol table starts here.
*/
extern struct _dynamic _DYNAMIC;
static void
link_elf_error(const char *filename, const char *s)
{
if (filename == NULL)
printf("kldload: %s\n", s);
else
printf("kldload: %s: %s\n", filename, s);
}
static void
link_elf_invoke_ctors(caddr_t addr, size_t size)
{
void (**ctor)(void);
size_t i, cnt;
if (addr == NULL || size == 0)
return;
cnt = size / sizeof(*ctor);
ctor = (void *)addr;
for (i = 0; i < cnt; i++) {
if (ctor[i] != NULL)
(*ctor[i])();
}
}
/*
* Actions performed after linking/loading both the preloaded kernel and any
* modules, whether preloaded or dynamically loaded.
*/
static int
link_elf_link_common_finish(linker_file_t lf)
{
#ifdef GDB
elf_file_t ef = (elf_file_t)lf;
char *newfilename;
#endif
int error;
/* Notify MD code that a module is being loaded. */
error = elf_cpu_load_file(lf);
if (error != 0)
return (error);
#ifdef GDB
GDB_STATE(RT_ADD);
ef->gdb.l_addr = lf->address;
newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
strcpy(newfilename, lf->filename);
ef->gdb.l_name = newfilename;
ef->gdb.l_ld = ef->dynamic;
link_elf_add_gdb(&ef->gdb);
GDB_STATE(RT_CONSISTENT);
#endif
/* Invoke .ctors */
link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size);
return (0);
}
extern vm_offset_t __startkernel;
static void
link_elf_init(void* arg)
{
Elf_Dyn *dp;
Elf_Addr *ctors_addrp;
Elf_Size *ctors_sizep;
caddr_t modptr, baseptr, sizeptr;
elf_file_t ef;
char *modname;
linker_add_class(&link_elf_class);
dp = (Elf_Dyn *)&_DYNAMIC;
modname = NULL;
modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel");
if (modptr == NULL)
modptr = preload_search_by_type("elf kernel");
modname = (char *)preload_search_info(modptr, MODINFO_NAME);
if (modname == NULL)
modname = "kernel";
linker_kernel_file = linker_make_file(modname, &link_elf_class);
if (linker_kernel_file == NULL)
panic("%s: Can't create linker structures for kernel",
__func__);
ef = (elf_file_t) linker_kernel_file;
ef->preloaded = 1;
#ifdef __powerpc__
ef->address = (caddr_t) (__startkernel - KERNBASE);
#else
ef->address = 0;
#endif
#ifdef SPARSE_MAPPING
ef->object = 0;
#endif
ef->dynamic = dp;
if (dp != NULL)
parse_dynamic(ef);
linker_kernel_file->address += KERNBASE;
linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
if (modptr != NULL) {
ef->modptr = modptr;
baseptr = preload_search_info(modptr, MODINFO_ADDR);
if (baseptr != NULL)
linker_kernel_file->address = *(caddr_t *)baseptr;
sizeptr = preload_search_info(modptr, MODINFO_SIZE);
if (sizeptr != NULL)
linker_kernel_file->size = *(size_t *)sizeptr;
ctors_addrp = (Elf_Addr *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_ADDR);
ctors_sizep = (Elf_Size *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_SIZE);
if (ctors_addrp != NULL && ctors_sizep != NULL) {
linker_kernel_file->ctors_addr = ef->address +
*ctors_addrp;
linker_kernel_file->ctors_size = *ctors_sizep;
}
}
(void)link_elf_preload_parse_symbols(ef);
#ifdef GDB
r_debug.r_map = NULL;
r_debug.r_brk = r_debug_state;
r_debug.r_state = RT_CONSISTENT;
#endif
(void)link_elf_link_common_finish(linker_kernel_file);
linker_kernel_file->flags |= LINKER_FILE_LINKED;
TAILQ_INIT(&set_pcpu_list);
#ifdef VIMAGE
TAILQ_INIT(&set_vnet_list);
#endif
}
SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
static int
link_elf_preload_parse_symbols(elf_file_t ef)
{
caddr_t pointer;
caddr_t ssym, esym, base;
caddr_t strtab;
int strcnt;
Elf_Sym *symtab;
int symcnt;
if (ef->modptr == NULL)
return (0);
pointer = preload_search_info(ef->modptr,
MODINFO_METADATA | MODINFOMD_SSYM);
if (pointer == NULL)
return (0);
ssym = *(caddr_t *)pointer;
pointer = preload_search_info(ef->modptr,
MODINFO_METADATA | MODINFOMD_ESYM);
if (pointer == NULL)
return (0);
esym = *(caddr_t *)pointer;
base = ssym;
symcnt = *(long *)base;
base += sizeof(long);
symtab = (Elf_Sym *)base;
base += roundup(symcnt, sizeof(long));
if (base > esym || base < ssym) {
printf("Symbols are corrupt!\n");
return (EINVAL);
}
strcnt = *(long *)base;
base += sizeof(long);
strtab = base;
base += roundup(strcnt, sizeof(long));
if (base > esym || base < ssym) {
printf("Symbols are corrupt!\n");
return (EINVAL);
}
ef->ddbsymtab = symtab;
ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
ef->ddbstrtab = strtab;
ef->ddbstrcnt = strcnt;
return (0);
}
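/*
 * Editor's note (illustrative, not part of the driver code): the MODINFOMD_SSYM
 * blob parsed above is laid out as two length-prefixed regions, each length
 * stored in bytes and rounded up to a long boundary:
 *
 *   [ long symsize ][ Elf_Sym symtab[] ... pad ][ long strsize ][ strtab ... ]
 *
 * The following standalone userland sketch (all names hypothetical) builds a
 * tiny blob with that shape and walks it with the same arithmetic, purely to
 * make the layout concrete.
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>
#include <string.h>

#define RNDUP(x, y)	(((x) + (y) - 1) / (y) * (y))	/* like roundup() */

struct fake_sym { unsigned long st_name, st_value; };	/* stand-in for Elf_Sym */

int
main(void)
{
	long blob_store[32];				/* long-aligned backing store */
	char *blob = (char *)blob_store, *base = blob, *end;
	struct fake_sym syms[2] = { { 1, 0x100 }, { 5, 0x200 } };
	const char strings[] = "\0one\0two";
	long symsize = sizeof(syms), strsize = sizeof(strings);

	/* Build: [symsize][syms...][strsize][strings...] */
	memcpy(base, &symsize, sizeof(long));	base += sizeof(long);
	memcpy(base, syms, symsize);		base += RNDUP(symsize, sizeof(long));
	memcpy(base, &strsize, sizeof(long));	base += sizeof(long);
	memcpy(base, strings, strsize);		base += RNDUP(strsize, sizeof(long));
	end = base;

	/* Walk it the same way link_elf_preload_parse_symbols() does. */
	base = blob;
	symsize = *(long *)base;		base += sizeof(long);
	struct fake_sym *symtab = (struct fake_sym *)base;
	base += RNDUP(symsize, sizeof(long));
	strsize = *(long *)base;		base += sizeof(long);
	char *strtab = base;			base += RNDUP(strsize, sizeof(long));

	printf("%ld symbols, first name \"%s\", consumed %td of %td bytes\n",
	    symsize / (long)sizeof(struct fake_sym),
	    strtab + symtab[0].st_name, base - blob, end - blob);
	return (0);
}
#endif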
static int
parse_dynamic(elf_file_t ef)
{
Elf_Dyn *dp;
int plttype = DT_REL;
for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
switch (dp->d_tag) {
case DT_HASH:
{
/* From src/libexec/rtld-elf/rtld.c */
const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
(ef->address + dp->d_un.d_ptr);
ef->nbuckets = hashtab[0];
ef->nchains = hashtab[1];
ef->buckets = hashtab + 2;
ef->chains = ef->buckets + ef->nbuckets;
break;
}
case DT_STRTAB:
ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
break;
case DT_STRSZ:
ef->strsz = dp->d_un.d_val;
break;
case DT_SYMTAB:
ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
break;
case DT_SYMENT:
if (dp->d_un.d_val != sizeof(Elf_Sym))
return (ENOEXEC);
break;
case DT_PLTGOT:
ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
break;
case DT_REL:
ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
break;
case DT_RELSZ:
ef->relsize = dp->d_un.d_val;
break;
case DT_RELENT:
if (dp->d_un.d_val != sizeof(Elf_Rel))
return (ENOEXEC);
break;
case DT_JMPREL:
ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
break;
case DT_PLTRELSZ:
ef->pltrelsize = dp->d_un.d_val;
break;
case DT_RELA:
ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
break;
case DT_RELASZ:
ef->relasize = dp->d_un.d_val;
break;
case DT_RELAENT:
if (dp->d_un.d_val != sizeof(Elf_Rela))
return (ENOEXEC);
break;
case DT_PLTREL:
plttype = dp->d_un.d_val;
if (plttype != DT_REL && plttype != DT_RELA)
return (ENOEXEC);
break;
#ifdef GDB
case DT_DEBUG:
dp->d_un.d_ptr = (Elf_Addr)&r_debug;
break;
#endif
}
}
if (plttype == DT_RELA) {
ef->pltrela = (const Elf_Rela *)ef->pltrel;
ef->pltrel = NULL;
ef->pltrelasize = ef->pltrelsize;
ef->pltrelsize = 0;
}
ef->ddbsymtab = ef->symtab;
ef->ddbsymcnt = ef->nchains;
ef->ddbstrtab = ef->strtab;
ef->ddbstrcnt = ef->strsz;
return (0);
}
static int
parse_dpcpu(elf_file_t ef)
{
int count;
int error;
ef->pcpu_start = 0;
ef->pcpu_stop = 0;
error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start,
(void ***)&ef->pcpu_stop, &count);
/* Error just means there is no pcpu set to relocate. */
if (error != 0)
return (0);
count *= sizeof(void *);
/*
* Allocate space in the primary pcpu area. Copy in our
* initialization from the data section and then initialize
* all per-cpu storage from that.
*/
ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count);
if (ef->pcpu_base == 0)
return (ENOSPC);
memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count);
dpcpu_copy((void *)ef->pcpu_base, count);
elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop,
ef->pcpu_base);
return (0);
}
#ifdef VIMAGE
static int
parse_vnet(elf_file_t ef)
{
int count;
int error;
ef->vnet_start = 0;
ef->vnet_stop = 0;
error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start,
(void ***)&ef->vnet_stop, &count);
/* Error just means there is no vnet data set to relocate. */
if (error != 0)
return (0);
count *= sizeof(void *);
/*
* Allocate space in the primary vnet area. Copy in our
* initialization from the data section and then initialize
* all per-vnet storage from that.
*/
ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count);
if (ef->vnet_base == 0)
return (ENOSPC);
memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count);
vnet_data_copy((void *)ef->vnet_base, count);
elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop,
ef->vnet_base);
return (0);
}
#endif
static int
link_elf_link_preload(linker_class_t cls,
const char* filename, linker_file_t *result)
{
Elf_Addr *ctors_addrp;
Elf_Size *ctors_sizep;
caddr_t modptr, baseptr, sizeptr, dynptr;
char *type;
elf_file_t ef;
linker_file_t lf;
int error;
vm_offset_t dp;
/* Look to see if we have the file preloaded */
modptr = preload_search_by_name(filename);
if (modptr == NULL)
return (ENOENT);
type = (char *)preload_search_info(modptr, MODINFO_TYPE);
baseptr = preload_search_info(modptr, MODINFO_ADDR);
sizeptr = preload_search_info(modptr, MODINFO_SIZE);
dynptr = preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_DYNAMIC);
if (type == NULL ||
(strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 &&
strcmp(type, "elf module") != 0))
return (EFTYPE);
if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
return (EINVAL);
lf = linker_make_file(filename, &link_elf_class);
if (lf == NULL)
return (ENOMEM);
ef = (elf_file_t) lf;
ef->preloaded = 1;
ef->modptr = modptr;
ef->address = *(caddr_t *)baseptr;
#ifdef SPARSE_MAPPING
ef->object = 0;
#endif
dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
ef->dynamic = (Elf_Dyn *)dp;
lf->address = ef->address;
lf->size = *(size_t *)sizeptr;
ctors_addrp = (Elf_Addr *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_ADDR);
ctors_sizep = (Elf_Size *)preload_search_info(modptr,
MODINFO_METADATA | MODINFOMD_CTORS_SIZE);
if (ctors_addrp != NULL && ctors_sizep != NULL) {
lf->ctors_addr = ef->address + *ctors_addrp;
lf->ctors_size = *ctors_sizep;
}
error = parse_dynamic(ef);
if (error == 0)
error = parse_dpcpu(ef);
#ifdef VIMAGE
if (error == 0)
error = parse_vnet(ef);
#endif
if (error != 0) {
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
return (error);
}
link_elf_reloc_local(lf);
*result = lf;
return (0);
}
static int
link_elf_link_preload_finish(linker_file_t lf)
{
elf_file_t ef;
int error;
ef = (elf_file_t) lf;
error = relocate_file(ef);
if (error != 0)
return (error);
(void)link_elf_preload_parse_symbols(ef);
return (link_elf_link_common_finish(lf));
}
static int
link_elf_load_file(linker_class_t cls, const char* filename,
linker_file_t* result)
{
struct nameidata nd;
struct thread* td = curthread; /* XXX */
Elf_Ehdr *hdr;
caddr_t firstpage;
int nbytes, i;
Elf_Phdr *phdr;
Elf_Phdr *phlimit;
Elf_Phdr *segs[MAXSEGS];
int nsegs;
Elf_Phdr *phdyn;
- Elf_Phdr *phphdr;
caddr_t mapbase;
size_t mapsize;
- Elf_Off base_offset;
Elf_Addr base_vaddr;
Elf_Addr base_vlimit;
int error = 0;
ssize_t resid;
int flags;
elf_file_t ef;
linker_file_t lf;
Elf_Shdr *shdr;
int symtabindex;
int symstrindex;
int shstrindex;
int symcnt;
int strcnt;
char *shstrs;
shdr = NULL;
lf = NULL;
shstrs = NULL;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG) {
error = ENOEXEC;
firstpage = NULL;
goto out;
}
#ifdef MAC
error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp);
if (error != 0) {
firstpage = NULL;
goto out;
}
#endif
/*
* Read the elf header from the file.
*/
firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
hdr = (Elf_Ehdr *)firstpage;
error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
nbytes = PAGE_SIZE - resid;
if (error != 0)
goto out;
if (!IS_ELF(*hdr)) {
error = ENOEXEC;
goto out;
}
if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
link_elf_error(filename, "Unsupported file layout");
error = ENOEXEC;
goto out;
}
if (hdr->e_ident[EI_VERSION] != EV_CURRENT ||
hdr->e_version != EV_CURRENT) {
link_elf_error(filename, "Unsupported file version");
error = ENOEXEC;
goto out;
}
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
error = ENOSYS;
goto out;
}
if (hdr->e_machine != ELF_TARG_MACH) {
link_elf_error(filename, "Unsupported machine");
error = ENOEXEC;
goto out;
}
/*
* We rely on the program header being in the first page.
* This is not strictly required by the ABI specification, but
* it seems to always be true in practice. And it simplifies
* things considerably.
*/
if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
(hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
(hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
link_elf_error(filename, "Unreadable program headers");
/*
* Scan the program header entries, and save key information.
*
* We rely on there being exactly two load segments, text and data,
* in that order.
*/
phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
phlimit = phdr + hdr->e_phnum;
nsegs = 0;
phdyn = NULL;
- phphdr = NULL;
while (phdr < phlimit) {
switch (phdr->p_type) {
case PT_LOAD:
if (nsegs == MAXSEGS) {
link_elf_error(filename, "Too many sections");
error = ENOEXEC;
goto out;
}
/*
* XXX: We just trust they come in right order ??
*/
segs[nsegs] = phdr;
++nsegs;
break;
- case PT_PHDR:
- phphdr = phdr;
- break;
-
case PT_DYNAMIC:
phdyn = phdr;
break;
case PT_INTERP:
error = ENOSYS;
goto out;
}
++phdr;
}
if (phdyn == NULL) {
link_elf_error(filename, "Object is not dynamically-linked");
error = ENOEXEC;
goto out;
}
if (nsegs == 0) {
link_elf_error(filename, "No sections");
error = ENOEXEC;
goto out;
}
/*
* Allocate the entire address space of the object, to stake
* out our contiguous region, and to establish the base
* address for relocation.
*/
- base_offset = trunc_page(segs[0]->p_offset);
base_vaddr = trunc_page(segs[0]->p_vaddr);
base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
segs[nsegs - 1]->p_memsz);
mapsize = base_vlimit - base_vaddr;
lf = linker_make_file(filename, &link_elf_class);
if (lf == NULL) {
error = ENOMEM;
goto out;
}
ef = (elf_file_t) lf;
#ifdef SPARSE_MAPPING
ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
}
ef->address = (caddr_t) vm_map_min(kernel_map);
error = vm_map_find(kernel_map, ef->object, 0,
(vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != 0) {
vm_object_deallocate(ef->object);
ef->object = 0;
goto out;
}
#else
ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
#endif
mapbase = ef->address;
/*
* Read the text and data sections and zero the bss.
*/
for (i = 0; i < nsegs; i++) {
caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
error = vn_rdwr(UIO_READ, nd.ni_vp,
segbase, segs[i]->p_filesz, segs[i]->p_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
bzero(segbase + segs[i]->p_filesz,
segs[i]->p_memsz - segs[i]->p_filesz);
#ifdef SPARSE_MAPPING
/*
* Wire down the pages
*/
error = vm_map_wire(kernel_map,
(vm_offset_t) segbase,
(vm_offset_t) segbase + segs[i]->p_memsz,
VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
if (error != KERN_SUCCESS) {
error = ENOMEM;
goto out;
}
#endif
}
#ifdef GPROF
/* Update profiling information with the new text segment. */
mtx_lock(&Giant);
kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
segs[0]->p_memsz));
mtx_unlock(&Giant);
#endif
ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
lf->address = ef->address;
lf->size = mapsize;
error = parse_dynamic(ef);
if (error != 0)
goto out;
error = parse_dpcpu(ef);
if (error != 0)
goto out;
#ifdef VIMAGE
error = parse_vnet(ef);
if (error != 0)
goto out;
#endif
link_elf_reloc_local(lf);
VOP_UNLOCK(nd.ni_vp, 0);
error = linker_load_dependencies(lf);
vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
goto out;
error = relocate_file(ef);
if (error != 0)
goto out;
/*
* Try to load the symbol table if it's present. (You can
* strip it!)
*/
nbytes = hdr->e_shnum * hdr->e_shentsize;
if (nbytes == 0 || hdr->e_shoff == 0)
goto nosyms;
shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
error = vn_rdwr(UIO_READ, nd.ni_vp,
(caddr_t)shdr, nbytes, hdr->e_shoff,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
/* Read section string table */
shstrindex = hdr->e_shstrndx;
if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB &&
shdr[shstrindex].sh_size != 0) {
nbytes = shdr[shstrindex].sh_size;
shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes,
shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED,
td->td_ucred, NOCRED, &resid, td);
if (error)
goto out;
}
symtabindex = -1;
symstrindex = -1;
for (i = 0; i < hdr->e_shnum; i++) {
if (shdr[i].sh_type == SHT_SYMTAB) {
symtabindex = i;
symstrindex = shdr[i].sh_link;
} else if (shstrs != NULL && shdr[i].sh_name != 0 &&
strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) {
/* Record relocated address and size of .ctors. */
lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr;
lf->ctors_size = shdr[i].sh_size;
}
}
if (symtabindex < 0 || symstrindex < 0)
goto nosyms;
symcnt = shdr[symtabindex].sh_size;
ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
strcnt = shdr[symstrindex].sh_size;
ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
error = vn_rdwr(UIO_READ, nd.ni_vp,
ef->symbase, symcnt, shdr[symtabindex].sh_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
error = vn_rdwr(UIO_READ, nd.ni_vp,
ef->strbase, strcnt, shdr[symstrindex].sh_offset,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error != 0)
goto out;
ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
ef->ddbstrcnt = strcnt;
ef->ddbstrtab = ef->strbase;
nosyms:
error = link_elf_link_common_finish(lf);
if (error != 0)
goto out;
*result = lf;
out:
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
if (error != 0 && lf != NULL)
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
free(shdr, M_LINKER);
free(firstpage, M_LINKER);
free(shstrs, M_LINKER);
return (error);
}
Elf_Addr
elf_relocaddr(linker_file_t lf, Elf_Addr x)
{
elf_file_t ef;
ef = (elf_file_t)lf;
if (x >= ef->pcpu_start && x < ef->pcpu_stop)
return ((x - ef->pcpu_start) + ef->pcpu_base);
#ifdef VIMAGE
if (x >= ef->vnet_start && x < ef->vnet_stop)
return ((x - ef->vnet_start) + ef->vnet_base);
#endif
return (x);
}
static void
link_elf_unload_file(linker_file_t file)
{
elf_file_t ef = (elf_file_t) file;
if (ef->pcpu_base != 0) {
dpcpu_free((void *)ef->pcpu_base,
ef->pcpu_stop - ef->pcpu_start);
elf_set_delete(&set_pcpu_list, ef->pcpu_start);
}
#ifdef VIMAGE
if (ef->vnet_base != 0) {
vnet_data_free((void *)ef->vnet_base,
ef->vnet_stop - ef->vnet_start);
elf_set_delete(&set_vnet_list, ef->vnet_start);
}
#endif
#ifdef GDB
if (ef->gdb.l_ld != NULL) {
GDB_STATE(RT_DELETE);
free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
link_elf_delete_gdb(&ef->gdb);
GDB_STATE(RT_CONSISTENT);
}
#endif
/* Notify MD code that a module is being unloaded. */
elf_cpu_unload_file(file);
if (ef->preloaded) {
link_elf_unload_preload(file);
return;
}
#ifdef SPARSE_MAPPING
if (ef->object != NULL) {
vm_map_remove(kernel_map, (vm_offset_t) ef->address,
(vm_offset_t) ef->address
+ (ef->object->size << PAGE_SHIFT));
}
#else
free(ef->address, M_LINKER);
#endif
free(ef->symbase, M_LINKER);
free(ef->strbase, M_LINKER);
free(ef->ctftab, M_LINKER);
free(ef->ctfoff, M_LINKER);
free(ef->typoff, M_LINKER);
}
static void
link_elf_unload_preload(linker_file_t file)
{
if (file->filename != NULL)
preload_delete_name(file->filename);
}
static const char *
symbol_name(elf_file_t ef, Elf_Size r_info)
{
const Elf_Sym *ref;
if (ELF_R_SYM(r_info)) {
ref = ef->symtab + ELF_R_SYM(r_info);
return (ef->strtab + ref->st_name);
}
return (NULL);
}
static int
relocate_file(elf_file_t ef)
{
const Elf_Rel *rellim;
const Elf_Rel *rel;
const Elf_Rela *relalim;
const Elf_Rela *rela;
const char *symname;
/* Perform relocations without addend if there are any: */
rel = ef->rel;
if (rel != NULL) {
rellim = (const Elf_Rel *)
((const char *)ef->rel + ef->relsize);
while (rel < rellim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup)) {
symname = symbol_name(ef, rel->r_info);
printf("link_elf: symbol %s undefined\n", symname);
return (ENOENT);
}
rel++;
}
}
/* Perform relocations with addend if there are any: */
rela = ef->rela;
if (rela != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->rela + ef->relasize);
while (rela < relalim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup)) {
symname = symbol_name(ef, rela->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rela++;
}
}
/* Perform PLT relocations without addend if there are any: */
rel = ef->pltrel;
if (rel != NULL) {
rellim = (const Elf_Rel *)
((const char *)ef->pltrel + ef->pltrelsize);
while (rel < rellim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup)) {
symname = symbol_name(ef, rel->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rel++;
}
}
/* Perform PLT relocations with addend if there are any: */
rela = ef->pltrela;
if (rela != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->pltrela + ef->pltrelasize);
while (rela < relalim) {
if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup)) {
symname = symbol_name(ef, rela->r_info);
printf("link_elf: symbol %s undefined\n",
symname);
return (ENOENT);
}
rela++;
}
}
return (0);
}
/*
* Hash function for symbol table lookup. Don't even think about changing
* this. It is specified by the System V ABI.
*/
static unsigned long
elf_hash(const char *name)
{
const unsigned char *p = (const unsigned char *) name;
unsigned long h = 0;
unsigned long g;
while (*p != '\0') {
h = (h << 4) + *p++;
if ((g = h & 0xf0000000) != 0)
h ^= g >> 24;
h &= ~g;
}
return (h);
}
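/*
 * Editor's note (illustrative): a minimal userland sketch of how the hash
 * above drives bucket selection in link_elf_lookup_symbol().  The hash body
 * mirrors elf_hash(); the bucket count below is made up.  The well-known
 * value for "main" under this algorithm is 0x737fe.
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdio.h>

static unsigned long
sysv_hash(const char *name)	/* same algorithm as elf_hash() above */
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long h = 0, g;

	while (*p != '\0') {
		h = (h << 4) + *p++;
		if ((g = h & 0xf0000000) != 0)
			h ^= g >> 24;
		h &= ~g;
	}
	return (h);
}

int
main(void)
{
	unsigned long nbuckets = 31;	/* hypothetical DT_HASH bucket count */
	unsigned long h = sysv_hash("main");

	assert(h == 0x737fe);
	/* The chain walk starts at buckets[h % nbuckets]. */
	printf("hash 0x%lx -> bucket %lu of %lu\n", h, h % nbuckets, nbuckets);
	return (0);
}
#endif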
static int
link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
{
elf_file_t ef = (elf_file_t) lf;
unsigned long symnum;
const Elf_Sym* symp;
const char *strp;
unsigned long hash;
int i;
/* If we don't have a hash, bail. */
if (ef->buckets == NULL || ef->nbuckets == 0) {
printf("link_elf_lookup_symbol: missing symbol hash table\n");
return (ENOENT);
}
/* First, search hashed global symbols */
hash = elf_hash(name);
symnum = ef->buckets[hash % ef->nbuckets];
while (symnum != STN_UNDEF) {
if (symnum >= ef->nchains) {
printf("%s: corrupt symbol table\n", __func__);
return (ENOENT);
}
symp = ef->symtab + symnum;
if (symp->st_name == 0) {
printf("%s: corrupt symbol table\n", __func__);
return (ENOENT);
}
strp = ef->strtab + symp->st_name;
if (strcmp(name, strp) == 0) {
if (symp->st_shndx != SHN_UNDEF ||
(symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
*sym = (c_linker_sym_t) symp;
return (0);
}
return (ENOENT);
}
symnum = ef->chains[symnum];
}
/* If we have not found it, look at the full table (if loaded) */
if (ef->symtab == ef->ddbsymtab)
return (ENOENT);
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
strp = ef->ddbstrtab + symp->st_name;
if (strcmp(name, strp) == 0) {
if (symp->st_shndx != SHN_UNDEF ||
(symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
*sym = (c_linker_sym_t) symp;
return (0);
}
return (ENOENT);
}
}
return (ENOENT);
}
static int
link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
linker_symval_t *symval)
{
elf_file_t ef = (elf_file_t) lf;
const Elf_Sym* es = (const Elf_Sym*) sym;
if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) {
symval->name = ef->strtab + es->st_name;
symval->value = (caddr_t) ef->address + es->st_value;
symval->size = es->st_size;
return (0);
}
if (ef->symtab == ef->ddbsymtab)
return (ENOENT);
if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
symval->name = ef->ddbstrtab + es->st_name;
symval->value = (caddr_t) ef->address + es->st_value;
symval->size = es->st_size;
return (0);
}
return (ENOENT);
}
static int
link_elf_search_symbol(linker_file_t lf, caddr_t value,
c_linker_sym_t *sym, long *diffp)
{
elf_file_t ef = (elf_file_t) lf;
u_long off = (uintptr_t) (void *) value;
u_long diff = off;
u_long st_value;
const Elf_Sym* es;
const Elf_Sym* best = NULL;
int i;
for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
if (es->st_name == 0)
continue;
st_value = es->st_value + (uintptr_t) (void *) ef->address;
if (off >= st_value) {
if (off - st_value < diff) {
diff = off - st_value;
best = es;
if (diff == 0)
break;
} else if (off - st_value == diff) {
best = es;
}
}
}
if (best == NULL)
*diffp = off;
else
*diffp = diff;
*sym = (c_linker_sym_t) best;
return (0);
}
/*
* Look up a linker set on an ELF system.
*/
static int
link_elf_lookup_set(linker_file_t lf, const char *name,
void ***startp, void ***stopp, int *countp)
{
c_linker_sym_t sym;
linker_symval_t symval;
char *setsym;
void **start, **stop;
int len, error = 0, count;
len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
setsym = malloc(len, M_LINKER, M_WAITOK);
/* get address of first entry */
snprintf(setsym, len, "%s%s", "__start_set_", name);
error = link_elf_lookup_symbol(lf, setsym, &sym);
if (error != 0)
goto out;
link_elf_symbol_values(lf, sym, &symval);
if (symval.value == 0) {
error = ESRCH;
goto out;
}
start = (void **)symval.value;
/* get address of last entry */
snprintf(setsym, len, "%s%s", "__stop_set_", name);
error = link_elf_lookup_symbol(lf, setsym, &sym);
if (error != 0)
goto out;
link_elf_symbol_values(lf, sym, &symval);
if (symval.value == 0) {
error = ESRCH;
goto out;
}
stop = (void **)symval.value;
/* and the number of entries */
count = stop - start;
/* and copy out */
if (startp != NULL)
*startp = start;
if (stopp != NULL)
*stopp = stop;
if (countp != NULL)
*countp = count;
out:
free(setsym, M_LINKER);
return (error);
}
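/*
 * Editor's note (illustrative): link_elf_lookup_set() relies on the linker
 * providing __start_set_<name>/__stop_set_<name> symbols that bracket the
 * array of pointers emitted into the "set_<name>" section.  A caller
 * typically iterates the returned range; the set name "frobnicator" and
 * the walker function below are hypothetical, not part of this file.
 */
#if 0	/* illustrative sketch only */
static void
walk_frobnicator_set(linker_file_t lf)
{
	void **start, **stop, **item;
	int count;

	if (link_elf_lookup_set(lf, "frobnicator", &start, &stop, &count) != 0)
		return;		/* file defines no such set */
	for (item = start; item < stop; item++)
		printf("set entry %p\n", *item);
	/* count == stop - start, i.e. the number of pointer-sized entries. */
}
#endif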
static int
link_elf_each_function_name(linker_file_t file,
int (*callback)(const char *, void *), void *opaque)
{
elf_file_t ef = (elf_file_t)file;
const Elf_Sym *symp;
int i, error;
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
if (symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
error = callback(ef->ddbstrtab + symp->st_name, opaque);
if (error != 0)
return (error);
}
}
return (0);
}
static int
link_elf_each_function_nameval(linker_file_t file,
linker_function_nameval_callback_t callback, void *opaque)
{
linker_symval_t symval;
elf_file_t ef = (elf_file_t)file;
const Elf_Sym* symp;
int i, error;
/* Exhaustive search */
for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
if (symp->st_value != 0 &&
ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
error = link_elf_symbol_values(file,
(c_linker_sym_t) symp, &symval);
if (error != 0)
return (error);
error = callback(file, i, &symval, opaque);
if (error != 0)
return (error);
}
}
return (0);
}
const Elf_Sym *
elf_get_sym(linker_file_t lf, Elf_Size symidx)
{
elf_file_t ef = (elf_file_t)lf;
if (symidx >= ef->nchains)
return (NULL);
return (ef->symtab + symidx);
}
const char *
elf_get_symname(linker_file_t lf, Elf_Size symidx)
{
elf_file_t ef = (elf_file_t)lf;
const Elf_Sym *sym;
if (symidx >= ef->nchains)
return (NULL);
sym = ef->symtab + symidx;
return (ef->strtab + sym->st_name);
}
/*
* Symbol lookup function that can be used when the symbol index is known (i.e.,
* in relocations). It uses the symbol index instead of doing a fully fledged
* hash-table-based lookup when that is valid, for example for local symbols.
* This is not only more efficient, it's also more correct. It's not always
* the case that the symbol can be found through the hash table.
*/
static int
elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res)
{
elf_file_t ef = (elf_file_t)lf;
const Elf_Sym *sym;
const char *symbol;
Elf_Addr addr, start, base;
/* Don't even try to lookup the symbol if the index is bogus. */
if (symidx >= ef->nchains) {
*res = 0;
return (EINVAL);
}
sym = ef->symtab + symidx;
/*
* Don't do a full lookup when the symbol is local. It may even
* fail because it may not be found through the hash table.
*/
if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
/* Force lookup failure when we have an insanity. */
if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) {
*res = 0;
return (EINVAL);
}
*res = ((Elf_Addr)ef->address + sym->st_value);
return (0);
}
/*
* XXX we can avoid doing a hash table based lookup for global
* symbols as well. This however is not always valid, so we'll
* just do it the hard way for now. Performance tweaks can
* always be added.
*/
symbol = ef->strtab + sym->st_name;
/* Force a lookup failure if the symbol name is bogus. */
if (*symbol == 0) {
*res = 0;
return (EINVAL);
}
addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) {
*res = 0;
return (EINVAL);
}
if (elf_set_find(&set_pcpu_list, addr, &start, &base))
addr = addr - start + base;
#ifdef VIMAGE
else if (elf_set_find(&set_vnet_list, addr, &start, &base))
addr = addr - start + base;
#endif
*res = addr;
return (0);
}
static void
link_elf_reloc_local(linker_file_t lf)
{
const Elf_Rel *rellim;
const Elf_Rel *rel;
const Elf_Rela *relalim;
const Elf_Rela *rela;
elf_file_t ef = (elf_file_t)lf;
/* Perform relocations without addend if there are any: */
if ((rel = ef->rel) != NULL) {
rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
while (rel < rellim) {
elf_reloc_local(lf, (Elf_Addr)ef->address, rel,
ELF_RELOC_REL, elf_lookup);
rel++;
}
}
/* Perform relocations with addend if there are any: */
if ((rela = ef->rela) != NULL) {
relalim = (const Elf_Rela *)
((const char *)ef->rela + ef->relasize);
while (rela < relalim) {
elf_reloc_local(lf, (Elf_Addr)ef->address, rela,
ELF_RELOC_RELA, elf_lookup);
rela++;
}
}
}
static long
link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
{
elf_file_t ef = (elf_file_t)lf;
*symtab = ef->ddbsymtab;
if (*symtab == NULL)
return (0);
return (ef->ddbsymcnt);
}
static long
link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
{
elf_file_t ef = (elf_file_t)lf;
*strtab = ef->ddbstrtab;
if (*strtab == NULL)
return (0);
return (ef->ddbstrcnt);
}
Index: head/sys/kern/subr_msgbuf.c
===================================================================
--- head/sys/kern/subr_msgbuf.c (revision 327172)
+++ head/sys/kern/subr_msgbuf.c (revision 327173)
@@ -1,419 +1,418 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2003 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Generic message buffer support routines.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/msgbuf.h>
#include <sys/sysctl.h>
/*
* Maximum number conversion buffer length: uintmax_t in base 2, plus <>
* around the priority, and a terminating NUL.
*/
#define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3)
/* Read/write sequence numbers are modulo a multiple of the buffer size. */
#define SEQMOD(size) ((size) * 16)
static u_int msgbuf_cksum(struct msgbuf *mbp);
/*
* Timestamps in msgbuf are useful when trying to diagnose when core dumps
* or other actions occurred.
*/
static int msgbuf_show_timestamp = 0;
SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RWTUN,
&msgbuf_show_timestamp, 0, "Show timestamp in msgbuf");
/*
* Initialize a message buffer of the specified size at the specified
* location. This also zeros the buffer area.
*/
void
msgbuf_init(struct msgbuf *mbp, void *ptr, int size)
{
mbp->msg_ptr = ptr;
mbp->msg_size = size;
mbp->msg_seqmod = SEQMOD(size);
msgbuf_clear(mbp);
mbp->msg_magic = MSG_MAGIC;
mbp->msg_lastpri = -1;
mbp->msg_flags = 0;
bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
}
/*
* Reinitialize a message buffer, retaining its previous contents if
* the size and checksum are correct. If the old contents cannot be
* recovered, the message buffer is cleared.
*/
void
msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size)
{
u_int cksum;
if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) {
msgbuf_init(mbp, ptr, size);
return;
}
mbp->msg_seqmod = SEQMOD(size);
mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq);
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq);
mbp->msg_ptr = ptr;
cksum = msgbuf_cksum(mbp);
if (cksum != mbp->msg_cksum) {
if (bootverbose) {
printf("msgbuf cksum mismatch (read %x, calc %x)\n",
mbp->msg_cksum, cksum);
printf("Old msgbuf not recovered\n");
}
msgbuf_clear(mbp);
}
mbp->msg_lastpri = -1;
/* Assume that the old message buffer didn't end in a newline. */
mbp->msg_flags |= MSGBUF_NEEDNL;
bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
}
/*
* Clear the message buffer.
*/
void
msgbuf_clear(struct msgbuf *mbp)
{
bzero(mbp->msg_ptr, mbp->msg_size);
mbp->msg_wseq = 0;
mbp->msg_rseq = 0;
mbp->msg_cksum = 0;
}
/*
* Get a count of the number of unread characters in the message buffer.
*/
int
msgbuf_getcount(struct msgbuf *mbp)
{
u_int len;
len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq);
if (len > mbp->msg_size)
len = mbp->msg_size;
return (len);
}
/*
* Add a character into the message buffer, and update the checksum and
* sequence number.
*
* The caller should hold the message buffer spinlock.
*/
static void
msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c)
{
u_int pos;
/* Make sure we properly wrap the sequence number. */
pos = MSGBUF_SEQ_TO_POS(mbp, *seq);
mbp->msg_cksum += (u_int)(u_char)c -
(u_int)(u_char)mbp->msg_ptr[pos];
mbp->msg_ptr[pos] = c;
*seq = MSGBUF_SEQNORM(mbp, *seq + 1);
}
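/*
 * Editor's note (illustrative): msg_cksum is kept equal to the byte sum of
 * the current buffer contents (compare msgbuf_cksum() below) by adding the
 * new character and subtracting the one it overwrites.  A standalone sketch
 * of that invariant, using a made-up 8-byte ring:
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdio.h>

#define RINGSZ	8

int
main(void)
{
	unsigned char ring[RINGSZ] = { 0 };
	unsigned int cksum = 0, seq, i, full;
	const char *msg = "hello, message buffer";

	for (seq = 0; msg[seq] != '\0'; seq++) {
		unsigned int pos = seq % RINGSZ;	/* like MSGBUF_SEQ_TO_POS() */

		/* Incremental update: add new byte, subtract overwritten byte. */
		cksum += (unsigned char)msg[seq] - ring[pos];
		ring[pos] = msg[seq];

		/* Full recomputation, as msgbuf_cksum() would do. */
		for (full = 0, i = 0; i < RINGSZ; i++)
			full += ring[i];
		assert(cksum == full);
	}
	printf("checksum stays consistent: %u\n", cksum);
	return (0);
}
#endif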
/*
* Append a character to a message buffer.
*/
void
msgbuf_addchar(struct msgbuf *mbp, int c)
{
mtx_lock_spin(&mbp->msg_lock);
msgbuf_do_addchar(mbp, &mbp->msg_wseq, c);
mtx_unlock_spin(&mbp->msg_lock);
}
/*
* Append a NUL-terminated string with a priority to a message buffer.
* Filter carriage returns if the caller requests it.
*
* XXX The carriage return filtering behavior is present in the
* msglogchar() API; however, testing has shown that we don't seem to send
* carriage returns down this path. So do we still need it?
*/
void
msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr)
{
u_int seq;
size_t len, prefix_len;
char prefix[MAXPRIBUF];
char buf[32];
- int nl, i, j, needtime;
+ int i, j, needtime;
len = strlen(str);
prefix_len = 0;
- nl = 0;
/* If we have a zero-length string, no need to do anything. */
if (len == 0)
return;
mtx_lock_spin(&mbp->msg_lock);
/*
* If this is true, we may need to insert a new priority sequence,
* so prepare the prefix.
*/
if (pri != -1)
prefix_len = sprintf(prefix, "<%d>", pri);
/*
* Starting write sequence number.
*/
seq = mbp->msg_wseq;
/*
* Whenever there is a change in priority, we have to insert a
* newline, and a priority prefix if the priority is not -1. Here
* we detect whether there was a priority change, and whether we
* did not end with a newline. If that is the case, we need to
* insert a newline before this string.
*/
if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) {
msgbuf_do_addchar(mbp, &seq, '\n');
mbp->msg_flags &= ~MSGBUF_NEEDNL;
}
needtime = 1;
for (i = 0; i < len; i++) {
/*
* If we just had a newline, and the priority is not -1
* (and therefore prefix_len != 0), then we need a priority
* prefix for this line.
*/
if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) {
int j;
for (j = 0; j < prefix_len; j++)
msgbuf_do_addchar(mbp, &seq, prefix[j]);
}
if (msgbuf_show_timestamp && needtime == 1 &&
(mbp->msg_flags & MSGBUF_NEEDNL) == 0) {
snprintf(buf, sizeof(buf), "[%jd] ",
(intmax_t)time_uptime);
for (j = 0; buf[j] != '\0'; j++)
msgbuf_do_addchar(mbp, &seq, buf[j]);
needtime = 0;
}
/*
* Don't copy carriage returns if the caller requested
* filtering.
*
* XXX This matches the behavior of msglogchar(), but is it
* necessary? Testing has shown that we don't seem to get
* carriage returns here.
*/
if ((filter_cr != 0) && (str[i] == '\r'))
continue;
/*
* Clear this flag if we see a newline. This affects whether
* we need to insert a new prefix or insert a newline later.
*/
if (str[i] == '\n')
mbp->msg_flags &= ~MSGBUF_NEEDNL;
else
mbp->msg_flags |= MSGBUF_NEEDNL;
msgbuf_do_addchar(mbp, &seq, str[i]);
}
/*
* Update the write sequence number for the actual number of
* characters we put in the message buffer. (Depends on whether
* carriage returns are filtered.)
*/
mbp->msg_wseq = seq;
/*
* Set the last priority.
*/
mbp->msg_lastpri = pri;
mtx_unlock_spin(&mbp->msg_lock);
}
/*
* Read and mark as read a character from a message buffer.
* Returns the character, or -1 if no characters are available.
*/
int
msgbuf_getchar(struct msgbuf *mbp)
{
u_int len, wseq;
int c;
mtx_lock_spin(&mbp->msg_lock);
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (-1);
}
if (len > mbp->msg_size)
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)];
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1);
mtx_unlock_spin(&mbp->msg_lock);
return (c);
}
/*
* Read and mark as read a number of characters from a message buffer.
* Returns the number of characters that were placed in `buf'.
*/
int
msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen)
{
u_int len, pos, wseq;
mtx_lock_spin(&mbp->msg_lock);
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
if (len > mbp->msg_size) {
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
len = mbp->msg_size;
}
pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq);
len = min(len, mbp->msg_size - pos);
len = min(len, (u_int)buflen);
bcopy(&mbp->msg_ptr[pos], buf, len);
mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len);
mtx_unlock_spin(&mbp->msg_lock);
return (len);
}
/*
* Peek at the full contents of a message buffer without marking any
* data as read. `seqp' should point to an unsigned integer that
* msgbuf_peekbytes() can use to retain state between calls so that
* the whole message buffer can be read in multiple short reads.
* To initialise this variable to the start of the message buffer,
* call msgbuf_peekbytes() with a NULL `buf' parameter.
*
* Returns the number of characters that were placed in `buf'.
*/
int
msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp)
{
u_int len, pos, wseq;
mtx_lock_spin(&mbp->msg_lock);
if (buf == NULL) {
/* Just initialise *seqp. */
*seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size);
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
wseq = mbp->msg_wseq;
len = MSGBUF_SEQSUB(mbp, wseq, *seqp);
if (len == 0) {
mtx_unlock_spin(&mbp->msg_lock);
return (0);
}
if (len > mbp->msg_size) {
*seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
len = mbp->msg_size;
}
pos = MSGBUF_SEQ_TO_POS(mbp, *seqp);
len = min(len, mbp->msg_size - pos);
len = min(len, (u_int)buflen);
bcopy(&mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, *seqp)], buf, len);
*seqp = MSGBUF_SEQNORM(mbp, *seqp + len);
mtx_unlock_spin(&mbp->msg_lock);
return (len);
}
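/*
 * Editor's note (illustrative): the non-destructive read protocol described
 * above, in practice.  A first call with buf == NULL primes *seqp; subsequent
 * calls drain the buffer in short reads without advancing msg_rseq.  The
 * consumer function and its 128-byte chunk size are hypothetical.
 */
#if 0	/* illustrative sketch only */
static void
dump_msgbuf_nondestructive(struct msgbuf *mbp)
{
	char chunk[128];
	u_int seq;
	int n;

	(void)msgbuf_peekbytes(mbp, NULL, 0, &seq);	/* initialise seq */
	while ((n = msgbuf_peekbytes(mbp, chunk, sizeof(chunk), &seq)) > 0)
		printf("%.*s", n, chunk);
}
#endif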
/*
* Compute the checksum for the complete message buffer contents.
*/
static u_int
msgbuf_cksum(struct msgbuf *mbp)
{
u_int i, sum;
sum = 0;
for (i = 0; i < mbp->msg_size; i++)
sum += (u_char)mbp->msg_ptr[i];
return (sum);
}
/*
* Copy from one message buffer to another.
*/
void
msgbuf_copy(struct msgbuf *src, struct msgbuf *dst)
{
int c;
while ((c = msgbuf_getchar(src)) >= 0)
msgbuf_addchar(dst, c);
}
Index: head/sys/kern/subr_sleepqueue.c
===================================================================
--- head/sys/kern/subr_sleepqueue.c (revision 327172)
+++ head/sys/kern/subr_sleepqueue.c (revision 327173)
@@ -1,1455 +1,1454 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Implementation of sleep queues used to hold queue of threads blocked on
* a wait channel. Sleep queues are different from turnstiles in that wait
* channels are not owned by anyone, so there is no priority propagation.
* Sleep queues can also provide a timeout and can also be interrupted by
* signals. That said, there are several similarities between the turnstile
* and sleep queue implementations. (Note: turnstiles were implemented
* first.) For example, both use a hash table of the same size where each
* bucket is referred to as a "chain" that contains both a spin lock and
* a linked list of queues. An individual queue is located by using a hash
* to pick a chain, locking the chain, and then walking the chain searching
* for the queue. This means that a wait channel object does not need to
* embed its queue head just as locks do not embed their turnstile queue
* head. Threads also carry around a sleep queue that they lend to the
* wait channel when blocking. Just as in turnstiles, the queue includes
* a free list of the sleep queues of other threads blocked on the same
* wait channel in the case of multiple waiters.
*
* Some additional functionality provided by sleep queues include the
* ability to set a timeout. The timeout is managed using a per-thread
* callout that resumes a thread if it is asleep. A thread may also
* catch signals while it is asleep (aka an interruptible sleep). The
* signal code uses sleepq_abort() to interrupt a sleeping thread. Finally,
* sleep queues also provide some extra assertions. One is not allowed to
* mix the sleep/wakeup and cv APIs for a given wait channel. Also, one
* must consistently use the same lock to synchronize with a wait channel,
* though this check is currently only a warning for sleep/wakeup due to
* pre-existing abuse of that API. The same lock must also be held when
* awakening threads, though that is currently only enforced for condition
* variables.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_sleepqueue_profiling.h"
#include "opt_ddb.h"
#include "opt_sched.h"
#include "opt_stack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <machine/atomic.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
/*
* Constants for the hash table of sleep queue chains.
* SC_TABLESIZE must be a power of two for SC_MASK to work properly.
*/
#ifndef SC_TABLESIZE
#define SC_TABLESIZE 256
#endif
CTASSERT(powerof2(SC_TABLESIZE));
#define SC_MASK (SC_TABLESIZE - 1)
#define SC_SHIFT 8
#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
SC_MASK)
#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
#define NR_SLEEPQS 2
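/*
 * Editor's note (illustrative): SC_HASH() folds higher bits of the wait
 * channel pointer onto its low byte so that nearby kernel addresses spread
 * across the 256 chains.  A standalone sketch with a made-up address:
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define TABLESIZE	256			/* mirrors SC_TABLESIZE */
#define MASK		(TABLESIZE - 1)
#define SHIFT		8
#define HASH(wc)	((((uintptr_t)(wc) >> SHIFT) ^ (uintptr_t)(wc)) & MASK)

int
main(void)
{
	uintptr_t wchan = 0x80a1b2c4;		/* hypothetical wait channel */

	/* Low byte 0xc4 XOR next byte 0xb2 -> chain 0x76. */
	printf("wchan %#jx -> chain %ju\n", (uintmax_t)wchan,
	    (uintmax_t)HASH(wchan));
	return (0);
}
#endif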
/*
* There are two different lists of sleep queues. Both lists are connected
* via the sq_hash entries. The first list is the sleep queue chain list
* that a sleep queue is on when it is attached to a wait channel. The
* second list is the free list hung off of a sleep queue that is attached
* to a wait channel.
*
* Each sleep queue also contains the wait channel it is attached to, the
* list of threads blocked on that wait channel, flags specific to the
* wait channel, and the lock used to synchronize with a wait channel.
* The flags are used to catch mismatches between the various consumers
* of the sleep queue API (e.g. sleep/wakeup and condition variables).
* The lock pointer is only used when invariants are enabled for various
* debugging checks.
*
* Locking key:
* c - sleep queue chain lock
*/
struct sleepqueue {
TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */
LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
void *sq_wchan; /* (c) Wait channel. */
int sq_type; /* (c) Queue type. */
#ifdef INVARIANTS
struct lock_object *sq_lock; /* (c) Associated lock. */
#endif
};
struct sleepqueue_chain {
LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
struct mtx sc_lock; /* Spin lock for this chain. */
#ifdef SLEEPQUEUE_PROFILING
u_int sc_depth; /* Length of sc_queues. */
u_int sc_max_depth; /* Max length of sc_queues. */
#endif
} __aligned(CACHE_LINE_SIZE);
#ifdef SLEEPQUEUE_PROFILING
u_int sleepq_max_depth;
static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
"sleepq chain stats");
SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
0, "maxmimum depth achieved of a single chain");
static void sleepq_profile(const char *wmesg);
static int prof_enabled;
#endif
static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
static uma_zone_t sleepq_zone;
/*
* Prototypes for non-exported routines.
*/
static int sleepq_catch_signals(void *wchan, int pri);
static int sleepq_check_signals(void);
static int sleepq_check_timeout(void);
#ifdef INVARIANTS
static void sleepq_dtor(void *mem, int size, void *arg);
#endif
static int sleepq_init(void *mem, int size, int flags);
static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
int pri);
static void sleepq_switch(void *wchan, int pri);
static void sleepq_timeout(void *arg);
SDT_PROBE_DECLARE(sched, , , sleep);
SDT_PROBE_DECLARE(sched, , , wakeup);
/*
* Initialize SLEEPQUEUE_PROFILING specific sysctl nodes.
* Note that it must happen after sleepinit() has been fully executed, so
* it must happen after SI_SUB_KMEM SYSINIT() subsystem setup.
*/
#ifdef SLEEPQUEUE_PROFILING
static void
init_sleepqueue_profiling(void)
{
char chain_name[10];
struct sysctl_oid *chain_oid;
u_int i;
for (i = 0; i < SC_TABLESIZE; i++) {
snprintf(chain_name, sizeof(chain_name), "%u", i);
chain_oid = SYSCTL_ADD_NODE(NULL,
SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
"depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
"max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
NULL);
}
}
SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
init_sleepqueue_profiling, NULL);
#endif
/*
* Early initialization of sleep queues that is called from the sleepinit()
* SYSINIT.
*/
void
init_sleepqueues(void)
{
int i;
for (i = 0; i < SC_TABLESIZE; i++) {
LIST_INIT(&sleepq_chains[i].sc_queues);
mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
MTX_SPIN | MTX_RECURSE);
}
sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
#ifdef INVARIANTS
NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#else
NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
#endif
thread0.td_sleepqueue = sleepq_alloc();
}
/*
* Get a sleep queue for a new thread.
*/
struct sleepqueue *
sleepq_alloc(void)
{
return (uma_zalloc(sleepq_zone, M_WAITOK));
}
/*
* Free a sleep queue when a thread is destroyed.
*/
void
sleepq_free(struct sleepqueue *sq)
{
uma_zfree(sleepq_zone, sq);
}
/*
* Lock the sleep queue chain associated with the specified wait channel.
*/
void
sleepq_lock(void *wchan)
{
struct sleepqueue_chain *sc;
sc = SC_LOOKUP(wchan);
mtx_lock_spin(&sc->sc_lock);
}
/*
* Look up the sleep queue associated with a given wait channel in the hash
* table; the associated sleep queue chain must already be locked. If no queue
* is found in the table, NULL is returned.
*/
struct sleepqueue *
sleepq_lookup(void *wchan)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
if (sq->sq_wchan == wchan)
return (sq);
return (NULL);
}
/*
* Unlock the sleep queue chain associated with a given wait channel.
*/
void
sleepq_release(void *wchan)
{
struct sleepqueue_chain *sc;
sc = SC_LOOKUP(wchan);
mtx_unlock_spin(&sc->sc_lock);
}
/*
* Places the current thread on the sleep queue for the specified wait
* channel. If INVARIANTS is enabled, then it associates the passed in
* lock with the sleepq to make sure it is held when that sleep queue is
* woken up.
*/
void
sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
int queue)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(td->td_sleepqueue != NULL);
MPASS(wchan != NULL);
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
/* If this thread is not allowed to sleep, die a horrible death. */
KASSERT(td->td_no_sleeping == 0,
("%s: td %p to sleep on wchan %p with sleeping prohibited",
__func__, td, wchan));
/* Look up the sleep queue associated with the wait channel 'wchan'. */
sq = sleepq_lookup(wchan);
/*
* If the wait channel does not already have a sleep queue, use
* this thread's sleep queue. Otherwise, insert the current thread
* into the sleep queue already in use by this wait channel.
*/
if (sq == NULL) {
#ifdef INVARIANTS
int i;
sq = td->td_sleepqueue;
for (i = 0; i < NR_SLEEPQS; i++) {
KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
("thread's sleep queue %d is not empty", i));
KASSERT(sq->sq_blockedcnt[i] == 0,
("thread's sleep queue %d count mismatches", i));
}
KASSERT(LIST_EMPTY(&sq->sq_free),
("thread's sleep queue has a non-empty free list"));
KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
sq->sq_lock = lock;
#endif
#ifdef SLEEPQUEUE_PROFILING
sc->sc_depth++;
if (sc->sc_depth > sc->sc_max_depth) {
sc->sc_max_depth = sc->sc_depth;
if (sc->sc_max_depth > sleepq_max_depth)
sleepq_max_depth = sc->sc_max_depth;
}
#endif
sq = td->td_sleepqueue;
LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
sq->sq_wchan = wchan;
sq->sq_type = flags & SLEEPQ_TYPE;
} else {
MPASS(wchan == sq->sq_wchan);
MPASS(lock == sq->sq_lock);
MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
}
thread_lock(td);
TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
sq->sq_blockedcnt[queue]++;
td->td_sleepqueue = NULL;
td->td_sqqueue = queue;
td->td_wchan = wchan;
td->td_wmesg = wmesg;
if (flags & SLEEPQ_INTERRUPTIBLE) {
td->td_flags |= TDF_SINTR;
td->td_flags &= ~TDF_SLEEPABORT;
}
thread_unlock(td);
}
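/*
 * Editor's note (illustrative): a condensed sketch of the consumer protocol
 * around sleepq_add(), loosely following sleepqueue(9).  The wait channel
 * object, the wmesg string and the absence of an interlock are hypothetical
 * simplifications, not a drop-in recipe.
 */
#if 0	/* illustrative sketch only */
static int hypothetical_event;		/* any kernel address can serve as a wchan */

static void
wait_for_hypothetical_event(void)
{
	void *wchan = &hypothetical_event;

	sleepq_lock(wchan);		/* lock the chain for this wchan */
	sleepq_add(wchan, NULL, "hypev", SLEEPQ_SLEEP, 0);
	sleepq_wait(wchan, 0);		/* blocks; returns once awakened */
	/*
	 * A waker would hold sleepq_lock(wchan), wake one or all waiters on
	 * the same wchan (sleepq_signal()/sleepq_broadcast()), and then call
	 * sleepq_release(wchan).
	 */
}
#endif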
/*
* Sets a timeout that will remove the current thread from the specified
* sleep queue after timo ticks if the thread has not already been awakened.
*/
void
sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
int flags)
{
struct sleepqueue_chain *sc;
struct thread *td;
sbintime_t pr1;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_sleepqueue == NULL);
MPASS(wchan != NULL);
if (cold && td == &thread0)
panic("timed sleep before timers are working");
KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx",
td->td_tid, td, (uintmax_t)td->td_sleeptimo));
thread_lock(td);
callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1);
thread_unlock(td);
callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1,
sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC |
C_DIRECT_EXEC);
}
/*
* Return the number of actual sleepers for the specified queue.
*/
u_int
sleepq_sleepcnt(void *wchan, int queue)
{
struct sleepqueue *sq;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
return (sq->sq_blockedcnt[queue]);
}
/*
* Marks the pending sleep of the current thread as interruptible and
* makes an initial check for pending signals before putting a thread
* to sleep. Enters and exits with the thread lock held. Thread lock
* may have transitioned from the sleepq lock to a run lock.
*/
static int
sleepq_catch_signals(void *wchan, int pri)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
struct proc *p;
struct sigacts *ps;
int sig, ret;
ret = 0;
td = curthread;
p = curproc;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
MPASS(wchan != NULL);
if ((td->td_pflags & TDP_WAKEUP) != 0) {
td->td_pflags &= ~TDP_WAKEUP;
ret = EINTR;
thread_lock(td);
goto out;
}
/*
* See if there are any pending signals or suspension requests for this
* thread. If not, we can switch immediately.
*/
thread_lock(td);
if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) != 0) {
thread_unlock(td);
mtx_unlock_spin(&sc->sc_lock);
CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
(void *)td, (long)p->p_pid, td->td_name);
PROC_LOCK(p);
/*
* Check for suspension first. Checking for signals and then
* suspending could result in a missed signal, since a signal
* can be delivered while this thread is suspended.
*/
if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
ret = thread_suspend_check(1);
MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
if (ret != 0) {
PROC_UNLOCK(p);
mtx_lock_spin(&sc->sc_lock);
thread_lock(td);
goto out;
}
}
if ((td->td_flags & TDF_NEEDSIGCHK) != 0) {
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
sig = cursig(td);
if (sig == -1) {
mtx_unlock(&ps->ps_mtx);
KASSERT((td->td_flags & TDF_SBDRY) != 0,
("lost TDF_SBDRY"));
KASSERT(TD_SBDRY_INTR(td),
("lost TDF_SERESTART of TDF_SEINTR"));
KASSERT((td->td_flags &
(TDF_SEINTR | TDF_SERESTART)) !=
(TDF_SEINTR | TDF_SERESTART),
("both TDF_SEINTR and TDF_SERESTART"));
ret = TD_SBDRY_ERRNO(td);
} else if (sig != 0) {
ret = SIGISMEMBER(ps->ps_sigintr, sig) ?
EINTR : ERESTART;
mtx_unlock(&ps->ps_mtx);
} else {
mtx_unlock(&ps->ps_mtx);
}
}
/*
* Lock the per-process spinlock prior to dropping the PROC_LOCK
* to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and
* thread_lock() are currently held in tdsendsignal().
*/
PROC_SLOCK(p);
mtx_lock_spin(&sc->sc_lock);
PROC_UNLOCK(p);
thread_lock(td);
PROC_SUNLOCK(p);
}
if (ret == 0) {
sleepq_switch(wchan, pri);
return (0);
}
out:
/*
* There were pending signals and this thread is still
* on the sleep queue, remove it from the sleep queue.
*/
if (TD_ON_SLEEPQ(td)) {
sq = sleepq_lookup(wchan);
if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
/*
* This thread hasn't gone to sleep yet, so it
* should not be swapped out.
*/
panic("not waking up swapper");
#endif
}
}
mtx_unlock_spin(&sc->sc_lock);
MPASS(td->td_lock != &sc->sc_lock);
return (ret);
}
/*
* Switches to another thread if we are still asleep on a sleep queue.
* Returns with thread lock.
*/
static void
sleepq_switch(void *wchan, int pri)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
bool rtc_changed;
td = curthread;
sc = SC_LOOKUP(wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If we have a sleep queue, then we've already been woken up, so
* just return.
*/
if (td->td_sleepqueue != NULL) {
mtx_unlock_spin(&sc->sc_lock);
return;
}
/*
* If TDF_TIMEOUT is set, then our sleep has been timed out
* already but we are still on the sleep queue, so dequeue the
* thread and return.
*
* Do the same if the real-time clock has been adjusted since this
* thread calculated its timeout based on that clock. This handles
* the following race:
* - The Ts thread needs to sleep until an absolute real-clock time.
* It copies the global rtc_generation into curthread->td_rtcgen,
* reads the RTC, and calculates a sleep duration based on that time.
* See umtxq_sleep() for an example.
* - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes
* threads that are sleeping until an absolute real-clock time.
* See tc_setclock() and the POSIX specification of clock_settime().
* - Ts reaches the code below. It holds the sleepqueue chain lock,
* so Tc has finished its wakeup pass; this thread must therefore test
* td_rtcgen itself.
* (The declaration of td_rtcgen refers to this comment.)
*/
rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation;
if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) {
if (rtc_changed) {
td->td_rtcgen = 0;
}
MPASS(TD_ON_SLEEPQ(td));
sq = sleepq_lookup(wchan);
if (sleepq_resume_thread(sq, td, 0)) {
#ifdef INVARIANTS
/*
* This thread hasn't gone to sleep yet, so it
* should not be swapped out.
*/
panic("not waking up swapper");
#endif
}
mtx_unlock_spin(&sc->sc_lock);
return;
}
#ifdef SLEEPQUEUE_PROFILING
if (prof_enabled)
sleepq_profile(td->td_wmesg);
#endif
MPASS(td->td_sleepqueue == NULL);
sched_sleep(td, pri);
thread_lock_set(td, &sc->sc_lock);
SDT_PROBE0(sched, , , sleep);
TD_SET_SLEEPING(td);
mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
}
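/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * sleeper-side half of the rtc_generation handshake described above,
 * roughly what an absolute-real-time sleeper such as umtxq_sleep() does.
 * The exact calls there may differ; "wchan" is a placeholder.
 */
#if 0
/* Latch the generation before reading the RTC (the real code uses an
 * acquire load). */
td->td_rtcgen = rtc_generation;
/* ...read the RTC, compute a relative timeout, sleepq_add()... */
sleepq_timedwait(wchan, 0);
td->td_rtcgen = 0;	/* done sleeping against this generation */
#endif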
/*
* Check to see if we timed out.
*/
static int
sleepq_check_timeout(void)
{
struct thread *td;
int res;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If TDF_TIMEOUT is set, we timed out. But recheck
* td_sleeptimo anyway.
*/
res = 0;
if (td->td_sleeptimo != 0) {
if (td->td_sleeptimo <= sbinuptime())
res = EWOULDBLOCK;
td->td_sleeptimo = 0;
}
if (td->td_flags & TDF_TIMEOUT)
td->td_flags &= ~TDF_TIMEOUT;
else
/*
* We ignore the situation where the timeout subsystem was
* unable to stop our callout. The struct thread is
* type-stable, so the callout will use the correct
* memory when it runs. The checks of the
* td_sleeptimo value in this function and in
* sleepq_timeout() ensure that the thread does not
* get spurious wakeups, even if the callout was reset
* or the thread reused.
*/
callout_stop(&td->td_slpcallout);
return (res);
}
/*
* Check to see if we were awoken by a signal.
*/
static int
sleepq_check_signals(void)
{
struct thread *td;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/* We are no longer in an interruptible sleep. */
if (td->td_flags & TDF_SINTR)
td->td_flags &= ~TDF_SINTR;
if (td->td_flags & TDF_SLEEPABORT) {
td->td_flags &= ~TDF_SLEEPABORT;
return (td->td_intrval);
}
return (0);
}
/*
* Block the current thread until it is awakened from its sleep queue.
*/
void
sleepq_wait(void *wchan, int pri)
{
struct thread *td;
td = curthread;
MPASS(!(td->td_flags & TDF_SINTR));
thread_lock(td);
sleepq_switch(wchan, pri);
thread_unlock(td);
}
/*
* Block the current thread until it is awakened from its sleep queue
* or it is interrupted by a signal.
*/
int
sleepq_wait_sig(void *wchan, int pri)
{
int rcatch;
int rval;
rcatch = sleepq_catch_signals(wchan, pri);
rval = sleepq_check_signals();
thread_unlock(curthread);
if (rcatch)
return (rcatch);
return (rval);
}
/*
* Block the current thread until it is awakened from its sleep queue
* or it times out while waiting.
*/
int
sleepq_timedwait(void *wchan, int pri)
{
struct thread *td;
int rval;
td = curthread;
MPASS(!(td->td_flags & TDF_SINTR));
thread_lock(td);
sleepq_switch(wchan, pri);
rval = sleepq_check_timeout();
thread_unlock(td);
return (rval);
}
/*
* Block the current thread until it is awakened from its sleep queue,
* it is interrupted by a signal, or it times out waiting to be awakened.
*/
int
sleepq_timedwait_sig(void *wchan, int pri)
{
int rcatch, rvalt, rvals;
rcatch = sleepq_catch_signals(wchan, pri);
rvalt = sleepq_check_timeout();
rvals = sleepq_check_signals();
thread_unlock(curthread);
if (rcatch)
return (rcatch);
if (rvals)
return (rvals);
return (rvalt);
}
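/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * usual consumer pattern for the blocking primitives above, approximately
 * what the higher-level sleep(9)/condvar(9) code does.  "wchan", "timo",
 * the wmesg and the NULL interlock are placeholders.
 */
#if 0
int error;

sleepq_lock(wchan);
sleepq_add(wchan, NULL, "examp", SLEEPQ_SLEEP | SLEEPQ_INTERRUPTIBLE, 0);
sleepq_set_timeout(wchan, timo);	/* only for the timed variants */
error = sleepq_timedwait_sig(wchan, 0);	/* 0, EINTR, ERESTART or EWOULDBLOCK */
#endif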
/*
* Returns the type of sleepqueue given a wait channel.
*/
int
sleepq_type(void *wchan)
{
struct sleepqueue *sq;
int type;
MPASS(wchan != NULL);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
if (sq == NULL) {
sleepq_release(wchan);
return (-1);
}
type = sq->sq_type;
sleepq_release(wchan);
return (type);
}
/*
* Removes a thread from a sleep queue and makes it
* runnable.
*/
static int
sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
{
struct sleepqueue_chain *sc;
MPASS(td != NULL);
MPASS(sq->sq_wchan != NULL);
MPASS(td->td_wchan == sq->sq_wchan);
MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
THREAD_LOCK_ASSERT(td, MA_OWNED);
sc = SC_LOOKUP(sq->sq_wchan);
mtx_assert(&sc->sc_lock, MA_OWNED);
SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
/* Remove the thread from the queue. */
sq->sq_blockedcnt[td->td_sqqueue]--;
TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
/*
* Get a sleep queue for this thread. If this is the last waiter,
* use the queue itself and take it out of the chain; otherwise,
* remove a queue from the free list.
*/
if (LIST_EMPTY(&sq->sq_free)) {
td->td_sleepqueue = sq;
#ifdef INVARIANTS
sq->sq_wchan = NULL;
#endif
#ifdef SLEEPQUEUE_PROFILING
sc->sc_depth--;
#endif
} else
td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
LIST_REMOVE(td->td_sleepqueue, sq_hash);
td->td_wmesg = NULL;
td->td_wchan = NULL;
td->td_flags &= ~TDF_SINTR;
CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, td->td_name);
/* Adjust priority if requested. */
MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
if (pri != 0 && td->td_priority > pri &&
PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, pri);
/*
* Note that thread td might not be sleeping if it is running
* sleepq_catch_signals() on another CPU or is blocked on its
* proc lock to check signals. There's no need to mark the
* thread runnable in that case.
*/
if (TD_IS_SLEEPING(td)) {
TD_CLR_SLEEPING(td);
return (setrunnable(td));
}
return (0);
}
#ifdef INVARIANTS
/*
* UMA zone item deallocator.
*/
static void
sleepq_dtor(void *mem, int size, void *arg)
{
struct sleepqueue *sq;
int i;
sq = mem;
for (i = 0; i < NR_SLEEPQS; i++) {
MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
MPASS(sq->sq_blockedcnt[i] == 0);
}
}
#endif
/*
* UMA zone item initializer.
*/
static int
sleepq_init(void *mem, int size, int flags)
{
struct sleepqueue *sq;
int i;
bzero(mem, size);
sq = mem;
for (i = 0; i < NR_SLEEPQS; i++) {
TAILQ_INIT(&sq->sq_blocked[i]);
sq->sq_blockedcnt[i] = 0;
}
LIST_INIT(&sq->sq_free);
return (0);
}
/*
* Find the highest priority thread sleeping on a wait channel and resume it.
*/
int
sleepq_signal(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
struct thread *td, *besttd;
int wakeup_swapper;
CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
("%s: mismatch between sleep/wakeup and cv_*", __func__));
/*
* Find the highest priority thread on the queue. If there is a
* tie, use the thread that first appears in the queue as it has
* been sleeping the longest since threads are always added to
* the tail of sleep queues.
*/
besttd = TAILQ_FIRST(&sq->sq_blocked[queue]);
TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
if (td->td_priority < besttd->td_priority)
besttd = td;
}
MPASS(besttd != NULL);
thread_lock(besttd);
wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
thread_unlock(besttd);
return (wakeup_swapper);
}
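/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * wakeup-side pattern used by callers such as wakeup_one(); "wchan" is a
 * placeholder.
 */
#if 0
int wakeup_swapper;

sleepq_lock(wchan);
wakeup_swapper = sleepq_signal(wchan, SLEEPQ_SLEEP, 0, 0);
sleepq_release(wchan);
if (wakeup_swapper)
	kick_proc0();	/* a resumed thread was swapped out */
#endif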
static bool
match_any(struct thread *td __unused)
{
return (true);
}
/*
* Resume all threads sleeping on a specified wait channel.
*/
int
sleepq_broadcast(void *wchan, int flags, int pri, int queue)
{
struct sleepqueue *sq;
CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
sq = sleepq_lookup(wchan);
if (sq == NULL)
return (0);
KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
("%s: mismatch between sleep/wakeup and cv_*", __func__));
return (sleepq_remove_matching(sq, queue, match_any, pri));
}
/*
* Resume threads on the sleep queue that match the given predicate.
*/
int
sleepq_remove_matching(struct sleepqueue *sq, int queue,
bool (*matches)(struct thread *), int pri)
{
struct thread *td, *tdn;
int wakeup_swapper;
/*
* The last thread will be given ownership of sq and may
* re-enqueue itself before sleepq_resume_thread() returns,
* so we must cache the "next" queue item at the beginning
* of the final iteration.
*/
wakeup_swapper = 0;
TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
thread_lock(td);
if (matches(td))
wakeup_swapper |= sleepq_resume_thread(sq, td, pri);
thread_unlock(td);
}
return (wakeup_swapper);
}
/*
* Time sleeping threads out. When the timeout expires, the thread is
* removed from the sleep queue and made runnable if it is still asleep.
*/
static void
sleepq_timeout(void *arg)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
struct thread *td;
void *wchan;
int wakeup_swapper;
td = arg;
wakeup_swapper = 0;
CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
thread_lock(td);
if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) {
/*
* The thread does not want a timeout (yet).
*/
} else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
/*
* See if the thread is asleep and get the wait
* channel if it is.
*/
wchan = td->td_wchan;
sc = SC_LOOKUP(wchan);
THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
sq = sleepq_lookup(wchan);
MPASS(sq != NULL);
td->td_flags |= TDF_TIMEOUT;
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
} else if (TD_ON_SLEEPQ(td)) {
/*
* If the thread is on the SLEEPQ but isn't sleeping
* yet, it can either be on another CPU in between
* sleepq_add() and one of the sleepq_*wait*()
* routines or it can be in sleepq_catch_signals().
*/
td->td_flags |= TDF_TIMEOUT;
}
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
}
/*
* Resumes a specific thread from the sleep queue associated with a specific
* wait channel if it is on that queue.
*/
void
sleepq_remove(struct thread *td, void *wchan)
{
struct sleepqueue *sq;
int wakeup_swapper;
/*
* Look up the sleep queue for this wait channel, then re-check
* that the thread is asleep on that channel; if it is not, then
* bail.
*/
MPASS(wchan != NULL);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
/*
* We cannot lock the thread here as it may be sleeping on a
* different sleepq. However, holding the sleepq lock for this
* wchan guarantees that we do not miss a wakeup for this
* channel. The asserts below will catch any false positives.
*/
if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
sleepq_release(wchan);
return;
}
/* Thread is asleep on sleep queue sq, so wake it up. */
thread_lock(td);
MPASS(sq != NULL);
MPASS(td->td_wchan == wchan);
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
thread_unlock(td);
sleepq_release(wchan);
if (wakeup_swapper)
kick_proc0();
}
/*
* Abort a thread as if an interrupt had occurred. Only abort
* interruptible waits (unfortunately it isn't safe to abort others).
*/
int
sleepq_abort(struct thread *td, int intrval)
{
struct sleepqueue *sq;
void *wchan;
THREAD_LOCK_ASSERT(td, MA_OWNED);
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_flags & TDF_SINTR);
MPASS(intrval == EINTR || intrval == ERESTART);
/*
* If the TDF_TIMEOUT flag is set, just leave. A
* timeout is scheduled anyhow.
*/
if (td->td_flags & TDF_TIMEOUT)
return (0);
CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
td->td_intrval = intrval;
td->td_flags |= TDF_SLEEPABORT;
/*
* If the thread has not slept yet, it will find the signal in
* sleepq_catch_signals() and call sleepq_resume_thread(). Otherwise
* we have to do it here.
*/
if (!TD_IS_SLEEPING(td))
return (0);
wchan = td->td_wchan;
MPASS(wchan != NULL);
sq = sleepq_lookup(wchan);
MPASS(sq != NULL);
/* Thread is asleep on sleep queue sq, so wake it up. */
return (sleepq_resume_thread(sq, td, 0));
}
void
sleepq_chains_remove_matching(bool (*matches)(struct thread *))
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
int i, wakeup_swapper;
wakeup_swapper = 0;
for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) {
if (LIST_EMPTY(&sc->sc_queues)) {
continue;
}
mtx_lock_spin(&sc->sc_lock);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash) {
for (i = 0; i < NR_SLEEPQS; ++i) {
wakeup_swapper |= sleepq_remove_matching(sq, i,
matches, 0);
}
}
mtx_unlock_spin(&sc->sc_lock);
}
if (wakeup_swapper) {
kick_proc0();
}
}
/*
* Prints the stacks of all threads presently sleeping on wchan/queue to
* the sbuf sb. Sets count_stacks_printed to the number of stacks actually
* printed. Typically, this will equal the number of threads sleeping on the
* queue, but may be less if sb overflowed before all stacks were printed.
*/
#ifdef STACK
int
sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
int *count_stacks_printed)
{
struct thread *td, *td_next;
struct sleepqueue *sq;
struct stack **st;
struct sbuf **td_infos;
int i, stack_idx, error, stacks_to_allocate;
- bool finished, partial_print;
+ bool finished;
error = 0;
finished = false;
- partial_print = false;
KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
MPASS((queue >= 0) && (queue < NR_SLEEPQS));
stacks_to_allocate = 10;
for (i = 0; i < 3 && !finished ; i++) {
/* We cannot malloc while holding the queue's spinlock, so
* we do our mallocs now, and hope it is enough. If it
* isn't, we will free these, drop the lock, malloc more,
* and try again, up to a point. After that point we will
* give up and report ENOMEM. We also cannot write to sb
* during this time since the client may have set the
* SBUF_AUTOEXTEND flag on their sbuf, which could cause a
* malloc as we print to it. So we defer actually printing
* to sb until after we drop the spinlock.
*/
/* Where we will store the stacks. */
st = malloc(sizeof(struct stack *) * stacks_to_allocate,
M_TEMP, M_WAITOK);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
st[stack_idx] = stack_create(M_WAITOK);
/* Where we will store the td name, tid, etc. */
td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate,
M_TEMP, M_WAITOK);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
td_infos[stack_idx] = sbuf_new(NULL, NULL,
MAXCOMLEN + sizeof(struct thread *) * 2 + 40,
SBUF_FIXEDLEN);
sleepq_lock(wchan);
sq = sleepq_lookup(wchan);
if (sq == NULL) {
/* This sleepq does not exist; exit and return ENOENT. */
error = ENOENT;
finished = true;
sleepq_release(wchan);
goto loop_end;
}
stack_idx = 0;
/* Save thread info */
TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq,
td_next) {
if (stack_idx >= stacks_to_allocate)
goto loop_end;
/* Note the td_lock is equal to the sleepq_lock here. */
stack_save_td(st[stack_idx], td);
sbuf_printf(td_infos[stack_idx], "%d: %s %p",
td->td_tid, td->td_name, td);
++stack_idx;
}
finished = true;
sleepq_release(wchan);
/* Print the stacks */
for (i = 0; i < stack_idx; i++) {
sbuf_finish(td_infos[i]);
sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i]));
stack_sbuf_print(sb, st[i]);
sbuf_printf(sb, "\n");
error = sbuf_error(sb);
if (error == 0)
*count_stacks_printed = stack_idx;
}
loop_end:
if (!finished)
sleepq_release(wchan);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
stack_destroy(st[stack_idx]);
for (stack_idx = 0; stack_idx < stacks_to_allocate;
stack_idx++)
sbuf_delete(td_infos[stack_idx]);
free(st, M_TEMP);
free(td_infos, M_TEMP);
stacks_to_allocate *= 10;
}
if (!finished && error == 0)
error = ENOMEM;
return (error);
}
#endif
#ifdef SLEEPQUEUE_PROFILING
#define SLEEPQ_PROF_LOCATIONS 1024
#define SLEEPQ_SBUFSIZE 512
struct sleepq_prof {
LIST_ENTRY(sleepq_prof) sp_link;
const char *sp_wmesg;
long sp_count;
};
LIST_HEAD(sqphead, sleepq_prof);
struct sqphead sleepq_prof_free;
struct sqphead sleepq_hash[SC_TABLESIZE];
static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
static struct mtx sleepq_prof_lock;
MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
static void
sleepq_profile(const char *wmesg)
{
struct sleepq_prof *sp;
mtx_lock_spin(&sleepq_prof_lock);
if (prof_enabled == 0)
goto unlock;
LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
if (sp->sp_wmesg == wmesg)
goto done;
sp = LIST_FIRST(&sleepq_prof_free);
if (sp == NULL)
goto unlock;
sp->sp_wmesg = wmesg;
LIST_REMOVE(sp, sp_link);
LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
done:
sp->sp_count++;
unlock:
mtx_unlock_spin(&sleepq_prof_lock);
return;
}
static void
sleepq_prof_reset(void)
{
struct sleepq_prof *sp;
int enabled;
int i;
mtx_lock_spin(&sleepq_prof_lock);
enabled = prof_enabled;
prof_enabled = 0;
for (i = 0; i < SC_TABLESIZE; i++)
LIST_INIT(&sleepq_hash[i]);
LIST_INIT(&sleepq_prof_free);
for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
sp = &sleepq_profent[i];
sp->sp_wmesg = NULL;
sp->sp_count = 0;
LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
}
prof_enabled = enabled;
mtx_unlock_spin(&sleepq_prof_lock);
}
static int
enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
{
int error, v;
v = prof_enabled;
error = sysctl_handle_int(oidp, &v, v, req);
if (error)
return (error);
if (req->newptr == NULL)
return (error);
if (v == prof_enabled)
return (0);
if (v == 1)
sleepq_prof_reset();
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = !!v;
mtx_unlock_spin(&sleepq_prof_lock);
return (0);
}
static int
reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
{
int error, v;
v = 0;
error = sysctl_handle_int(oidp, &v, 0, req);
if (error)
return (error);
if (req->newptr == NULL)
return (error);
if (v == 0)
return (0);
sleepq_prof_reset();
return (0);
}
static int
dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
{
struct sleepq_prof *sp;
struct sbuf *sb;
int enabled;
int error;
int i;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
sbuf_printf(sb, "\nwmesg\tcount\n");
enabled = prof_enabled;
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = 0;
mtx_unlock_spin(&sleepq_prof_lock);
for (i = 0; i < SC_TABLESIZE; i++) {
LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
sbuf_printf(sb, "%s\t%ld\n",
sp->sp_wmesg, sp->sp_count);
}
}
mtx_lock_spin(&sleepq_prof_lock);
prof_enabled = enabled;
mtx_unlock_spin(&sleepq_prof_lock);
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
NULL, 0, reset_sleepq_prof_stats, "I",
"Reset sleepqueue profiling statistics");
SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
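/*
 * Usage note: with SLEEPQUEUE_PROFILING compiled in, the knobs above are
 * driven from userland, e.g.:
 *	sysctl debug.sleepq.enable=1	(start profiling)
 *	sysctl debug.sleepq.stats	(dump wmesg/count pairs)
 *	sysctl debug.sleepq.reset=1	(clear the table)
 */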
#endif
#ifdef DDB
DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
{
struct sleepqueue_chain *sc;
struct sleepqueue *sq;
#ifdef INVARIANTS
struct lock_object *lock;
#endif
struct thread *td;
void *wchan;
int i;
if (!have_addr)
return;
/*
* First, see if there is an active sleep queue for the wait channel
* indicated by the address.
*/
wchan = (void *)addr;
sc = SC_LOOKUP(wchan);
LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
if (sq->sq_wchan == wchan)
goto found;
/*
* Second, see if there is an active sleep queue at the address
* indicated.
*/
for (i = 0; i < SC_TABLESIZE; i++)
LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
if (sq == (struct sleepqueue *)addr)
goto found;
}
db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
return;
found:
db_printf("Wait channel: %p\n", sq->sq_wchan);
db_printf("Queue type: %d\n", sq->sq_type);
#ifdef INVARIANTS
if (sq->sq_lock) {
lock = sq->sq_lock;
db_printf("Associated Interlock: %p - (%s) %s\n", lock,
LOCK_CLASS(lock)->lc_name, lock->lo_name);
}
#endif
db_printf("Blocked threads:\n");
for (i = 0; i < NR_SLEEPQS; i++) {
db_printf("\nQueue[%d]:\n", i);
if (TAILQ_EMPTY(&sq->sq_blocked[i]))
db_printf("\tempty\n");
else
TAILQ_FOREACH(td, &sq->sq_blocked[i],
td_slpq) {
db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
td->td_tid, td->td_proc->p_pid,
td->td_name);
}
db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
}
}
/* Alias 'show sleepqueue' to 'show sleepq'. */
DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
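/*
 * Usage note: from the ddb(4) prompt the command takes a wait channel or
 * struct sleepqueue address, e.g. "show sleepq <addr>"; "show sleepqueue"
 * is an equivalent alias.
 */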
#endif
Index: head/sys/kern/subr_witness.c
===================================================================
--- head/sys/kern/subr_witness.c (revision 327172)
+++ head/sys/kern/subr_witness.c (revision 327173)
@@ -1,3058 +1,3047 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2008 Isilon Systems, Inc.
* Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
* Copyright (c) 1998 Berkeley Software Design, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Berkeley Software Design Inc's name may not be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
* and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
*/
/*
* Implementation of the `witness' lock verifier. Originally implemented for
* mutexes in BSD/OS. Extended to handle generic lock objects and lock
* classes in FreeBSD.
*/
/*
* Main Entry: witness
* Pronunciation: 'wit-n&s
* Function: noun
* Etymology: Middle English witnesse, from Old English witnes knowledge,
* testimony, witness, from 2wit
* Date: before 12th century
* 1 : attestation of a fact or event : TESTIMONY
* 2 : one that gives evidence; specifically : one who testifies in
* a cause or before a judicial tribunal
* 3 : one asked to be present at a transaction so as to be able to
* testify to its having taken place
* 4 : one who has personal knowledge of something
* 5 a : something serving as evidence or proof : SIGN
* b : public affirmation by word or example of usually
* religious faith or conviction <the heroic witness to divine
* life -- Pilot>
* 6 capitalized : a member of the Jehovah's Witnesses
*/
/*
* Special rules concerning Giant and lock orders:
*
* 1) Giant must be acquired before any other mutexes. Stated another way,
* no other mutex may be held when Giant is acquired.
*
* 2) Giant must be released when blocking on a sleepable lock.
*
* This rule is less obvious, but is a result of Giant providing the same
* semantics as spl(). Basically, when a thread sleeps, it must release
* Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule
* 2).
*
* 3) Giant may be acquired before or after sleepable locks.
*
* This rule is also not quite as obvious. Giant may be acquired after
* a sleepable lock because it is a non-sleepable lock and non-sleepable
* locks may always be acquired while holding a sleepable lock. The second
* case, Giant before a sleepable lock, follows from rule 2) above. Suppose
* you have two threads T1 and T2 and a sleepable lock X. Suppose that T1
* acquires X and blocks on Giant. Then suppose that T2 acquires Giant and
* blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to
* execute. Thus, acquiring Giant both before and after a sleepable lock
* will not result in a lock order reversal.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_hwpmc_hooks.h"
#include "opt_stack.h"
#include "opt_witness.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <machine/stdarg.h>
#if !defined(DDB) && !defined(STACK)
#error "DDB or STACK options are required for WITNESS"
#endif
/* Note that these traces do not work with KTR_ALQ. */
#if 0
#define KTR_WITNESS KTR_SUBSYS
#else
#define KTR_WITNESS 0
#endif
#define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */
#define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */
#define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */
/* Define this to check for blessed mutexes */
#undef BLESSING
#ifndef WITNESS_COUNT
#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
#define WITNESS_PENDLIST (2048 + MAXCPU)
/* Allocate 256 KB of stack data space */
#define WITNESS_LO_DATA_COUNT 2048
/* Prime, gives load factor of ~2 at full load */
#define WITNESS_LO_HASH_SIZE 1021
/*
* XXX: This is somewhat bogus, as we assume here that at most 2048 threads
* will hold LOCK_NCHILDREN locks. We handle failure ok, and we should
* probably be safe for the most part, but it's still a SWAG.
*/
#define LOCK_NCHILDREN 5
#define LOCK_CHILDCOUNT 2048
#define MAX_W_NAME 64
#define FULLGRAPH_SBUF_SIZE 512
/*
* These flags go in the witness relationship matrix and describe the
* relationship between any two struct witness objects.
*/
#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
#define WITNESS_RELATED_MASK \
(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
* observed. */
#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
/* Descendant to ancestor flags */
#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
/* Ancestor to descendant flags */
#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2)
#define WITNESS_INDEX_ASSERT(i) \
MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
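/*
 * Illustrative sketch (not compiled, not part of the original file): how the
 * relationship flags above are consulted.  A bit from WITNESS_ANCESTOR_MASK
 * set in w_rmatrix[a][b] means witness "a" is ordered before (is an ancestor
 * of) witness "b"; w1/w2 are placeholders.
 */
#if 0
if (w_rmatrix[w1->w_index][w2->w_index] & WITNESS_ANCESTOR_MASK)
	/* w1 is a direct or indirect ancestor of w2. */;
/* The mirrored entry w_rmatrix[w2->w_index][w1->w_index] carries the
 * descendant flags; e.g. WITNESS_ATOD(WITNESS_PARENT) == WITNESS_CHILD. */
#endif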
static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
/*
* Lock instances. A lock instance is the data associated with a lock while
* it is held by witness. For example, a lock instance will hold the
* recursion count of a lock. Lock instances are held in lists. Spin locks
* are held in a per-cpu list while sleep locks are held in a per-thread list.
*/
struct lock_instance {
struct lock_object *li_lock;
const char *li_file;
int li_line;
u_int li_flags;
};
/*
* A simple list type used to build the list of locks held by a thread
* or CPU. We can't simply embed the list in struct lock_object since a
* lock may be held by more than one thread if it is a shared lock. Locks
* are added to the head of the list, so we fill up each list entry from
* "the back" logically. To ease some of the arithmetic, we actually fill
* in each list entry the normal way (children[0] then children[1], etc.) but
* when we traverse the list we read children[count-1] as the first entry
* down to children[0] as the final entry.
*/
struct lock_list_entry {
struct lock_list_entry *ll_next;
struct lock_instance ll_children[LOCK_NCHILDREN];
u_int ll_count;
};
/*
* The main witness structure. One of these per named lock type in the system
* (for example, "vnode interlock").
*/
struct witness {
char w_name[MAX_W_NAME];
uint32_t w_index; /* Index in the relationship matrix */
struct lock_class *w_class;
STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
struct witness *w_hash_next; /* Linked list in hash buckets. */
const char *w_file; /* File where last acquired */
uint32_t w_line; /* Line where last acquired */
uint32_t w_refcount;
uint16_t w_num_ancestors; /* direct/indirect
* ancestor count */
uint16_t w_num_descendants; /* direct/indirect
* descendant count */
int16_t w_ddb_level;
unsigned w_displayed:1;
unsigned w_reversed:1;
};
STAILQ_HEAD(witness_list, witness);
/*
* The witness hash table. Keys are witness names (const char *), elements are
* witness objects (struct witness *).
*/
struct witness_hash {
struct witness *wh_array[WITNESS_HASH_SIZE];
uint32_t wh_size;
uint32_t wh_count;
};
/*
* Key type for the lock order data hash table.
*/
struct witness_lock_order_key {
uint16_t from;
uint16_t to;
};
struct witness_lock_order_data {
struct stack wlod_stack;
struct witness_lock_order_key wlod_key;
struct witness_lock_order_data *wlod_next;
};
/*
* The witness lock order data hash table. Keys are witness index tuples
* (struct witness_lock_order_key), elements are lock order data objects
* (struct witness_lock_order_data).
*/
struct witness_lock_order_hash {
struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
u_int wloh_size;
u_int wloh_count;
};
#ifdef BLESSING
struct witness_blessed {
const char *b_lock1;
const char *b_lock2;
};
#endif
struct witness_pendhelp {
const char *wh_type;
struct lock_object *wh_lock;
};
struct witness_order_list_entry {
const char *w_name;
struct lock_class *w_class;
};
/*
* Returns 0 if one of the locks is a spin lock and the other is not.
* Returns 1 otherwise.
*/
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{
return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}
static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
const struct witness_lock_order_key *b)
{
return (a->from == b->from && a->to == b->to);
}
static int _isitmyx(struct witness *w1, struct witness *w2, int rmask,
const char *fname);
static void adopt(struct witness *parent, struct witness *child);
#ifdef BLESSING
static int blessed(struct witness *, struct witness *);
#endif
static void depart(struct witness *w);
static struct witness *enroll(const char *description,
struct lock_class *lock_class);
static struct lock_instance *find_instance(struct lock_list_entry *list,
const struct lock_object *lock);
static int isitmychild(struct witness *parent, struct witness *child);
static int isitmydescendant(struct witness *parent, struct witness *child);
static void itismychild(struct witness *parent, struct witness *child);
static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
#ifdef DDB
static void witness_ddb_compute_levels(void);
static void witness_ddb_display(int(*)(const char *fmt, ...));
static void witness_ddb_display_descendants(int(*)(const char *fmt, ...),
struct witness *, int indent);
static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list);
static void witness_ddb_level_descendants(struct witness *parent, int l);
static void witness_ddb_list(struct thread *td);
#endif
static void witness_debugger(int cond, const char *msg);
static void witness_free(struct witness *m);
static struct witness *witness_get(void);
static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size);
static struct witness *witness_hash_get(const char *key);
static void witness_hash_put(struct witness *w);
static void witness_init_hash_tables(void);
static void witness_increment_graph_generation(void);
static void witness_lock_list_free(struct lock_list_entry *lle);
static struct lock_list_entry *witness_lock_list_get(void);
static int witness_lock_order_add(struct witness *parent,
struct witness *child);
static int witness_lock_order_check(struct witness *parent,
struct witness *child);
static struct witness_lock_order_data *witness_lock_order_get(
struct witness *parent,
struct witness *child);
static void witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...));
static int witness_output(const char *fmt, ...) __printflike(1, 2);
static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
static void witness_setflag(struct lock_object *lock, int flag, int set);
static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
"Witness Locking");
/*
* If set to 0, lock order checking is disabled. If set to -1,
* witness is completely disabled. Otherwise witness performs full
* lock order checking for all locks. At runtime, lock order checking
* may be toggled. However, witness cannot be reenabled once it is
* completely disabled.
*/
static int witness_watch = 1;
SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
sysctl_debug_witness_watch, "I", "witness is watching lock operations");
#ifdef KDB
/*
* When KDB is enabled and witness_kdb is 1, it will cause the system
* to drop into kdebug() when:
* - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
#ifdef WITNESS_KDB
int witness_kdb = 1;
#else
int witness_kdb = 0;
#endif
SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
#endif /* KDB */
#if defined(DDB) || defined(KDB)
/*
* When DDB or KDB is enabled and witness_trace is 1, it will cause the system
* to print a stack trace when:
* - a lock hierarchy violation occurs
* - locks are held when going to sleep.
*/
int witness_trace = 1;
SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
#endif /* DDB || KDB */
#ifdef WITNESS_SKIPSPIN
int witness_skipspin = 1;
#else
int witness_skipspin = 0;
#endif
SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
int badstack_sbuf_size;
int witness_count = WITNESS_COUNT;
SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN,
&witness_count, 0, "");
/*
* Output channel for witness messages. By default we print to the console.
*/
enum witness_channel {
WITNESS_CONSOLE,
WITNESS_LOG,
WITNESS_NONE,
};
static enum witness_channel witness_channel = WITNESS_CONSOLE;
SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
"Output channel for warnings");
/*
* Call this to print out the relations between locks.
*/
SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
/*
* Call this to print out the faulty witness stacks.
*/
SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
static struct mtx w_mtx;
/* w_list */
static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
/* w_typelist */
static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
/* lock list */
static struct lock_list_entry *w_lock_list_free = NULL;
static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
static u_int pending_cnt;
static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
"");
static struct witness *w_data;
static uint8_t **w_rmatrix;
static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
static struct witness_hash w_hash; /* The witness hash table. */
/* The lock order data hash */
static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
static struct witness_lock_order_data *w_lofree = NULL;
static struct witness_lock_order_hash w_lohash;
static int w_max_used_index = 0;
static unsigned int w_generation = 0;
static const char w_notrunning[] = "Witness not running\n";
static const char w_stillcold[] = "Witness is still cold\n";
static struct witness_order_list_entry order_lists[] = {
/*
* sx locks
*/
{ "proctree", &lock_class_sx },
{ "allproc", &lock_class_sx },
{ "allprison", &lock_class_sx },
{ NULL, NULL },
/*
* Various mutexes
*/
{ "Giant", &lock_class_mtx_sleep },
{ "pipe mutex", &lock_class_mtx_sleep },
{ "sigio lock", &lock_class_mtx_sleep },
{ "process group", &lock_class_mtx_sleep },
{ "process lock", &lock_class_mtx_sleep },
{ "session", &lock_class_mtx_sleep },
{ "uidinfo hash", &lock_class_rw },
#ifdef HWPMC_HOOKS
{ "pmc-sleep", &lock_class_mtx_sleep },
#endif
{ "time lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* umtx
*/
{ "umtx lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* Sockets
*/
{ "accept", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ "so_rcv", &lock_class_mtx_sleep },
{ "sellck", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* Routing
*/
{ "so_rcv", &lock_class_mtx_sleep },
{ "radix node head", &lock_class_rw },
{ "rtentry", &lock_class_mtx_sleep },
{ "ifaddr", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* IPv4 multicast:
* protocol locks before interface locks, after UDP locks.
*/
{ "udpinp", &lock_class_rw },
{ "in_multi_mtx", &lock_class_mtx_sleep },
{ "igmp_mtx", &lock_class_mtx_sleep },
{ "if_addr_lock", &lock_class_rw },
{ NULL, NULL },
/*
* IPv6 multicast:
* protocol locks before interface locks, after UDP locks.
*/
{ "udpinp", &lock_class_rw },
{ "in6_multi_mtx", &lock_class_mtx_sleep },
{ "mld_mtx", &lock_class_mtx_sleep },
{ "if_addr_lock", &lock_class_rw },
{ NULL, NULL },
/*
* UNIX Domain Sockets
*/
{ "unp_link_rwlock", &lock_class_rw },
{ "unp_list_lock", &lock_class_mtx_sleep },
{ "unp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* UDP/IP
*/
{ "udp", &lock_class_rw },
{ "udpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
{ "tcp", &lock_class_rw },
{ "tcpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* BPF
*/
{ "bpf global lock", &lock_class_mtx_sleep },
{ "bpf interface lock", &lock_class_rw },
{ "bpf cdev lock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* NFS server
*/
{ "nfsd_mtx", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* IEEE 802.11
*/
{ "802.11 com lock", &lock_class_mtx_sleep},
{ NULL, NULL },
/*
* Network drivers
*/
{ "network driver", &lock_class_mtx_sleep},
{ NULL, NULL },
/*
* Netgraph
*/
{ "ng_node", &lock_class_mtx_sleep },
{ "ng_worklist", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* VM
*/
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction
*/
{ "kqueue", &lock_class_mtx_sleep },
{ "struct mount mtx", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* VFS namecache
*/
{ "ncvn", &lock_class_mtx_sleep },
{ "ncbuc", &lock_class_rw },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "ncneg", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* ZFS locking
*/
{ "dn->dn_mtx", &lock_class_sx },
{ "dr->dt.di.dr_mtx", &lock_class_sx },
{ "db->db_mtx", &lock_class_sx },
{ NULL, NULL },
/*
* spin locks
*/
#ifdef SMP
{ "ap boot", &lock_class_mtx_spin },
#endif
{ "rm.mutex_mtx", &lock_class_mtx_spin },
{ "sio", &lock_class_mtx_spin },
#ifdef __i386__
{ "cy", &lock_class_mtx_spin },
#endif
#ifdef __sparc64__
{ "pcib_mtx", &lock_class_mtx_spin },
{ "rtc_mtx", &lock_class_mtx_spin },
#endif
{ "scc_hwmtx", &lock_class_mtx_spin },
{ "uart_hwmtx", &lock_class_mtx_spin },
{ "fast_taskqueue", &lock_class_mtx_spin },
{ "intr table", &lock_class_mtx_spin },
#ifdef HWPMC_HOOKS
{ "pmc-per-proc", &lock_class_mtx_spin },
#endif
{ "process slock", &lock_class_mtx_spin },
{ "syscons video lock", &lock_class_mtx_spin },
{ "sleepq chain", &lock_class_mtx_spin },
{ "rm_spinlock", &lock_class_mtx_spin },
{ "turnstile chain", &lock_class_mtx_spin },
{ "turnstile lock", &lock_class_mtx_spin },
{ "sched lock", &lock_class_mtx_spin },
{ "td_contested", &lock_class_mtx_spin },
{ "callout", &lock_class_mtx_spin },
{ "entropy harvest mutex", &lock_class_mtx_spin },
#ifdef SMP
{ "smp rendezvous", &lock_class_mtx_spin },
#endif
#ifdef __powerpc__
{ "tlb0", &lock_class_mtx_spin },
#endif
/*
* leaf locks
*/
{ "intrcnt", &lock_class_mtx_spin },
{ "icu", &lock_class_mtx_spin },
#if defined(SMP) && defined(__sparc64__)
{ "ipi", &lock_class_mtx_spin },
#endif
#ifdef __i386__
{ "allpmaps", &lock_class_mtx_spin },
{ "descriptor tables", &lock_class_mtx_spin },
#endif
{ "clk", &lock_class_mtx_spin },
{ "cpuset", &lock_class_mtx_spin },
{ "mprof lock", &lock_class_mtx_spin },
{ "zombie lock", &lock_class_mtx_spin },
{ "ALD Queue", &lock_class_mtx_spin },
#if defined(__i386__) || defined(__amd64__)
{ "pcicfg", &lock_class_mtx_spin },
{ "NDIS thread lock", &lock_class_mtx_spin },
#endif
{ "tw_osl_io_lock", &lock_class_mtx_spin },
{ "tw_osl_q_lock", &lock_class_mtx_spin },
{ "tw_cl_io_lock", &lock_class_mtx_spin },
{ "tw_cl_intr_lock", &lock_class_mtx_spin },
{ "tw_cl_gen_lock", &lock_class_mtx_spin },
#ifdef HWPMC_HOOKS
{ "pmc-leaf", &lock_class_mtx_spin },
#endif
{ "blocked lock", &lock_class_mtx_spin },
{ NULL, NULL },
{ NULL, NULL }
};
#ifdef BLESSING
/*
* Pairs of locks which have been blessed.
* Don't complain about order problems with blessed locks.
*/
static struct witness_blessed blessed_list[] = {
};
#endif
/*
* This global is set to 0 once it becomes safe to use the witness code.
*/
static int witness_cold = 1;
/*
* This global is set to 1 once the static lock orders have been enrolled
* so that a warning can be issued for any spin locks enrolled later.
*/
static int witness_spin_warn = 0;
/* Trim useless garbage from filenames. */
static const char *
fixup_filename(const char *file)
{
if (file == NULL)
return (NULL);
while (strncmp(file, "../", 3) == 0)
file += 3;
return (file);
}
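/*
 * Illustrative sketch (not compiled, not part of the original file): the
 * function above only strips leading "../" components, e.g.:
 */
#if 0
const char *p = fixup_filename("../../../sys/kern/subr_witness.c");
/* p now points at "sys/kern/subr_witness.c". */
#endif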
/*
* The WITNESS-enabled diagnostic code. Note that the witness code assumes
* that early boot is single-threaded, at least until after this
* routine is completed.
*/
static void
witness_initialize(void *dummy __unused)
{
struct lock_object *lock;
struct witness_order_list_entry *order;
struct witness *w, *w1;
int i;
w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS,
M_WAITOK | M_ZERO);
w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1),
M_WITNESS, M_WAITOK | M_ZERO);
for (i = 0; i < witness_count + 1; i++) {
w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) *
(witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO);
}
badstack_sbuf_size = witness_count * 256;
/*
* We have to release Giant before initializing its witness
* structure so that WITNESS doesn't get confused.
*/
mtx_unlock(&Giant);
mtx_assert(&Giant, MA_NOTOWNED);
CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
MTX_NOWITNESS | MTX_NOPROFILE);
for (i = witness_count - 1; i >= 0; i--) {
w = &w_data[i];
memset(w, 0, sizeof(*w));
w_data[i].w_index = i; /* Witness index never changes. */
witness_free(w);
}
KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
("%s: Invalid list of free witness objects", __func__));
/* The witness with index 0 is left unused, to aid in debugging. */
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
for (i = 0; i < witness_count; i++) {
memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
for (i = 0; i < LOCK_CHILDCOUNT; i++)
witness_lock_list_free(&w_locklistdata[i]);
witness_init_hash_tables();
/* First add in all the specified order lists. */
for (order = order_lists; order->w_name != NULL; order++) {
w = enroll(order->w_name, order->w_class);
if (w == NULL)
continue;
w->w_file = "order list";
for (order++; order->w_name != NULL; order++) {
w1 = enroll(order->w_name, order->w_class);
if (w1 == NULL)
continue;
w1->w_file = "order list";
itismychild(w, w1);
w = w1;
}
}
witness_spin_warn = 1;
/* Iterate through all locks and add them to witness. */
for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
lock = pending_locks[i].wh_lock;
KASSERT(lock->lo_flags & LO_WITNESS,
("%s: lock %s is on pending list but not LO_WITNESS",
__func__, lock->lo_name));
lock->lo_witness = enroll(pending_locks[i].wh_type,
LOCK_CLASS(lock));
}
/* Mark the witness code as being ready for use. */
witness_cold = 0;
mtx_lock(&Giant);
}
SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
NULL);
void
witness_init(struct lock_object *lock, const char *type)
{
struct lock_class *class;
/* Various sanity checks. */
class = LOCK_CLASS(lock);
if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
(class->lc_flags & LC_RECURSABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be recursable",
__func__, class->lc_name, lock->lo_name);
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(class->lc_flags & LC_SLEEPABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be sleepable",
__func__, class->lc_name, lock->lo_name);
if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
(class->lc_flags & LC_UPGRADABLE) == 0)
kassert_panic("%s: lock (%s) %s can not be upgradable",
__func__, class->lc_name, lock->lo_name);
/*
* If we shouldn't watch this lock, then just clear lo_witness.
* Otherwise, if witness_cold is set, then it is too early to
* enroll this lock, so defer it to witness_initialize() by adding
* it to the pending_locks list. If it is not too early, then enroll
* the lock now.
*/
if (witness_watch < 1 || panicstr != NULL ||
(lock->lo_flags & LO_WITNESS) == 0)
lock->lo_witness = NULL;
else if (witness_cold) {
pending_locks[pending_cnt].wh_lock = lock;
pending_locks[pending_cnt++].wh_type = type;
if (pending_cnt > WITNESS_PENDLIST)
panic("%s: pending locks list is too small, "
"increase WITNESS_PENDLIST\n",
__func__);
} else
lock->lo_witness = enroll(type, class);
}
void
witness_destroy(struct lock_object *lock)
{
struct lock_class *class;
struct witness *w;
class = LOCK_CLASS(lock);
if (witness_cold)
panic("lock (%s) %s destroyed while witness_cold",
class->lc_name, lock->lo_name);
/* XXX: need to verify that no one holds the lock */
if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
return;
w = lock->lo_witness;
mtx_lock_spin(&w_mtx);
MPASS(w->w_refcount > 0);
w->w_refcount--;
if (w->w_refcount == 0)
depart(w);
mtx_unlock_spin(&w_mtx);
}
#ifdef DDB
static void
witness_ddb_compute_levels(void)
{
struct witness *w;
/*
* First clear all levels.
*/
STAILQ_FOREACH(w, &w_all, w_list)
w->w_ddb_level = -1;
/*
* Look for locks with no parents and level all their descendants.
*/
STAILQ_FOREACH(w, &w_all, w_list) {
/* If the witness has ancestors (is not a root), skip it. */
if (w->w_num_ancestors > 0)
continue;
witness_ddb_level_descendants(w, 0);
}
}
static void
witness_ddb_level_descendants(struct witness *w, int l)
{
int i;
if (w->w_ddb_level >= l)
return;
w->w_ddb_level = l;
l++;
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_level_descendants(&w_data[i], l);
}
}
static void
witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
struct witness *w, int indent)
{
int i;
for (i = 0; i < indent; i++)
prnt(" ");
prnt("%s (type: %s, depth: %d, active refs: %d)",
w->w_name, w->w_class->lc_name,
w->w_ddb_level, w->w_refcount);
if (w->w_displayed) {
prnt(" -- (already displayed)\n");
return;
}
w->w_displayed = 1;
if (w->w_file != NULL && w->w_line != 0)
prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
w->w_line);
else
prnt(" -- never acquired\n");
indent++;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (db_pager_quit)
return;
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_display_descendants(prnt, &w_data[i],
indent);
}
}
static void
witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list)
{
struct witness *w;
STAILQ_FOREACH(w, list, w_typelist) {
if (w->w_file == NULL || w->w_ddb_level > 0)
continue;
/* This lock has no ancestors - display its descendants. */
witness_ddb_display_descendants(prnt, w, 0);
if (db_pager_quit)
return;
}
}
static void
witness_ddb_display(int(*prnt)(const char *fmt, ...))
{
struct witness *w;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
witness_ddb_compute_levels();
/* Clear all the displayed flags. */
STAILQ_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
/*
* First, handle sleep locks which have been acquired at least
* once.
*/
prnt("Sleep locks:\n");
witness_ddb_display_list(prnt, &w_sleep);
if (db_pager_quit)
return;
/*
* Now do spin locks which have been acquired at least once.
*/
prnt("\nSpin locks:\n");
witness_ddb_display_list(prnt, &w_spin);
if (db_pager_quit)
return;
/*
* Finally, any locks which have not been acquired yet.
*/
prnt("\nLocks which were never acquired:\n");
STAILQ_FOREACH(w, &w_all, w_list) {
if (w->w_file != NULL || w->w_refcount == 0)
continue;
prnt("%s (type: %s, depth: %d)\n", w->w_name,
w->w_class->lc_name, w->w_ddb_level);
if (db_pager_quit)
return;
}
}
#endif /* DDB */
int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{
if (witness_watch == -1 || panicstr != NULL)
return (0);
/* Require locks that witness knows about. */
if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
lock2->lo_witness == NULL)
return (EINVAL);
mtx_assert(&w_mtx, MA_NOTOWNED);
mtx_lock_spin(&w_mtx);
/*
* If we already have either an explicit or implied lock order that
* is the other way around, then return an error.
*/
if (witness_watch &&
isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
mtx_unlock_spin(&w_mtx);
return (EDOOFUS);
}
/* Try to add the new order. */
CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
lock2->lo_witness->w_name, lock1->lo_witness->w_name);
itismychild(lock1->lo_witness, lock2->lo_witness);
mtx_unlock_spin(&w_mtx);
return (0);
}
void
witness_checkorder(struct lock_object *lock, int flags, const char *file,
int line, struct lock_object *interlock)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1, *lock2, *plock;
struct lock_class *class, *iclass;
struct witness *w, *w1;
struct thread *td;
int i, j;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
panicstr != NULL)
return;
w = lock->lo_witness;
class = LOCK_CLASS(lock);
td = curthread;
if (class->lc_flags & LC_SLEEPLOCK) {
/*
* Since spin locks include a critical section, this check
* implicitly enforces a lock order of all sleep locks before
* all spin locks.
*/
if (td->td_critnest != 0 && !kdb_active)
kassert_panic("acquiring blockable sleep lock with "
"spinlock or critical section held (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
/*
* If this is the first lock acquired then just return as
* no order checking is needed.
*/
lock_list = td->td_sleeplocks;
if (lock_list == NULL || lock_list->ll_count == 0)
return;
} else {
/*
* If this is the first lock, just return as no order
* checking is needed. Avoid problems with thread
* migration by pinning the thread while checking if
* spinlocks are held. If at least one spinlock is held,
* the thread is on a safe path and may be unpinned.
*/
sched_pin();
lock_list = PCPU_GET(spinlocks);
if (lock_list == NULL || lock_list->ll_count == 0) {
sched_unpin();
return;
}
sched_unpin();
}
/*
* Check to see if we are recursing on a lock we already own. If
* so, make sure that we don't mismatch exclusive and shared lock
* acquires.
*/
lock1 = find_instance(lock_list, lock);
if (lock1 != NULL) {
if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
witness_output("shared lock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
witness_output("while exclusively locked from %s:%d\n",
fixup_filename(lock1->li_file), lock1->li_line);
kassert_panic("excl->share");
}
if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
witness_output("exclusive lock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
witness_output("while share locked from %s:%d\n",
fixup_filename(lock1->li_file), lock1->li_line);
kassert_panic("share->excl");
}
return;
}
/* Warn if the interlock is not locked exactly once. */
if (interlock != NULL) {
iclass = LOCK_CLASS(interlock);
lock1 = find_instance(lock_list, interlock);
if (lock1 == NULL)
kassert_panic("interlock (%s) %s not locked @ %s:%d",
iclass->lc_name, interlock->lo_name,
fixup_filename(file), line);
else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
kassert_panic("interlock (%s) %s recursed @ %s:%d",
iclass->lc_name, interlock->lo_name,
fixup_filename(file), line);
}
/*
* Find the previously acquired lock, but ignore interlocks.
*/
plock = &lock_list->ll_children[lock_list->ll_count - 1];
if (interlock != NULL && plock->li_lock == interlock) {
if (lock_list->ll_count > 1)
plock =
&lock_list->ll_children[lock_list->ll_count - 2];
else {
lle = lock_list->ll_next;
/*
* The interlock is the only lock we hold, so
* simply return.
*/
if (lle == NULL)
return;
plock = &lle->ll_children[lle->ll_count - 1];
}
}
/*
* Try to perform most checks without a lock. If this succeeds we
* can skip acquiring the lock and return success. Otherwise we redo
* the check with the lock held to handle races with concurrent updates.
*/
w1 = plock->li_lock->lo_witness;
if (witness_lock_order_check(w1, w))
return;
mtx_lock_spin(&w_mtx);
if (witness_lock_order_check(w1, w)) {
mtx_unlock_spin(&w_mtx);
return;
}
witness_lock_order_add(w1, w);
/*
* Check for duplicate locks of the same type. Note that we only
* have to check for this on the last lock we just acquired. Any
* other cases will be caught as lock order violations.
*/
if (w1 == w) {
i = w->w_index;
if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
!(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
w_rmatrix[i][i] |= WITNESS_REVERSAL;
w->w_reversed = 1;
mtx_unlock_spin(&w_mtx);
witness_output(
"acquiring duplicate lock of same type: \"%s\"\n",
w->w_name);
witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
fixup_filename(plock->li_file), plock->li_line);
witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
fixup_filename(file), line);
witness_debugger(1, __func__);
} else
mtx_unlock_spin(&w_mtx);
return;
}
mtx_assert(&w_mtx, MA_OWNED);
/*
* If we know that the lock we are acquiring comes after
* the lock we most recently acquired in the lock order tree,
* then there is no need for any further checks.
*/
if (isitmychild(w1, w))
goto out;
for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
for (i = lle->ll_count - 1; i >= 0; i--, j++) {
MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
lock1 = &lle->ll_children[i];
/*
* Ignore the interlock.
*/
if (interlock == lock1->li_lock)
continue;
/*
* If this lock doesn't undergo witness checking,
* then skip it.
*/
w1 = lock1->li_lock->lo_witness;
if (w1 == NULL) {
KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
("lock missing witness structure"));
continue;
}
/*
* If we are locking Giant and this is a sleepable
* lock, then skip it.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
* is Giant, then skip it.
*/
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
lock1->li_lock == &Giant.lock_object)
continue;
/*
* If we are locking a sleepable lock and this lock
* isn't sleepable, we want to treat it as a lock
* order violation to enforce a general lock order of
* sleepable locks before non-sleepable locks.
*/
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
goto reversal;
/*
* If we are locking Giant and this is a non-sleepable
* lock, then treat it as a reversal.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
lock == &Giant.lock_object)
goto reversal;
/*
* Check the lock order hierarchy for a reversal.
*/
if (!isitmydescendant(w, w1))
continue;
reversal:
/*
* We have a lock order violation, check to see if it
* is allowed or has already been yelled about.
*/
#ifdef BLESSING
/*
* If the lock order is blessed, just bail. We don't
* look for other lock order violations though, which
* may be a bug.
*/
if (blessed(w, w1))
goto out;
#endif
/* Bail if this violation is known */
if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
goto out;
/* Record this as a violation */
w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
w->w_reversed = w1->w_reversed = 1;
witness_increment_graph_generation();
mtx_unlock_spin(&w_mtx);
#ifdef WITNESS_NO_VNODE
/*
* There are known LORs between VNODE locks. They are
* not an indication of a bug. VNODE locks are flagged
* as such (LO_IS_VNODE) and we don't yell if the LOR
* is between 2 VNODE locks.
*/
if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
(lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
return;
#endif
/*
* Ok, yell about it.
*/
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
witness_output(
"lock order reversal: (sleepable after non-sleepable)\n");
else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
&& lock == &Giant.lock_object)
witness_output(
"lock order reversal: (Giant after non-sleepable)\n");
else
witness_output("lock order reversal:\n");
/*
* Try to locate an earlier lock with
* witness w in our list.
*/
do {
lock2 = &lle->ll_children[i];
MPASS(lock2->li_lock != NULL);
if (lock2->li_lock->lo_witness == w)
break;
if (i == 0 && lle->ll_next != NULL) {
lle = lle->ll_next;
i = lle->ll_count - 1;
MPASS(i >= 0 && i < LOCK_NCHILDREN);
} else
i--;
} while (i >= 0);
if (i < 0) {
witness_output(" 1st %p %s (%s) @ %s:%d\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_name, fixup_filename(lock1->li_file),
lock1->li_line);
witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
lock->lo_name, w->w_name,
fixup_filename(file), line);
} else {
witness_output(" 1st %p %s (%s) @ %s:%d\n",
lock2->li_lock, lock2->li_lock->lo_name,
lock2->li_lock->lo_witness->w_name,
fixup_filename(lock2->li_file),
lock2->li_line);
witness_output(" 2nd %p %s (%s) @ %s:%d\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_name, fixup_filename(lock1->li_file),
lock1->li_line);
witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
lock->lo_name, w->w_name,
fixup_filename(file), line);
}
witness_debugger(1, __func__);
return;
}
}
/*
* If requested, build a new lock order. However, don't build a new
* relationship between a sleepable lock and Giant if it is in the
* wrong direction. The correct lock order is that sleepable locks
* always come before Giant.
*/
if (flags & LOP_NEWORDER &&
!(plock->li_lock == &Giant.lock_object &&
(lock->lo_flags & LO_SLEEPABLE) != 0)) {
CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
w->w_name, plock->li_lock->lo_witness->w_name);
itismychild(plock->li_lock->lo_witness, w);
}
out:
mtx_unlock_spin(&w_mtx);
}
void
witness_lock(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct witness *w;
struct thread *td;
if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
panicstr != NULL)
return;
w = lock->lo_witness;
td = curthread;
/* Determine lock list for this lock. */
if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
lock_list = &td->td_sleeplocks;
else
lock_list = PCPU_PTR(spinlocks);
/* Check to see if we are recursing on a lock we already own. */
instance = find_instance(*lock_list, lock);
if (instance != NULL) {
instance->li_flags++;
CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
td->td_proc->p_pid, lock->lo_name,
instance->li_flags & LI_RECURSEMASK);
instance->li_file = file;
instance->li_line = line;
return;
}
/* Update per-witness last file and line acquire. */
w->w_file = file;
w->w_line = line;
/* Find the next open lock instance in the list and fill it. */
lle = *lock_list;
if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
lle = witness_lock_list_get();
if (lle == NULL)
return;
lle->ll_next = *lock_list;
CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
td->td_proc->p_pid, lle);
*lock_list = lle;
}
instance = &lle->ll_children[lle->ll_count++];
instance->li_lock = lock;
instance->li_line = line;
instance->li_file = file;
if ((flags & LOP_EXCLUSIVE) != 0)
instance->li_flags = LI_EXCLUSIVE;
else
instance->li_flags = 0;
CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
}
void
witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_instance *instance;
struct lock_class *class;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
kassert_panic(
"upgrade of non-upgradable lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((class->lc_flags & LC_SLEEPLOCK) == 0)
kassert_panic(
"upgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
}
instance = find_instance(curthread->td_sleeplocks, lock);
if (instance == NULL) {
kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
return;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) != 0)
kassert_panic(
"upgrade of exclusive lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic(
"upgrade of recursed lock (%s) %s r=%d @ %s:%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK,
fixup_filename(file), line);
}
instance->li_flags |= LI_EXCLUSIVE;
}
void
witness_downgrade(struct lock_object *lock, int flags, const char *file,
int line)
{
struct lock_instance *instance;
struct lock_class *class;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
kassert_panic(
"downgrade of non-upgradable lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((class->lc_flags & LC_SLEEPLOCK) == 0)
kassert_panic(
"downgrade of non-sleep lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
}
instance = find_instance(curthread->td_sleeplocks, lock);
if (instance == NULL) {
kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
return;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) == 0)
kassert_panic(
"downgrade of shared lock (%s) %s @ %s:%d",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic(
"downgrade of recursed lock (%s) %s r=%d @ %s:%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK,
fixup_filename(file), line);
}
instance->li_flags &= ~LI_EXCLUSIVE;
}
void
witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct lock_class *class;
struct thread *td;
register_t s;
int i, j;
if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
return;
td = curthread;
class = LOCK_CLASS(lock);
/* Find lock instance associated with this lock. */
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = &td->td_sleeplocks;
else
lock_list = PCPU_PTR(spinlocks);
lle = *lock_list;
for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
for (i = 0; i < (*lock_list)->ll_count; i++) {
instance = &(*lock_list)->ll_children[i];
if (instance->li_lock == lock)
goto found;
}
/*
* When WITNESS is disabled through witness_watch, locks can remain
* registered in the td_sleeplocks queue.
* We have to make sure these queues get flushed, so just search for
* any leftover registered locks and remove them.
*/
if (witness_watch > 0) {
kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
lock->lo_name, fixup_filename(file), line);
return;
} else {
return;
}
found:
/* First, check for shared/exclusive mismatches. */
if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
witness_output("shared unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
witness_output("while exclusively locked from %s:%d\n",
fixup_filename(instance->li_file), instance->li_line);
kassert_panic("excl->ushare");
}
if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
witness_output("while share locked from %s:%d\n",
fixup_filename(instance->li_file),
instance->li_line);
kassert_panic("share->uexcl");
}
/* If we are recursed, unrecurse. */
if ((instance->li_flags & LI_RECURSEMASK) > 0) {
CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
td->td_proc->p_pid, instance->li_lock->lo_name,
instance->li_flags);
instance->li_flags--;
return;
}
/* The lock is now being dropped, check for NORELEASE flag */
if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
class->lc_name, lock->lo_name, fixup_filename(file), line);
kassert_panic("lock marked norelease");
}
/* Otherwise, remove this item from the list. */
s = intr_disable();
CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
td->td_proc->p_pid, instance->li_lock->lo_name,
(*lock_list)->ll_count - 1);
for (j = i; j < (*lock_list)->ll_count - 1; j++)
(*lock_list)->ll_children[j] =
(*lock_list)->ll_children[j + 1];
(*lock_list)->ll_count--;
intr_restore(s);
/*
* In order to reduce contention on w_mtx, we always want to keep a
* head object in the list so that frequent allocation from the
* free witness pool (and the subsequent locking) is avoided.
* To keep the code simple, an empty head object also means that
* there are no further objects in the list, so list ownership needs
* to be handed over to another object whenever the current head is
* freed.
*/
if ((*lock_list)->ll_count == 0) {
if (*lock_list == lle) {
if (lle->ll_next == NULL)
return;
} else
lle = *lock_list;
*lock_list = lle->ll_next;
CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
td->td_proc->p_pid, lle);
witness_lock_list_free(lle);
}
}
void
witness_thread_exit(struct thread *td)
{
struct lock_list_entry *lle;
int i, n;
lle = td->td_sleeplocks;
if (lle == NULL || panicstr != NULL)
return;
if (lle->ll_count != 0) {
for (n = 0; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
if (n == 0)
witness_output(
"Thread %p exiting with the following locks held:\n", td);
n++;
witness_list_lock(&lle->ll_children[i],
witness_output);
}
kassert_panic(
"Thread %p cannot exit while holding sleeplocks\n", td);
}
witness_lock_list_free(lle);
}
/*
* Warn if any locks other than 'lock' are held. Flags can be passed in to
* exempt Giant and sleepable locks from the checks as well. If any
* non-exempt locks are held, then a supplied message is printed to the
* output channel along with a list of the offending locks. If indicated in the
* flags then a failure results in a panic as well.
*/
int
witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1;
struct thread *td;
va_list ap;
int i, n;
if (witness_cold || witness_watch < 1 || panicstr != NULL)
return (0);
n = 0;
td = curthread;
for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
lock1 = &lle->ll_children[i];
if (lock1->li_lock == lock)
continue;
if (flags & WARN_GIANTOK &&
lock1->li_lock == &Giant.lock_object)
continue;
if (flags & WARN_SLEEPOK &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
continue;
if (n == 0) {
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ?
"non-sleepable " : "");
}
n++;
witness_list_lock(lock1, printf);
}
/*
* Pin the thread in order to avoid problems with thread migration.
* Once all the checks on spinlock ownership have passed, the thread
* is on a safe path and can be unpinned.
*/
sched_pin();
lock_list = PCPU_GET(spinlocks);
if (lock_list != NULL && lock_list->ll_count != 0) {
sched_unpin();
/*
* We should only have one spinlock and, since the exemption flags
* cannot apply to this lock class, check whether the first
* spinlock is the one curthread should hold.
*/
lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
lock1->li_lock == lock && n == 0)
return (0);
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : "");
n += witness_list_locks(&lock_list, printf);
} else
sched_unpin();
if (flags & WARN_PANIC && n)
kassert_panic("%s", __func__);
else
witness_debugger(n, __func__);
return (n);
}
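/*
 * Usage sketch (annotation, not part of this diff): callers normally reach
 * witness_warn() through the WITNESS_WARN() wrapper macro.  A sleep path,
 * for instance, might check for stray locks with something like
 *
 *	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 *	    "sleeping on \"%s\"", wmesg);
 *
 * which exempts Giant and sleepable locks and, because WARN_PANIC is not
 * set, only reports any offenders instead of panicking.  The call shape and
 * the wmesg variable are illustrative and not taken from this file.
 */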
const char *
witness_file(struct lock_object *lock)
{
struct witness *w;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
return ("?");
w = lock->lo_witness;
return (w->w_file);
}
int
witness_line(struct lock_object *lock)
{
struct witness *w;
if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
return (0);
w = lock->lo_witness;
return (w->w_line);
}
static struct witness *
enroll(const char *description, struct lock_class *lock_class)
{
struct witness *w;
- struct witness_list *typelist;
MPASS(description != NULL);
if (witness_watch == -1 || panicstr != NULL)
return (NULL);
if ((lock_class->lc_flags & LC_SPINLOCK)) {
if (witness_skipspin)
return (NULL);
- else
- typelist = &w_spin;
- } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
- typelist = &w_sleep;
- } else {
+ } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) {
kassert_panic("lock class %s is not sleep or spin",
lock_class->lc_name);
return (NULL);
}
mtx_lock_spin(&w_mtx);
w = witness_hash_get(description);
if (w)
goto found;
if ((w = witness_get()) == NULL)
return (NULL);
MPASS(strlen(description) < MAX_W_NAME);
strcpy(w->w_name, description);
w->w_class = lock_class;
w->w_refcount = 1;
STAILQ_INSERT_HEAD(&w_all, w, w_list);
if (lock_class->lc_flags & LC_SPINLOCK) {
STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
w_spin_cnt++;
} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
w_sleep_cnt++;
}
/* Insert new witness into the hash */
witness_hash_put(w);
witness_increment_graph_generation();
mtx_unlock_spin(&w_mtx);
return (w);
found:
w->w_refcount++;
if (w->w_refcount == 1)
w->w_class = lock_class;
mtx_unlock_spin(&w_mtx);
if (lock_class != w->w_class)
kassert_panic(
"lock (%s) %s does not match earlier (%s) lock",
description, lock_class->lc_name,
w->w_class->lc_name);
return (w);
}
static void
depart(struct witness *w)
{
- struct witness_list *list;
MPASS(w->w_refcount == 0);
if (w->w_class->lc_flags & LC_SLEEPLOCK) {
- list = &w_sleep;
w_sleep_cnt--;
} else {
- list = &w_spin;
w_spin_cnt--;
}
/*
* Set file to NULL as it may point into a loadable module.
*/
w->w_file = NULL;
w->w_line = 0;
witness_increment_graph_generation();
}
static void
adopt(struct witness *parent, struct witness *child)
{
int pi, ci, i, j;
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
/* If the relationship is already known, there's no work to be done. */
if (isitmychild(parent, child))
return;
/* When the structure of the graph changes, bump up the generation. */
witness_increment_graph_generation();
/*
* The hard part ... create the direct relationship, then propagate all
* indirect relationships.
*/
pi = parent->w_index;
ci = child->w_index;
WITNESS_INDEX_ASSERT(pi);
WITNESS_INDEX_ASSERT(ci);
MPASS(pi != ci);
w_rmatrix[pi][ci] |= WITNESS_PARENT;
w_rmatrix[ci][pi] |= WITNESS_CHILD;
/*
* If parent was not already an ancestor of child,
* then we increment the descendant and ancestor counters.
*/
if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
parent->w_num_descendants++;
child->w_num_ancestors++;
}
/*
* Find each ancestor of 'pi'. Note that 'pi' itself is counted as
* an ancestor of 'pi' during this loop.
*/
for (i = 1; i <= w_max_used_index; i++) {
if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
(i != pi))
continue;
/* Find each descendant of 'i' and mark it as a descendant. */
for (j = 1; j <= w_max_used_index; j++) {
/*
* Skip children that are already marked as
* descendants of 'i'.
*/
if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
continue;
/*
* We are only interested in descendants of 'ci'. Note
* that 'ci' itself is counted as a descendant of 'ci'.
*/
if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
(j != ci))
continue;
w_rmatrix[i][j] |= WITNESS_ANCESTOR;
w_rmatrix[j][i] |= WITNESS_DESCENDANT;
w_data[i].w_num_descendants++;
w_data[j].w_num_ancestors++;
/*
* Make sure we aren't marking a node as both an
* ancestor and descendant. We should have caught
* this as a lock order reversal earlier.
*/
if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
i, j, w_rmatrix[i][j]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
printf("witness rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
j, i, w_rmatrix[j][i]);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
}
}
}
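/*
 * Worked example (annotation, not in the original source): suppose the
 * matrix already records that A is an ancestor of B, and adopt(B, C) is
 * then called.  Besides setting the direct B/C parent and child bits, the
 * nested loops above also mark A as an ancestor of C and C as a descendant
 * of A, so w_rmatrix always holds the transitive closure of the lock order
 * graph.  A, B and C are hypothetical witnesses used only for illustration.
 */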
static void
itismychild(struct witness *parent, struct witness *child)
{
int unlocked;
MPASS(child != NULL && parent != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
if (!witness_lock_type_equal(parent, child)) {
if (witness_cold == 0) {
unlocked = 1;
mtx_unlock_spin(&w_mtx);
} else {
unlocked = 0;
}
kassert_panic(
"%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
"the same lock type", __func__, parent->w_name,
parent->w_class->lc_name, child->w_name,
child->w_class->lc_name);
if (unlocked)
mtx_lock_spin(&w_mtx);
}
adopt(parent, child);
}
/*
* Generic code for the isitmy*() functions. The rmask parameter is the
* expected relationship of w1 to w2.
*/
static int
_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
{
unsigned char r1, r2;
int i1, i2;
i1 = w1->w_index;
i2 = w2->w_index;
WITNESS_INDEX_ASSERT(i1);
WITNESS_INDEX_ASSERT(i2);
r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
/* The flags on one better be the inverse of the flags on the other */
if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
(WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
/* Don't squawk if we're potentially racing with an update. */
if (!mtx_owned(&w_mtx))
return (0);
printf("%s: rmatrix mismatch between %s (index %d) and %s "
"(index %d): w_rmatrix[%d][%d] == %hhx but "
"w_rmatrix[%d][%d] == %hhx\n",
fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
i2, i1, r2);
kdb_backtrace();
printf("Witness disabled.\n");
witness_watch = -1;
}
return (r1 & rmask);
}
/*
* Checks if @child is a direct child of @parent.
*/
static int
isitmychild(struct witness *parent, struct witness *child)
{
return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}
/*
* Checks if @descendant is a direct or indirect descendant of @ancestor.
*/
static int
isitmydescendant(struct witness *ancestor, struct witness *descendant)
{
return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
__func__));
}
#ifdef BLESSING
static int
blessed(struct witness *w1, struct witness *w2)
{
int i;
struct witness_blessed *b;
for (i = 0; i < nitems(blessed_list); i++) {
b = &blessed_list[i];
if (strcmp(w1->w_name, b->b_lock1) == 0) {
if (strcmp(w2->w_name, b->b_lock2) == 0)
return (1);
continue;
}
if (strcmp(w1->w_name, b->b_lock2) == 0)
if (strcmp(w2->w_name, b->b_lock1) == 0)
return (1);
}
return (0);
}
#endif
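/*
 * Annotation (assumption, not part of this diff): when BLESSING is enabled,
 * blessed_list is simply an array of witness name pairs defined earlier in
 * this file, conceptually along the lines of
 *
 *	static struct witness_blessed blessed_list[] = {
 *		{ "hypothetical lock A", "hypothetical lock B" },
 *	};
 *
 * and blessed() above treats each pair as order-agnostic, matching either
 * b_lock1/b_lock2 ordering.  The entry shown is a made-up placeholder.
 */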
static struct witness *
witness_get(void)
{
struct witness *w;
int index;
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
if (witness_watch == -1) {
mtx_unlock_spin(&w_mtx);
return (NULL);
}
if (STAILQ_EMPTY(&w_free)) {
witness_watch = -1;
mtx_unlock_spin(&w_mtx);
printf("WITNESS: unable to allocate a new witness object\n");
return (NULL);
}
w = STAILQ_FIRST(&w_free);
STAILQ_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
index = w->w_index;
MPASS(index > 0 && index == w_max_used_index+1 &&
index < witness_count);
bzero(w, sizeof(*w));
w->w_index = index;
if (index > w_max_used_index)
w_max_used_index = index;
return (w);
}
static void
witness_free(struct witness *w)
{
STAILQ_INSERT_HEAD(&w_free, w, w_list);
w_free_cnt++;
}
static struct lock_list_entry *
witness_lock_list_get(void)
{
struct lock_list_entry *lle;
if (witness_watch == -1)
return (NULL);
mtx_lock_spin(&w_mtx);
lle = w_lock_list_free;
if (lle == NULL) {
witness_watch = -1;
mtx_unlock_spin(&w_mtx);
printf("%s: witness exhausted\n", __func__);
return (NULL);
}
w_lock_list_free = lle->ll_next;
mtx_unlock_spin(&w_mtx);
bzero(lle, sizeof(*lle));
return (lle);
}
static void
witness_lock_list_free(struct lock_list_entry *lle)
{
mtx_lock_spin(&w_mtx);
lle->ll_next = w_lock_list_free;
w_lock_list_free = lle;
mtx_unlock_spin(&w_mtx);
}
static struct lock_instance *
find_instance(struct lock_list_entry *list, const struct lock_object *lock)
{
struct lock_list_entry *lle;
struct lock_instance *instance;
int i;
for (lle = list; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
instance = &lle->ll_children[i];
if (instance->li_lock == lock)
return (instance);
}
return (NULL);
}
static void
witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...))
{
struct lock_object *lock;
lock = instance->li_lock;
prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
"exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
if (lock->lo_witness->w_name != lock->lo_name)
prnt(" (%s)", lock->lo_witness->w_name);
prnt(" r = %d (%p) locked @ %s:%d\n",
instance->li_flags & LI_RECURSEMASK, lock,
fixup_filename(instance->li_file), instance->li_line);
}
static int
witness_output(const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
ret = witness_voutput(fmt, ap);
va_end(ap);
return (ret);
}
static int
witness_voutput(const char *fmt, va_list ap)
{
int ret;
ret = 0;
switch (witness_channel) {
case WITNESS_CONSOLE:
ret = vprintf(fmt, ap);
break;
case WITNESS_LOG:
vlog(LOG_NOTICE, fmt, ap);
break;
case WITNESS_NONE:
break;
}
return (ret);
}
#ifdef DDB
static int
witness_thread_has_locks(struct thread *td)
{
if (td->td_sleeplocks == NULL)
return (0);
return (td->td_sleeplocks->ll_count != 0);
}
static int
witness_proc_has_locks(struct proc *p)
{
struct thread *td;
FOREACH_THREAD_IN_PROC(p, td) {
if (witness_thread_has_locks(td))
return (1);
}
return (0);
}
#endif
int
witness_list_locks(struct lock_list_entry **lock_list,
int (*prnt)(const char *fmt, ...))
{
struct lock_list_entry *lle;
int i, nheld;
nheld = 0;
for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
witness_list_lock(&lle->ll_children[i], prnt);
nheld++;
}
return (nheld);
}
/*
* This is a bit risky at best. We call this function when we have timed
* out acquiring a spin lock, and we assume that the other CPU is stuck
* with this lock held. So, we go groveling around in the other CPU's
* per-cpu data to try to find the lock instance for this spin lock to
* see when it was last acquired.
*/
void
witness_display_spinlock(struct lock_object *lock, struct thread *owner,
int (*prnt)(const char *fmt, ...))
{
struct lock_instance *instance;
struct pcpu *pc;
if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
return;
pc = pcpu_find(owner->td_oncpu);
instance = find_instance(pc->pc_spinlocks, lock);
if (instance != NULL)
witness_list_lock(instance, prnt);
}
void
witness_save(struct lock_object *lock, const char **filep, int *linep)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
/*
* This function is used independently in locking code to deal with
* Giant; the SCHEDULER_STOPPED() check can be removed here once Giant
* is gone.
*/
if (SCHEDULER_STOPPED())
return;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL) {
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
return;
}
*filep = instance->li_file;
*linep = instance->li_line;
}
void
witness_restore(struct lock_object *lock, const char *file, int line)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
/*
* This function is used independently in locking code to deal with
* Giant; the SCHEDULER_STOPPED() check can be removed here once Giant
* is gone.
*/
if (SCHEDULER_STOPPED())
return;
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL)
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
lock->lo_witness->w_file = file;
lock->lo_witness->w_line = line;
if (instance == NULL)
return;
instance->li_file = file;
instance->li_line = line;
}
void
witness_assert(const struct lock_object *lock, int flags, const char *file,
int line)
{
#ifdef INVARIANT_SUPPORT
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if ((class->lc_flags & LC_SLEEPLOCK) != 0)
instance = find_instance(curthread->td_sleeplocks, lock);
else if ((class->lc_flags & LC_SPINLOCK) != 0)
instance = find_instance(PCPU_GET(spinlocks), lock);
else {
kassert_panic("Lock (%s) %s is not sleep or spin!",
class->lc_name, lock->lo_name);
return;
}
switch (flags) {
case LA_UNLOCKED:
if (instance != NULL)
kassert_panic("Lock (%s) %s locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
case LA_LOCKED:
case LA_LOCKED | LA_RECURSED:
case LA_LOCKED | LA_NOTRECURSED:
case LA_SLOCKED:
case LA_SLOCKED | LA_RECURSED:
case LA_SLOCKED | LA_NOTRECURSED:
case LA_XLOCKED:
case LA_XLOCKED | LA_RECURSED:
case LA_XLOCKED | LA_NOTRECURSED:
if (instance == NULL) {
kassert_panic("Lock (%s) %s not locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
}
if ((flags & LA_XLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) == 0)
kassert_panic(
"Lock (%s) %s not exclusively locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_SLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) != 0)
kassert_panic(
"Lock (%s) %s exclusively locked @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_RECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) == 0)
kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
if ((flags & LA_NOTRECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) != 0)
kassert_panic("Lock (%s) %s recursed @ %s:%d.",
class->lc_name, lock->lo_name,
fixup_filename(file), line);
break;
default:
kassert_panic("Invalid lock assertion at %s:%d.",
fixup_filename(file), line);
}
#endif /* INVARIANT_SUPPORT */
}
static void
witness_setflag(struct lock_object *lock, int flag, int set)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curthread->td_sleeplocks;
else {
if (witness_skipspin)
return;
lock_list = PCPU_GET(spinlocks);
}
instance = find_instance(lock_list, lock);
if (instance == NULL) {
kassert_panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
return;
}
if (set)
instance->li_flags |= flag;
else
instance->li_flags &= ~flag;
}
void
witness_norelease(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 1);
}
void
witness_releaseok(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 0);
}
#ifdef DDB
static void
witness_ddb_list(struct thread *td)
{
KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
KASSERT(kdb_active, ("%s: not in the debugger", __func__));
if (witness_watch < 1)
return;
witness_list_locks(&td->td_sleeplocks, db_printf);
/*
* We only handle spinlocks if td == curthread. This is somewhat broken
* if td is currently executing on some other CPU and holds spin locks
* as we won't display those locks. If we had a MI way of getting
* the per-cpu data for a given cpu then we could use
* td->td_oncpu to get the list of spinlocks for this thread
* and "fix" this.
*
* That still wouldn't really fix this unless we locked the scheduler
* lock or stopped the other CPU to make sure it wasn't changing the
* list out from under us. It is probably best to just not try to
* handle threads on other CPUs for now.
*/
if (td == curthread && PCPU_GET(spinlocks) != NULL)
witness_list_locks(PCPU_PTR(spinlocks), db_printf);
}
DB_SHOW_COMMAND(locks, db_witness_list)
{
struct thread *td;
if (have_addr)
td = db_lookup_thread(addr, true);
else
td = kdb_thread;
witness_ddb_list(td);
}
DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
{
struct thread *td;
struct proc *p;
/*
* It would be nice to list only threads and processes that actually
* hold sleep locks, but that information is currently not exported
* by WITNESS.
*/
FOREACH_PROC_IN_SYSTEM(p) {
if (!witness_proc_has_locks(p))
continue;
FOREACH_THREAD_IN_PROC(p, td) {
if (!witness_thread_has_locks(td))
continue;
db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
p->p_comm, td, td->td_tid);
witness_ddb_list(td);
if (db_pager_quit)
return;
}
}
}
DB_SHOW_ALIAS(alllocks, db_witness_list_all)
DB_SHOW_COMMAND(witness, db_witness_display)
{
witness_ddb_display(db_printf);
}
#endif
static void
sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
{
struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
struct witness *tmp_w1, *tmp_w2, *w1, *w2;
- u_int w_rmatrix1, w_rmatrix2;
int generation, i, j;
tmp_data1 = NULL;
tmp_data2 = NULL;
tmp_w1 = NULL;
tmp_w2 = NULL;
/* Allocate and init temporary storage space. */
tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
M_WAITOK | M_ZERO);
stack_zero(&tmp_data1->wlod_stack);
stack_zero(&tmp_data2->wlod_stack);
restart:
mtx_lock_spin(&w_mtx);
generation = w_generation;
mtx_unlock_spin(&w_mtx);
sbuf_printf(sb, "Number of known direct relationships is %d\n",
w_lohash.wloh_count);
for (i = 1; i < w_max_used_index; i++) {
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/* The graph has changed, try again. */
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
w1 = &w_data[i];
if (w1->w_reversed == 0) {
mtx_unlock_spin(&w_mtx);
continue;
}
/* Copy w1 locally so we can release the spin lock. */
*tmp_w1 = *w1;
mtx_unlock_spin(&w_mtx);
if (tmp_w1->w_reversed == 0)
continue;
for (j = 1; j < w_max_used_index; j++) {
if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
continue;
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/* The graph has changed, try again. */
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
w2 = &w_data[j];
data1 = witness_lock_order_get(w1, w2);
data2 = witness_lock_order_get(w2, w1);
/*
* Copy information locally so we can release the
* spin lock.
*/
*tmp_w2 = *w2;
- w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
- w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
if (data1) {
stack_zero(&tmp_data1->wlod_stack);
stack_copy(&data1->wlod_stack,
&tmp_data1->wlod_stack);
}
if (data2 && data2 != data1) {
stack_zero(&tmp_data2->wlod_stack);
stack_copy(&data2->wlod_stack,
&tmp_data2->wlod_stack);
}
mtx_unlock_spin(&w_mtx);
sbuf_printf(sb,
"\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
if (data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
tmp_w1->w_name, tmp_w1->w_class->lc_name,
tmp_w2->w_name, tmp_w2->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data1->wlod_stack);
sbuf_printf(sb, "\n");
}
if (data2 && data2 != data1) {
sbuf_printf(sb,
"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
tmp_w2->w_name, tmp_w2->w_class->lc_name,
tmp_w1->w_name, tmp_w1->w_class->lc_name);
stack_sbuf_print(sb, &tmp_data2->wlod_stack);
sbuf_printf(sb, "\n");
}
}
}
mtx_lock_spin(&w_mtx);
if (generation != w_generation) {
mtx_unlock_spin(&w_mtx);
/*
* The graph changed while we were printing stack data,
* try again.
*/
*oldidx = 0;
sbuf_clear(sb);
goto restart;
}
mtx_unlock_spin(&w_mtx);
/* Free temporary storage space. */
free(tmp_data1, M_TEMP);
free(tmp_data2, M_TEMP);
free(tmp_w1, M_TEMP);
free(tmp_w2, M_TEMP);
}
static int
sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sb;
int error;
if (witness_watch < 1) {
error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
return (error);
}
if (witness_cold) {
error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
return (error);
}
error = 0;
sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
if (sb == NULL)
return (ENOMEM);
sbuf_print_witness_badstacks(sb, &req->oldidx);
sbuf_finish(sb);
error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
sbuf_delete(sb);
return (error);
}
#ifdef DDB
static int
sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
{
return (db_printf("%.*s", len, data));
}
DB_SHOW_COMMAND(badstacks, db_witness_badstacks)
{
struct sbuf sb;
char buffer[128];
size_t dummy;
sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL);
sbuf_print_witness_badstacks(&sb, &dummy);
sbuf_finish(&sb);
}
#endif
static int
sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
{
static const struct {
enum witness_channel channel;
const char *name;
} channels[] = {
{ WITNESS_CONSOLE, "console" },
{ WITNESS_LOG, "log" },
{ WITNESS_NONE, "none" },
};
char buf[16];
u_int i;
int error;
buf[0] = '\0';
for (i = 0; i < nitems(channels); i++)
if (witness_channel == channels[i].channel) {
snprintf(buf, sizeof(buf), "%s", channels[i].name);
break;
}
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
error = EINVAL;
for (i = 0; i < nitems(channels); i++)
if (strcmp(channels[i].name, buf) == 0) {
witness_channel = channels[i].channel;
error = 0;
break;
}
return (error);
}
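/*
 * Usage note (annotation, not part of the change): this handler backs the
 * witness output-channel sysctl, presumably debug.witness.channel, so the
 * destination can be switched at runtime with something like
 *
 *	sysctl debug.witness.channel=log
 *
 * accepting exactly the strings listed above: "console", "log" or "none".
 */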
static int
sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
{
struct witness *w;
struct sbuf *sb;
int error;
if (witness_watch < 1) {
error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
return (error);
}
if (witness_cold) {
error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
return (error);
}
error = 0;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
if (sb == NULL)
return (ENOMEM);
sbuf_printf(sb, "\n");
mtx_lock_spin(&w_mtx);
STAILQ_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
STAILQ_FOREACH(w, &w_all, w_list)
witness_add_fullgraph(sb, w);
mtx_unlock_spin(&w_mtx);
/*
* Close the sbuf and return to userland.
*/
error = sbuf_finish(sb);
sbuf_delete(sb);
return (error);
}
static int
sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
{
int error, value;
value = witness_watch;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (value > 1 || value < -1 ||
(witness_watch == -1 && value != witness_watch))
return (EINVAL);
witness_watch = value;
return (0);
}
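/*
 * Annotation (not in the diff): the handler above only accepts values in
 * the range [-1, 1] and rejects any change once witness_watch has been set
 * to -1, so disabling WITNESS completely (e.g. via something like
 * "sysctl debug.witness.watch=-1", assuming that is the MIB name) is a
 * one-way operation for the lifetime of the running kernel.
 */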
static void
witness_add_fullgraph(struct sbuf *sb, struct witness *w)
{
int i;
if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
return;
w->w_displayed = 1;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
w_data[i].w_name);
witness_add_fullgraph(sb, &w_data[i]);
}
}
}
/*
* A simple hash function. Takes a key pointer and a key size. If size == 0,
* interprets the key as a string and reads until the null
* terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
* hash value computed from the key.
*/
static uint32_t
witness_hash_djb2(const uint8_t *key, uint32_t size)
{
unsigned int hash = 5381;
int i;
/* hash = hash * 33 + key[i] */
if (size)
for (i = 0; i < size; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
else
for (i = 0; key[i] != 0; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
return (hash);
}
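/*
 * Worked example (annotation): hashing the two-byte string "ab" starts
 * from 5381, then 5381 * 33 + 'a' (97) = 177670, then
 * 177670 * 33 + 'b' (98) = 5863208; callers such as witness_hash_get()
 * reduce that value modulo the table size to pick a bucket.
 */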
/*
* Initializes the two witness hash tables. Called exactly once from
* witness_initialize().
*/
static void
witness_init_hash_tables(void)
{
int i;
MPASS(witness_cold);
/* Initialize the hash tables. */
for (i = 0; i < WITNESS_HASH_SIZE; i++)
w_hash.wh_array[i] = NULL;
w_hash.wh_size = WITNESS_HASH_SIZE;
w_hash.wh_count = 0;
/* Initialize the lock order data hash. */
w_lofree = NULL;
for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
w_lodata[i].wlod_next = w_lofree;
w_lofree = &w_lodata[i];
}
w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
w_lohash.wloh_count = 0;
for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
w_lohash.wloh_array[i] = NULL;
}
static struct witness *
witness_hash_get(const char *key)
{
struct witness *w;
uint32_t hash;
MPASS(key != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
w = w_hash.wh_array[hash];
while (w != NULL) {
if (strcmp(w->w_name, key) == 0)
goto out;
w = w->w_hash_next;
}
out:
return (w);
}
static void
witness_hash_put(struct witness *w)
{
uint32_t hash;
MPASS(w != NULL);
MPASS(w->w_name != NULL);
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
KASSERT(witness_hash_get(w->w_name) == NULL,
("%s: trying to add a hash entry that already exists!", __func__));
KASSERT(w->w_hash_next == NULL,
("%s: w->w_hash_next != NULL", __func__));
hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
w->w_hash_next = w_hash.wh_array[hash];
w_hash.wh_array[hash] = w;
w_hash.wh_count++;
}
static struct witness_lock_order_data *
witness_lock_order_get(struct witness *parent, struct witness *child)
{
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
MPASS(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
WITNESS_INDEX_ASSERT(key.from);
WITNESS_INDEX_ASSERT(key.to);
if ((w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN) == 0)
goto out;
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
data = w_lohash.wloh_array[hash];
while (data != NULL) {
if (witness_lock_order_key_equal(&data->wlod_key, &key))
break;
data = data->wlod_next;
}
out:
return (data);
}
/*
* Verify that parent and child have a known relationship, are not the same,
* and child is actually a child of parent. This is done without w_mtx
* to avoid contention in the common case.
*/
static int
witness_lock_order_check(struct witness *parent, struct witness *child)
{
if (parent != child &&
w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN &&
isitmychild(parent, child))
return (1);
return (0);
}
static int
witness_lock_order_add(struct witness *parent, struct witness *child)
{
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
MPASS(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
WITNESS_INDEX_ASSERT(key.from);
WITNESS_INDEX_ASSERT(key.to);
if (w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN)
return (1);
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
data = w_lofree;
if (data == NULL)
return (0);
w_lofree = data->wlod_next;
data->wlod_next = w_lohash.wloh_array[hash];
data->wlod_key = key;
w_lohash.wloh_array[hash] = data;
w_lohash.wloh_count++;
stack_zero(&data->wlod_stack);
stack_save(&data->wlod_stack);
return (1);
}
/* Call this whenever the structure of the witness graph changes. */
static void
witness_increment_graph_generation(void)
{
if (witness_cold == 0)
mtx_assert(&w_mtx, MA_OWNED);
w_generation++;
}
static int
witness_output_drain(void *arg __unused, const char *data, int len)
{
witness_output("%.*s", len, data);
return (len);
}
static void
witness_debugger(int cond, const char *msg)
{
char buf[32];
struct sbuf sb;
struct stack st;
if (!cond)
return;
if (witness_trace) {
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, witness_output_drain, NULL);
stack_zero(&st);
stack_save(&st);
witness_output("stack backtrace:\n");
stack_sbuf_print_ddb(&sb, &st);
sbuf_finish(&sb);
}
#ifdef KDB
if (witness_kdb)
kdb_enter(KDB_WHY_WITNESS, msg);
#endif
}
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c (revision 327172)
+++ head/sys/kern/vfs_aio.c (revision 327173)
@@ -1,3005 +1,3001 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <geom/geom.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
* overflow. (XXX will be removed soon.)
*/
static u_long jobrefid;
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
#endif
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC 256
#endif
#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
#endif
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO 16
#endif
FEATURE(aio, "Asynchronous I/O");
SYSCTL_DECL(_p1003_1b);
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
"Async IO management");
static int enable_aio_unsafe = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
"Permit asynchronous IO on all file types, not just known-safe types");
static unsigned int unsafe_warningcnt = 1;
SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
&unsafe_warningcnt, 0,
"Warnings that will be triggered upon failed IO requests on unsafe files");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel processes to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel processes for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0,
"Preferred number of ready kernel processes for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O processes in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0,
"Maximum active aio requests per process (stored in the process)");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process (stored in the process)");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
/*
* Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
* sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
* vfs.aio.aio_listio_max.
*/
SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
0, "Maximum aio requests for a single lio_listio call");
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
#endif
/*
* Below is a key of the locks used to protect each member of struct kaiocb,
* aioliojob, and kaioinfo, as well as any backends.
*
* * - need not be protected
* a - locked by the kaioinfo lock
* b - locked by the backend lock; the backend lock can be null in some
* cases (for example for BIO, where the proc lock is reused instead)
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* If the routine that services an AIO request blocks while running in an
* AIO kernel process it can starve other I/O requests. BIO requests
* queued via aio_qphysio() complete in GEOM and do not use AIO kernel
* processes at all. Socket I/O requests use a separate pool of
* kprocs and also force non-blocking I/O. Other file I/O requests
* use the generic fo_read/fo_write operations which can block. The
* fsync and mlock operations can also block while executing. Ideally
* none of these requests would block while executing.
*
* Note that the service routines cannot toggle O_NONBLOCK in the file
* structure directly while handling a request due to races with
* userland threads.
*/
/* jobflags */
#define KAIOCB_QUEUEING 0x01
#define KAIOCB_CANCELLED 0x02
#define KAIOCB_CANCELLING 0x04
#define KAIOCB_CHECKSYNC 0x08
#define KAIOCB_CLEARED 0x10
#define KAIOCB_FINISHED 0x20
/*
* AIO process info
*/
#define AIOP_FREE 0x1 /* proc on free queue */
struct aioproc {
int aioprocflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aioproc) list; /* (c) list of processes */
struct proc *aioproc; /* (*) the AIO proc */
};
/*
* data-structure for lio signal management
*/
struct aioliojob {
int lioj_flags; /* (a) listio flags */
int lioj_count; /* (a) count of listio jobs */
int lioj_finished_count; /* (a) count of finished listio jobs */
struct sigevent lioj_signal; /* (a) signal on all I/O done */
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
struct knlist klist; /* (a) list of knotes */
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_maxactive_count; /* (*) maximum number of AIOs */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_qallowed_count; /* (*) maximum size of AIO queue */
int kaio_count; /* (a) size of AIO queue */
int kaio_ballowed_count; /* (*) maximum number of buffers */
int kaio_buffer_count; /* (a) number of physio buffers */
TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */
TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */
TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */
struct task kaio_task; /* (*) task to kick aio processes */
struct task kaio_sync_task; /* (*) task to schedule fsync jobs */
};
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*/
struct aiocb_ops {
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
long (*fetch_status)(struct aiocb *ujob);
long (*fetch_error)(struct aiocb *ujob);
int (*store_status)(struct aiocb *ujob, long status);
int (*store_error)(struct aiocb *ujob, long error);
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
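/*
 * Annotation (assumption, not part of the diff): later in this file the
 * native ABI supplies one of these tables, conceptually something like
 *
 *	static struct aiocb_ops aiocb_ops = {
 *		.copyin = aiocb_copyin,
 *		.fetch_status = aiocb_fetch_status,
 *		...
 *	};
 *
 * with COMPAT_FREEBSD6 and 32-bit ABIs providing their own variants; the
 * member initializers above are illustrative guesses, not text from this
 * revision.
 */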
static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct kaiocb *job);
static void aio_process_rw(struct kaiocb *job);
static void aio_process_sync(struct kaiocb *job);
static void aio_process_mlock(struct kaiocb *job);
static void aio_schedule_fsync(void *context, int pending);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *ujob,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static int aio_queue_file(struct file *fp, struct kaiocb *job);
static void aio_physwakeup(struct bio *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct kaiocb *job);
static void aio_daemon(void *param);
static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
static bool aio_clear_cancel_function_locked(struct kaiocb *job);
static int aio_kick(struct proc *userp);
static void aio_kick_nowait(struct proc *userp);
static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
static int filt_lioattach(struct knote *kn);
static void filt_liodetach(struct knote *kn);
static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
* kaio Per process async io info
* aiop async io process data
* aiocb async io jobs
* aiolio list io jobs
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
/* kqueue filters for aio */
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
static eventhandler_tag exit_tag, exec_tag;
TASKQUEUE_DEFINE_THREAD(aiod_kick);
/*
* Main operations function for use as a kernel module.
*/
static int
aio_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
aio_onceonly();
break;
case MOD_SHUTDOWN:
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
NULL, EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_syncqueue);
TAILQ_INIT(&ki->kaio_syncready);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
aio_newproc(NULL);
}
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
struct thread *td;
int error;
error = sigev_findtd(p, sigev, &td);
if (error)
return (error);
if (!KSI_ONQ(ksi)) {
ksiginfo_set_sigev(ksi, sigev);
ksi->ksi_code = SI_ASYNCIO;
ksi->ksi_flags |= KSI_EXT | KSI_INS;
tdsendsignal(p, td, ksi->ksi_signo, ksi);
}
PROC_UNLOCK(p);
return (error);
}
/*
* Free a job entry. Wait for completion if it is currently active, but don't
* delay forever. If we delay, we return a flag that tells the caller to
* restart the queue scan.
*/
static int
aio_free_entry(struct kaiocb *job)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct proc *p;
p = job->userproc;
MPASS(curproc == p);
ki = p->p_aioinfo;
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(job->jobflags & KAIOCB_FINISHED);
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, job, plist);
TAILQ_REMOVE(&ki->kaio_all, job, allist);
lj = job->lio;
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
}
}
/* job is going away, we need to destroy any knotes */
knlist_delete(&job->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&job->ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* a kaiocb from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
if (job->fd_file)
fdrop(job->fd_file, curthread);
crfree(job->cred);
uma_zfree(aiocb_zone, job);
AIO_LOCK(ki);
return (0);
}
static void
aio_proc_rundown_exec(void *arg, struct proc *p,
struct image_params *imgp __unused)
{
aio_proc_rundown(arg, p);
}
static int
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
{
aio_cancel_fn_t *func;
int cancelled;
AIO_LOCK_ASSERT(ki, MA_OWNED);
if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
return (0);
MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
job->jobflags |= KAIOCB_CANCELLED;
func = job->cancel_fn;
/*
* If there is no cancel routine, just leave the job marked as
* cancelled. The job should be in active use by a caller, which will
* complete it either normally or upon noticing the cancellation when it
* fails to install a cancel routine.
*/
if (func == NULL)
return (0);
/*
* Set the CANCELLING flag so that aio_complete() will defer
* completions of this job. This prevents the job from being
* freed out from under the cancel callback. After the
* callback any deferred completion (whether from the callback
* or any other source) will be completed.
*/
job->jobflags |= KAIOCB_CANCELLING;
AIO_UNLOCK(ki);
func(job);
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_CANCELLING;
if (job->jobflags & KAIOCB_FINISHED) {
cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(p, job);
} else {
/*
* The cancel callback might have scheduled an
* operation to cancel this request, but it is
* only counted as cancelled if the request is
* cancelled when the callback returns.
*/
cancelled = 0;
}
return (cancelled);
}
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct kaiocb *job, *jobn;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
restart:
/*
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
aio_cancel_job(p, ki, job);
}
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
goto restart;
}
/* Free all completed I/O requests. */
while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(job);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
}
AIO_UNLOCK(ki);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct kaiocb *
aio_selectjob(struct aioproc *aiop)
{
struct kaiocb *job;
struct kaioinfo *ki;
struct proc *userp;
mtx_assert(&aio_job_mtx, MA_OWNED);
restart:
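	/*
	 * Pick the first queued job whose owner is still below its
	 * per-process limit on concurrently active jobs
	 * (kaio_maxactive_count, seeded from max_aio_per_proc).
	 */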
TAILQ_FOREACH(job, &aio_jobs, list) {
userp = job->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, job, list);
if (!aio_clear_cancel_function(job))
goto restart;
/* Account for currently active jobs. */
ki->kaio_active_count++;
break;
}
}
return (job);
}
/*
* Move all data to a permanent storage device. This code
* simulates the fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
return (error);
}
/*
* The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
* does the I/O request for the non-physio version of the operations. The
* normal vn operations are used, and this code should work in all instances
* for every type of file, including pipes, sockets, fifos, and regular files.
*
* XXX I don't think this works well for sockets, pipes, and fifos.
*/
static void
aio_process_rw(struct kaiocb *job)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
struct uio auio;
struct iovec aiov;
ssize_t cnt;
long msgsnd_st, msgsnd_end;
long msgrcv_st, msgrcv_end;
long oublock_st, oublock_end;
long inblock_st, inblock_end;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
job->uaiocb.aio_lio_opcode == LIO_WRITE,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = job->cred;
cb = &job->uaiocb;
fp = job->fd_file;
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
msgrcv_st = td->td_ru.ru_msgrcv;
msgsnd_st = td->td_ru.ru_msgsnd;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
msgrcv_end = td->td_ru.ru_msgrcv;
msgsnd_end = td->td_ru.ru_msgsnd;
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
job->msgrcv = msgrcv_end - msgrcv_st;
job->msgsnd = msgsnd_end - msgsnd_st;
job->inblock = inblock_end - inblock_st;
job->outblock = oublock_end - oublock_st;
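	/*
	 * These per-job resource-usage deltas are credited back to the
	 * thread that eventually collects the result via aio_return() or
	 * aio_waitcomplete().
	 */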
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
PROC_LOCK(job->userproc);
kern_psignal(job->userproc, SIGPIPE);
PROC_UNLOCK(job->userproc);
}
}
cnt -= auio.uio_resid;
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, cnt, 0);
}
static void
aio_process_sync(struct kaiocb *job)
{
struct thread *td = curthread;
struct ucred *td_savedcred = td->td_ucred;
struct file *fp = job->fd_file;
int error = 0;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
td->td_ucred = job->cred;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
td->td_ucred = td_savedcred;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, 0, 0);
}
static void
aio_process_mlock(struct kaiocb *job)
{
struct aiocb *cb = &job->uaiocb;
int error;
KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
aio_switch_vmspace(job);
error = kern_mlock(job->userproc, job->cred,
__DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
aio_complete(job, error != 0 ? -1 : 0, error);
}
static void
aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
{
struct aioliojob *lj;
struct kaioinfo *ki;
struct kaiocb *sjob, *sjobn;
int lj_done;
bool schedule_fsync;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = job->lio;
lj_done = 0;
if (lj) {
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
MPASS(job->jobflags & KAIOCB_FINISHED);
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
KNOTE_LOCKED(&job->klist, 1);
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
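	/*
	 * If this job was holding back aio_fsync() requests queued after
	 * it on the same file, drop their pending counts and schedule any
	 * fsync whose prerequisite I/O has now completed.
	 */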
if (job->jobflags & KAIOCB_CHECKSYNC) {
schedule_fsync = false;
TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
if (job->fd_file != sjob->fd_file ||
job->seqno >= sjob->seqno)
continue;
if (--sjob->pending > 0)
continue;
TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
if (!aio_clear_cancel_function_locked(sjob))
continue;
TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
schedule_fsync = true;
}
if (schedule_fsync)
taskqueue_enqueue(taskqueue_aiod_kick,
&ki->kaio_sync_task);
}
if (ki->kaio_flags & KAIO_WAKEUP) {
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
static void
aio_schedule_fsync(void *context, int pending)
{
struct kaioinfo *ki;
struct kaiocb *job;
ki = context;
AIO_LOCK(ki);
while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
job = TAILQ_FIRST(&ki->kaio_syncready);
TAILQ_REMOVE(&ki->kaio_syncready, job, list);
AIO_UNLOCK(ki);
aio_schedule(job, aio_process_sync);
AIO_LOCK(ki);
}
AIO_UNLOCK(ki);
}
bool
aio_cancel_cleared(struct kaiocb *job)
{
- struct kaioinfo *ki;
/*
* The caller should hold the same queue lock that was held when
* aio_clear_cancel_function() was called and set this flag,
* ensuring that this check sees an up-to-date value. However,
* there is no way to assert that.
*/
- ki = job->userproc->p_aioinfo;
return ((job->jobflags & KAIOCB_CLEARED) != 0);
}
static bool
aio_clear_cancel_function_locked(struct kaiocb *job)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
MPASS(job->cancel_fn != NULL);
if (job->jobflags & KAIOCB_CANCELLING) {
job->jobflags |= KAIOCB_CLEARED;
return (false);
}
job->cancel_fn = NULL;
return (true);
}
bool
aio_clear_cancel_function(struct kaiocb *job)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_clear_cancel_function_locked(job);
AIO_UNLOCK(ki);
return (ret);
}
static bool
aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
{
AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
if (job->jobflags & KAIOCB_CANCELLED)
return (false);
job->cancel_fn = func;
return (true);
}
bool
aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
{
struct kaioinfo *ki;
bool ret;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
ret = aio_set_cancel_function_locked(job, func);
AIO_UNLOCK(ki);
return (ret);
}
void
aio_complete(struct kaiocb *job, long status, int error)
{
struct kaioinfo *ki;
struct proc *userp;
job->uaiocb._aiocb_private.error = error;
job->uaiocb._aiocb_private.status = status;
userp = job->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
KASSERT(!(job->jobflags & KAIOCB_FINISHED),
("duplicate aio_complete"));
job->jobflags |= KAIOCB_FINISHED;
if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
aio_bio_done_notify(userp, job);
}
AIO_UNLOCK(ki);
}
void
aio_cancel(struct kaiocb *job)
{
aio_complete(job, -1, ECANCELED);
}
void
aio_switch_vmspace(struct kaiocb *job)
{
vmspace_switch_aio(job->userproc->p_vmspace);
}
/*
* The AIO daemon: most of the actual work is done in aio_process_*(),
* but the setup (and address space management) is done in this routine.
*/
static void
aio_daemon(void *_id)
{
struct kaiocb *job;
struct aioproc *aiop;
struct kaioinfo *ki;
struct proc *p;
struct vmspace *myvm;
struct thread *td = curthread;
int id = (intptr_t)_id;
/*
* Grab an extra reference on the daemon's vmspace so that it
* doesn't get freed by jobs that switch to a different
* vmspace.
*/
p = td->td_proc;
myvm = vmspace_acquire_ref(p);
KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aioproc = p;
aiop->aioprocflags = 0;
/*
* Wake up the parent process. (The parent sleeps to keep from blasting
* away and creating too many daemons.)
*/
sema_post(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* Take daemon off of free queue
*/
if (aiop->aioprocflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((job = aio_selectjob(aiop)) != NULL) {
mtx_unlock(&aio_job_mtx);
ki = job->userproc->p_aioinfo;
job->handle_fn(job);
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
}
/*
* Disconnect from user address space.
*/
if (p->p_vmspace != myvm) {
mtx_unlock(&aio_job_mtx);
vmspace_switch_aio(myvm);
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid a race; we only sleep
* if no job can be selected.
*/
continue;
}
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aioprocflags |= AIOP_FREE;
/*
* If the daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
(aiop->aioprocflags & AIOP_FREE) &&
num_aio_procs > target_aio_procs)
break;
}
TAILQ_REMOVE(&aio_freeproc, aiop, list);
num_aio_procs--;
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
vmspace_free(myvm);
KASSERT(p->p_vmspace == myvm,
("AIOD: bad vmspace for exiting daemon"));
KASSERT(myvm->vm_refcnt > 1,
("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
kproc_exit(0);
}
/*
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
aio_newproc(int *start)
{
int error;
struct proc *p;
int id;
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
(*start)--;
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead physio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct kaiocb *job)
{
struct aiocb *cb;
struct file *fp;
struct bio *bp;
struct buf *pbuf;
struct vnode *vp;
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
int error, ref, poff;
vm_prot_t prot;
cb = &job->uaiocb;
fp = job->fd_file;
if (fp == NULL || fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
if (vp->v_type != VCHR)
return (-1);
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
ref = 0;
csw = devvn_refthread(vp, &dev, &ref);
if (csw == NULL)
return (ENXIO);
if ((csw->d_flags & D_DISK) == 0) {
error = -1;
goto unref;
}
if (cb->aio_nbytes > dev->si_iosize_max) {
error = -1;
goto unref;
}
ki = p->p_aioinfo;
poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
if (cb->aio_nbytes > MAXPHYS) {
error = -1;
goto unref;
}
pbuf = NULL;
} else {
if (cb->aio_nbytes > MAXPHYS - poff) {
error = -1;
goto unref;
}
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
error = -1;
goto unref;
}
job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(pbuf);
AIO_LOCK(ki);
ki->kaio_buffer_count++;
AIO_UNLOCK(ki);
}
job->bp = bp = g_alloc_bio();
bp->bio_length = cb->aio_nbytes;
bp->bio_bcount = cb->aio_nbytes;
bp->bio_done = aio_physwakeup;
bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
bp->bio_offset = cb->aio_offset;
bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
bp->bio_dev = dev;
bp->bio_caller1 = (void *)job;
prot = VM_PROT_READ;
if (cb->aio_lio_opcode == LIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
nitems(job->pages));
if (job->npages < 0) {
error = EFAULT;
goto doerror;
}
if (pbuf != NULL) {
pmap_qenter((vm_offset_t)pbuf->b_data,
job->pages, job->npages);
bp->bio_data = pbuf->b_data + poff;
atomic_add_int(&num_buf_aio, 1);
} else {
bp->bio_ma = job->pages;
bp->bio_ma_n = job->npages;
bp->bio_ma_offset = poff;
bp->bio_data = unmapped_buf;
bp->bio_flags |= BIO_UNMAPPED;
}
/* Perform transfer. */
csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
if (pbuf != NULL) {
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
relpbuf(pbuf, NULL);
job->pbuf = NULL;
}
g_destroy_bio(bp);
job->bp = NULL;
unref:
dev_relthread(dev, ref);
return (error);
}
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb *ojob;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, kjob, sizeof(struct oaiocb));
if (error)
return (error);
ojob = (struct oaiocb *)kjob;
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}
#endif
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
return (copyin(ujob, kjob, sizeof(struct aiocb)));
}
static long
aiocb_fetch_status(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.status));
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
return (suword(&ujob->_aiocb_private.status, status));
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword(ujobp, (long)ujob));
}
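/*
 * Each aiocb_ops vector abstracts one user ABI (the native layout here;
 * the 32-bit compat and old FreeBSD 6 sigevent layouts follow below) so
 * that the generic AIO code can copy in control blocks and store status,
 * error, and kernelinfo values without knowing the userland structure
 * layout.
 */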
static struct aiocb_ops aiocb_ops = {
.copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb_ops_osigevent = {
.copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
#endif
/*
* Queue a new AIO request. The choice between the threaded and the direct
* physio (VCHR) technique is made in this code.
*/
int
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
cap_rights_t rights;
struct file *fp;
struct kaiocb *job;
struct kaioinfo *ki;
struct kevent kev;
int opcode;
int error;
int fd, kqfd;
int jid;
u_short evflags;
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
ops->store_status(ujob, -1);
ops->store_error(ujob, 0);
ops->store_kernelinfo(ujob, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= ki->kaio_qallowed_count) {
ops->store_error(ujob, EAGAIN);
return (EAGAIN);
}
job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
knlist_init_mtx(&job->klist, AIO_MTX(ki));
error = ops->copyin(ujob, &job->uaiocb);
if (error) {
ops->store_error(ujob, error);
uma_zfree(aiocb_zone, job);
return (error);
}
if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(ujob, EINVAL);
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
ksiginfo_init(&job->ksi);
/* Save userspace address of the job info. */
job->ujob = ujob;
/* Get the opcode. */
if (type != LIO_NOP)
job->uaiocb.aio_lio_opcode = type;
opcode = job->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capability
* should be.
*/
fd = job->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
error = fget_write(td, fd,
cap_rights_init(&rights, CAP_PWRITE), &fp);
break;
case LIO_READ:
error = fget_read(td, fd,
cap_rights_init(&rights, CAP_PREAD), &fp);
break;
case LIO_SYNC:
error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
break;
case LIO_MLOCK:
fp = NULL;
break;
case LIO_NOP:
error = fget(td, fd, cap_rights_init(&rights), &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
job->uaiocb.aio_offset < 0 &&
(fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
error = EINVAL;
goto aqueue_fail;
}
job->fd_file = fp;
mtx_lock(&aio_job_mtx);
jid = jobrefid++;
job->seqno = jobseqno++;
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(ujob, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
return (0);
}
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
error = EINVAL;
goto aqueue_fail;
}
kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)job->ujob;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
kev.data = (intptr_t)job;
kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, 1);
if (error)
goto aqueue_fail;
no_kqueue:
ops->store_error(ujob, EINPROGRESS);
job->uaiocb._aiocb_private.error = EINPROGRESS;
job->userproc = p;
job->cred = crhold(td->td_ucred);
job->jobflags = KAIOCB_QUEUEING;
job->lio = lj;
if (opcode == LIO_MLOCK) {
aio_schedule(job, aio_process_mlock);
error = 0;
} else if (fp->f_ops->fo_aio_queue == NULL)
error = aio_queue_file(fp, job);
else
error = fo_aio_queue(fp, job);
if (error)
goto aqueue_fail;
AIO_LOCK(ki);
job->jobflags &= ~KAIOCB_QUEUEING;
TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
ki->kaio_count++;
if (lj)
lj->lioj_count++;
atomic_add_int(&num_queue_count, 1);
if (job->jobflags & KAIOCB_FINISHED) {
/*
* The queue callback completed the request synchronously.
* The bulk of the completion is deferred in that case
* until this point.
*/
aio_bio_done_notify(p, job);
} else
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
AIO_UNLOCK(ki);
return (0);
aqueue_fail:
knlist_delete(&job->klist, curthread, 0);
if (fp)
fdrop(fp, td);
uma_zfree(aiocb_zone, job);
ops->store_error(ujob, error);
return (error);
}
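/*
 * Illustrative userland sketch (not part of this file): a request whose
 * completion is delivered through kqueue, matching the SIGEV_KEVENT path
 * above. Names such as fd, kq, and buf are assumed to exist; kq comes
 * from kqueue(2).
 *
 *	struct aiocb cb;
 *	struct kevent ev;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *	    ev.filter == EVFILT_AIO)
 *		(void)aio_return((struct aiocb *)ev.ident);
 */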
static void
aio_cancel_daemon_job(struct kaiocb *job)
{
mtx_lock(&aio_job_mtx);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&aio_jobs, job, list);
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
}
void
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
{
mtx_lock(&aio_job_mtx);
if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
mtx_unlock(&aio_job_mtx);
aio_cancel(job);
return;
}
job->handle_fn = func;
TAILQ_INSERT_TAIL(&aio_jobs, job, list);
aio_kick_nowait(job->userproc);
mtx_unlock(&aio_job_mtx);
}
static void
aio_cancel_sync(struct kaiocb *job)
{
struct kaioinfo *ki;
ki = job->userproc->p_aioinfo;
AIO_LOCK(ki);
if (!aio_cancel_cleared(job))
TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
aio_cancel(job);
}
int
aio_queue_file(struct file *fp, struct kaiocb *job)
{
- struct aioliojob *lj;
struct kaioinfo *ki;
struct kaiocb *job2;
struct vnode *vp;
struct mount *mp;
int error, opcode;
bool safe;
- lj = job->lio;
ki = job->userproc->p_aioinfo;
opcode = job->uaiocb.aio_lio_opcode;
if (opcode == LIO_SYNC)
goto queueit;
if ((error = aio_qphysio(job->userproc, job)) == 0)
goto done;
#if 0
/*
* XXX: This means qphysio() failed with EFAULT. The current
* behavior is to retry the operation via fo_read/fo_write.
* Wouldn't it be better to just complete the request with an
* error here?
*/
if (error > 0)
goto done;
#endif
queueit:
safe = false;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vp->v_type == VREG || vp->v_type == VDIR) {
mp = fp->f_vnode->v_mount;
if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
safe = true;
}
}
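	/*
	 * Requests on anything other than a local regular file or
	 * directory are refused unless the administrator has enabled
	 * unsafe AIO (the enable_aio_unsafe knob, typically the
	 * vfs.aio.enable_unsafe sysctl), since such requests may block an
	 * AIO daemon for an unbounded time.
	 */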
if (!(safe || enable_aio_unsafe)) {
counted_warning(&unsafe_warningcnt,
"is attempting to use unsafe AIO requests");
return (EOPNOTSUPP);
}
if (opcode == LIO_SYNC) {
AIO_LOCK(ki);
TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
if (job2->fd_file == job->fd_file &&
job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
job2->seqno < job->seqno) {
job2->jobflags |= KAIOCB_CHECKSYNC;
job->pending++;
}
}
if (job->pending != 0) {
if (!aio_set_cancel_function_locked(job,
aio_cancel_sync)) {
AIO_UNLOCK(ki);
aio_cancel(job);
return (0);
}
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
AIO_UNLOCK(ki);
return (0);
}
AIO_UNLOCK(ki);
}
switch (opcode) {
case LIO_READ:
case LIO_WRITE:
aio_schedule(job, aio_process_rw);
error = 0;
break;
case LIO_SYNC:
aio_schedule(job, aio_process_sync);
error = 0;
break;
default:
error = EINVAL;
}
done:
return (error);
}
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start <
ki->kaio_maxactive_count) {
taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
}
}
static int
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aioproc *aiop;
int error, ret = 0;
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aioprocflags &= ~AIOP_FREE;
wakeup(aiop->aioproc);
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
ki->kaio_active_count + num_aio_resv_start <
ki->kaio_maxactive_count) {
num_aio_resv_start++;
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
}
} else {
ret = -1;
}
return (ret);
}
static void
aio_kick_helper(void *context, int pending)
{
struct proc *userp = context;
mtx_lock(&aio_job_mtx);
while (--pending >= 0) {
if (aio_kick(userp))
break;
}
mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call; as a side effect, kernel resources
* are released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
long status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_done, plist) {
if (job->ujob == ujob)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else {
error = EINVAL;
AIO_UNLOCK(ki);
}
return (error);
}
int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
/*
* Allow a process to wake up when any of its I/O requests have completed.
*/
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *firstjob, *job;
int error, i, timo;
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
firstjob = NULL;
error = 0;
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
if (job->ujob == ujoblist[i]) {
if (firstjob == NULL)
firstjob = job;
if (job->jobflags & KAIOCB_FINISHED)
goto RETURN;
}
}
}
/* All tasks were finished. */
if (firstjob == NULL)
break;
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
free(ujoblist, M_AIOS);
return (error);
}
/*
* aio_cancel cancels any non-physio aio operations not currently in
* progress.
*/
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct kaiocb *job, *jobn;
struct file *fp;
cap_rights_t rights;
int error;
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
if (error)
return (error);
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vn_isdisk(vp, &error)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
}
AIO_LOCK(ki);
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
if ((uap->fd == job->uaiocb.aio_fildes) &&
((uap->aiocbp == NULL) ||
(uap->aiocbp == job->ujob))) {
if (aio_cancel_job(p, ki, job)) {
cancelled++;
} else {
notcancelled++;
}
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
done:
fdrop(fp, td);
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
/*
* aio_error is implemented at the kernel level for compatibility purposes
* only. For a user-mode async implementation, it would be best done in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
int status;
ki = p->p_aioinfo;
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
}
AIO_LOCK(ki);
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
if (job->ujob == ujob) {
if (job->jobflags & KAIOCB_FINISHED)
td->td_retval[0] =
job->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
}
}
AIO_UNLOCK(ki);
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(ujob);
if (status == -1) {
td->td_retval[0] = ops->fetch_error(ujob);
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
int
sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
/* syscall - asynchronous read from a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
/* syscall - asynchronous write to a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb_ops_osigevent));
}
#endif
int
sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
int
sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
}
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *job;
struct kaioinfo *ki;
struct aioliojob *lj;
struct kevent kev;
int error;
int nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
lj = uma_zalloc(aiolio_zone, M_WAITOK);
lj->lioj_flags = 0;
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
/*
* Setup signal.
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
* Hold an extra count on the lio so that it cannot be freed by other
* threads doing aio_waitcomplete() or aio_return(), and so that no
* event is sent until we have queued all tasks.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
/*
* Get pointers to the list of I/O requests.
*/
nerror = 0;
for (i = 0; i < nent; i++) {
job = acb_list[i];
if (job != NULL) {
error = aio_aqueue(td, job, lj, LIO_NOP, ops);
if (error != 0)
nerror++;
}
}
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
if (nerror)
return (EIO);
return (error);
}
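/*
 * Illustrative userland sketch (not part of this file): batching two
 * requests and waiting for both with LIO_WAIT. rcb and wcb are assumed
 * to be aiocbs already filled in as for aio_read()/aio_write().
 *
 *	struct aiocb *list[2] = { &rcb, &wcb };
 *
 *	rcb.aio_lio_opcode = LIO_READ;
 *	wcb.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */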
/* syscall - list directed I/O (REALTIME) */
#ifdef COMPAT_FREEBSD6
int
freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
/* syscall - list directed I/O (REALTIME) */
int
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
static void
aio_physwakeup(struct bio *bp)
{
struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
struct proc *userp;
struct kaioinfo *ki;
size_t nbytes;
int error, nblks;
/* Release mapping into kernel space. */
userp = job->userproc;
ki = userp->p_aioinfo;
if (job->pbuf) {
pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
relpbuf(job->pbuf, NULL);
job->pbuf = NULL;
atomic_subtract_int(&num_buf_aio, 1);
AIO_LOCK(ki);
ki->kaio_buffer_count--;
AIO_UNLOCK(ki);
}
vm_page_unhold_pages(job->pages, job->npages);
bp = job->bp;
job->bp = NULL;
nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
error = 0;
if (bp->bio_flags & BIO_ERROR)
error = bp->bio_error;
nblks = btodb(nbytes);
if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
job->outblock += nblks;
else
job->inblock += nblks;
if (error)
aio_complete(job, -1, error);
else
aio_complete(job, nbytes, 0);
g_destroy_bio(bp);
}
/* syscall - wait for the next completion of an aio request */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct kaiocb *job;
struct aiocb *ujob;
long error, status;
int timo;
ops->store_aiocb(ujobp, NULL);
if (ts == NULL) {
timo = 0;
} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
timo = -1;
} else {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
error = 0;
job = NULL;
AIO_LOCK(ki);
while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
if (timo == -1) {
error = EWOULDBLOCK;
break;
}
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiowc", timo);
if (timo && error == ERESTART)
error = EINTR;
if (error)
break;
}
if (job != NULL) {
MPASS(job->jobflags & KAIOCB_FINISHED);
ujob = job->ujob;
status = job->uaiocb._aiocb_private.status;
error = job->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
td->td_ru.ru_oublock += job->outblock;
td->td_ru.ru_inblock += job->inblock;
td->td_ru.ru_msgsnd += job->msgsnd;
td->td_ru.ru_msgrcv += job->msgrcv;
aio_free_entry(job);
AIO_UNLOCK(ki);
ops->store_aiocb(ujobp, ujob);
ops->store_error(ujob, error);
ops->store_status(ujob, status);
} else
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
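/*
 * Illustrative userland sketch (not part of this file): collecting
 * whichever request finishes next with the FreeBSD-specific
 * aio_waitcomplete(2); a NULL timeout means wait indefinitely.
 * handle_completion() is a placeholder for application code.
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);
 *	if (n == -1)
 *		err(1, "aio_waitcomplete");
 *	else
 *		handle_completion(done, n);
 */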
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
struct aiocb_ops *ops)
{
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
}
int
sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
}
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
struct kaiocb *job;
job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
/*
* The job pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_aio = job;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&job->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_aio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct kaiocb *job = kn->kn_ptr.p_aio;
kn->kn_data = job->uaiocb._aiocb_private.error;
if (!(job->jobflags & KAIOCB_FINISHED))
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
struct aioliojob *lj;
lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
/*
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_liodetach(struct knote *kn)
{
struct knlist *knl;
knl = &kn->kn_ptr.p_lio->klist;
knl->kl_lock(knl->kl_lockarg);
if (!knlist_empty(knl))
knlist_remove(knl, kn, 1);
knl->kl_unlock(knl->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
#ifdef COMPAT_FREEBSD6
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
#endif
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
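/*
 * The 64-bit aio_offset members above are marked __packed so that these
 * compat structures keep the 4-byte alignment that the 32-bit ABIs use
 * for 64-bit values, rather than the host's natural 8-byte alignment.
 */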
#ifdef COMPAT_FREEBSD6
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
CP(*osig, *nsig, sigev_notify);
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
#endif
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword32(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb32_ops = {
.copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#ifdef COMPAT_FREEBSD6
static struct aiocb_ops aiocb32_ops_osigevent = {
.copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
#endif
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct aiocb **ujoblist;
uint32_t *ujoblist32;
int error, i;
if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
ujoblist32 = (uint32_t *)ujoblist;
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
sizeof(ujoblist32[0]));
if (error == 0) {
for (i = uap->nent - 1; i >= 0; i--)
ujoblist[i] = PTRIN(ujoblist32[i]);
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
}
free(ujoblist, M_AIOS);
return (error);
}
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_read(struct thread *td,
struct freebsd6_freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_aio_write(struct thread *td,
struct freebsd6_freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops_osigevent));
}
#endif
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops));
}
int
freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
&aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
&aiocb32_ops));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_lio_listio(struct thread *td,
struct freebsd6_freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
#endif
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > max_aio_queue_per_proc)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c (revision 327172)
+++ head/sys/kern/vfs_subr.c (revision 327173)
@@ -1,5566 +1,5565 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
*/
/*
* External virtual filesystem routines
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void delmntque(struct vnode *vp);
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
int slpflag, int slptimeo);
static void syncer_shutdown(void *arg, int howto);
static int vtryrecycle(struct vnode *vp);
static void v_init_counters(struct vnode *);
static void v_incr_usecount(struct vnode *);
static void v_incr_usecount_locked(struct vnode *);
static void v_incr_devcount(struct vnode *);
static void v_decr_devcount(struct vnode *);
static void vgonel(struct vnode *);
static void vfs_knllock(void *arg);
static void vfs_knlunlock(void *arg);
static void vfs_knl_assert_locked(void *arg);
static void vfs_knl_assert_unlocked(void *arg);
static void vnlru_return_batches(struct vfsops *mnt_op);
static void destroy_vpollinfo(struct vpollinfo *vi);
/*
* Number of vnodes in existence. Increased whenever getnewvnode()
* allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
*/
static unsigned long numvnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
"Number of vnodes in existence");
static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
"Number of vnodes created by getnewvnode");
static u_long mnt_free_list_batch = 128;
SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
&mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
/*
* Conversion tables for conversion from vnode types to inode formats
* and back.
*/
enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[10] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
/*
* List of vnodes that are ready for recycling.
*/
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
/*
* "Free" vnode target. Free vnodes are rarely completely free, but are
* just ones that are cheap to recycle. Usually they are for files which
* have been stat'd but not read; these usually have inode and namecache
* data attached to them. This target is the preferred minimum size of a
* sub-cache consisting mostly of such files. The system balances the size
* of this sub-cache with its complement to try to prevent either from
* thrashing while the other is relatively inactive. The targets express
* a preference for the best balance.
*
* "Above" this target there are 2 further targets (watermarks) related
* to recycling of free vnodes. In the best-operating case, the cache is
* exactly full, the free list has size between vlowat and vhiwat above the
* free target, and recycling from it and normal use maintains this state.
* Sometimes the free list is below vlowat or even empty, but this state
* is even better for immediate use provided the cache is not full.
* Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
* ones) to reach one of these states. The watermarks are currently hard-
* coded as 4% and 9% of the available space higher. These and the default
* of 25% for wantfreevnodes are too large if the memory size is large.
* E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
* whenever vnlru_proc() becomes active.
*/
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
&freevnodes, 0, "Number of \"free\" vnodes");
static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
"Number of vnodes recycled to meet vnode cache targets");
/*
* Various variables used for debugging the new implementation of
* reassignbuf().
* XXX these are probably of (very) limited utility now.
*/
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
"Number of calls to reassignbuf");
static counter_u64_t free_owe_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
"Number of times free vnodes kept on active list due to VFS "
"owing inactivation");
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;
/*
* Lock for any access to the following:
* vnode_free_list
* numvnodes
* freevnodes
*/
static struct mtx vnode_free_list_mtx;
/* Publicly exported FS */
struct nfs_public nfs_pub;
static uma_zone_t buf_trie_zone;
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;
/*
* The workitem queue.
*
* It is useful to delay writes of file data and filesystem metadata
* for tens of seconds so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To realize this,
* we append vnodes to a "workitem" queue. When running with a soft
* updates implementation, most pending metadata dependencies should
* not wait for more than a few seconds. Thus, the block devices that
* filesystems are mounted on are delayed only about half the time that
* file data is delayed.
* Similarly, directory updates are more critical, so are only delayed
* about a third the time that file data is delayed. Thus, there are
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
* one each second (driven off the filesystem syncer process). The
* syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
*/
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
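/*
 * [Editor's sketch, not part of the original file] A minimal standalone
 * illustration of the slot arithmetic described in the comment above,
 * assuming a 32-entry table (so the mask is 31); the base and delay
 * values are hypothetical, not taken from a running system.
 */
#if 0
static int
syncer_slot_sketch(void)
{
int mask = 32 - 1; /* syncer_mask for a 32-slot ring */
int base = 30; /* hypothetical current syncer_delayno */
int delay = 15; /* request a 15-second delay */

/* (30 + 15) & 31 == 13: the request wraps around the ring. */
return ((base + delay) & mask);
}
#endif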
/*
* The sync_mtx protects:
* bo->bo_synclist
* sync_vnode_count
* syncer_delayno
* syncer_state
* syncer_workitem_pending
* syncer_worklist_len
* rushjob
*/
static struct mtx sync_mtx;
static struct cv sync_wakeup;
#define SYNCER_MAXDELAY 32
static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
static int syncdelay = 30; /* max time to delay syncing data */
static int filedelay = 30; /* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
"Time to delay syncing files (in seconds)");
static int dirdelay = 29; /* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
"Time to delay syncing directories (in seconds)");
static int metadelay = 28; /* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
"Time to delay syncing metadata (in seconds)");
static int rushjob; /* number of slots to run ASAP */
static int stat_rush_requests; /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
"Number of times I/O speeded up (rush requests)");
/*
* When shutting down the syncer, run it at four times normal speed.
*/
#define SYNCER_SHUTDOWN_SPEEDUP 4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
syncer_state;
/* Target for maximum number of vnodes. */
int desiredvnodes;
static int gapvnodes; /* gap between wanted and desired */
static int vhiwat; /* enough extras after expansion */
static int vlowat; /* minimal extras before expansion */
static int vstir; /* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
{
int error, old_desiredvnodes;
old_desiredvnodes = desiredvnodes;
if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
return (error);
if (old_desiredvnodes != desiredvnodes) {
wantfreevnodes = desiredvnodes / 4;
/* XXX locking seems to be incomplete. */
vfs_hash_changesize(desiredvnodes);
cache_changesize(desiredvnodes);
}
return (0);
}
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;
/*
* Support for the bufobj clean & dirty pctrie.
*/
static void *
buf_trie_alloc(struct pctrie *ptree)
{
return uma_zalloc(buf_trie_zone, M_NOWAIT);
}
static void
buf_trie_free(struct pctrie *ptree, void *node)
{
uma_zfree(buf_trie_zone, node);
}
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
/*
* Initialize the vnode management data structures.
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
* grows, the ratio of the memory size in KB to vnodes approaches 64:1.
*/
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
#endif
/*
* Initialize a vnode as it first enters the zone.
*/
static int
vnode_init(void *mem, int size, int flags)
{
struct vnode *vp;
struct bufobj *bo;
vp = mem;
bzero(vp, size);
/*
* Setup locks.
*/
vp->v_vnlock = &vp->v_lock;
mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
LK_NOSHARE | LK_IS_VNODE);
/*
* Initialize bufobj.
*/
bo = &vp->v_bufobj;
rw_init(BO_LOCKPTR(bo), "bufobj interlock");
bo->bo_private = vp;
TAILQ_INIT(&bo->bo_clean.bv_hd);
TAILQ_INIT(&bo->bo_dirty.bv_hd);
/*
* Initialize namecache.
*/
LIST_INIT(&vp->v_cache_src);
TAILQ_INIT(&vp->v_cache_dst);
/*
* Initialize rangelocks.
*/
rangelock_init(&vp->v_rl);
return (0);
}
/*
* Free a vnode when it is cleared from the zone.
*/
static void
vnode_fini(void *mem, int size)
{
struct vnode *vp;
struct bufobj *bo;
vp = mem;
rangelock_destroy(&vp->v_rl);
lockdestroy(vp->v_vnlock);
mtx_destroy(&vp->v_interlock);
bo = &vp->v_bufobj;
rw_destroy(BO_LOCKPTR(bo));
}
/*
* Provide the size of NFS nclnode and NFS fh for calculation of the
* vnode memory consumption. The size is specified directly to
* eliminate dependency on NFS-private header.
*
* Other filesystems may use bigger or smaller (like UFS and ZFS)
* private inode data, but the NFS-based estimation is ample enough.
* Still, we care about differences in the size between 64- and 32-bit
* platforms.
*
* Namecache structure size is heuristically
* sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
*/
#ifdef _LP64
#define NFS_NCLNODE_SZ (528 + 64)
#define NC_SZ 148
#else
#define NFS_NCLNODE_SZ (360 + 32)
#define NC_SZ 92
#endif
static void
vntblinit(void *dummy __unused)
{
u_int i;
int physvnodes, virtvnodes;
/*
* Desiredvnodes is a function of the physical memory size and the
* kernel's heap size. Generally speaking, it scales with the
* physical memory size. The ratio of desiredvnodes to the physical
* memory size is 1:16 until desiredvnodes exceeds 98,304. Thereafter,
* the marginal ratio of desiredvnodes to the physical memory size is
* 1:64. However, desiredvnodes is limited by the kernel's heap
* size. The memory required by desiredvnodes vnodes and vm objects
* must not exceed 1/10th of the kernel's heap size.
*/
physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
desiredvnodes = min(physvnodes, virtvnodes);
if (desiredvnodes > MAXVNODES_MAX) {
if (bootverbose)
printf("Reducing kern.maxvnodes %d -> %d\n",
desiredvnodes, MAXVNODES_MAX);
desiredvnodes = MAXVNODES_MAX;
}
wantfreevnodes = desiredvnodes / 4;
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
TAILQ_INIT(&vnode_free_list);
mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
* Preallocate enough nodes to support one node per buf so that
* we cannot fail an insert. reassignbuf() callers cannot
* tolerate the insertion failure.
*/
buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
UMA_ZONE_NOFREE | UMA_ZONE_VM);
uma_prealloc(buf_trie_zone, nbuf);
vnodes_created = counter_u64_alloc(M_WAITOK);
recycles_count = counter_u64_alloc(M_WAITOK);
free_owe_inact = counter_u64_alloc(M_WAITOK);
/*
* Initialize the filesystem syncer.
*/
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
&syncer_mask);
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
for (i = 1; i <= sizeof(struct vnode); i <<= 1)
vnsz2log++;
vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
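/*
 * [Editor's sketch, not part of the original file] A rough worked example
 * of the physvnodes formula in vntblinit() above for a hypothetical 8 GiB
 * machine with 4 KiB pages, so pgtok(vm_cnt.v_page_count) is about
 * 8388608 KB; maxproc is left out because it only adds a comparatively
 * small constant.
 */
#if 0
static long
physvnodes_sketch(void)
{
long memkb = 8L * 1024 * 1024; /* 8 GiB expressed in KB */
long cap = 98304L * 16; /* the second term saturates at 1.5 GiB */
long lo = memkb < cap ? memkb : cap;

/* 1:16 ratio below the cap, 1:64 marginal ratio above it. */
return (memkb / 64 + 3 * lo / 64); /* = 131072 + 73728 = 204800 */
}
#endif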
/*
* Mark a mount point as busy. Used to synchronize access and to delay
* unmounting. Note that mountlist_mtx is not released on failure.
*
* vfs_busy() is a custom lock, it can block the caller.
* vfs_busy() only sleeps if the unmount is active on the mount point.
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any
* vnode belonging to mp.
*
* Lookup uses vfs_busy() to traverse mount points.
* root fs var fs
* / vnode lock A / vnode lock (/var) D
* /var vnode lock B /log vnode lock(/var/log) E
* vfs_busy lock C vfs_busy lock F
*
* Within each file system, the lock order is C->A->B and F->D->E.
*
* When traversing across mounts, the system follows that lock order:
*
* C->A->B
* |
* +->F->D->E
*
* The lookup() process for namei("/var") illustrates the process:
* VOP_LOOKUP() obtains B while A is held
* vfs_busy() obtains a shared lock on F while A and B are held
* vput() releases lock on B
* vput() releases lock on A
* VFS_ROOT() obtains lock on D while shared lock on F is held
* vfs_unbusy() releases shared lock on F
* vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
* Attempt to lock A (instead of vp_crossmp) while D is held would
* violate the global order, causing deadlocks.
*
* dounmount() locks B while F is drained.
*/
int
vfs_busy(struct mount *mp, int flags)
{
MPASS((flags & ~MBF_MASK) == 0);
CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
MNT_ILOCK(mp);
MNT_REF(mp);
/*
* If mount point is currently being unmounted, sleep until the
* mount point fate is decided. If thread doing the unmounting fails,
* it will clear MNTK_UNMOUNT flag before waking us up, indicating
* that this mount point has survived the unmount attempt and vfs_busy
* should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
* flag in addition to MNTK_UNMOUNT, indicating that mount point is
* about to be really destroyed. vfs_busy needs to release its
* reference on the mount point in this case and return with ENOENT,
* telling the caller that the mount it tried to busy is no longer
* valid.
*/
while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
CTR1(KTR_VFS, "%s: failed busying before sleeping",
__func__);
return (ENOENT);
}
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_kern_flag |= MNTK_MWAIT;
msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
if (flags & MBF_MNTLSTLOCK)
mtx_lock(&mountlist_mtx);
MNT_ILOCK(mp);
}
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_lockref++;
MNT_IUNLOCK(mp);
return (0);
}
/*
* Free a busy filesystem.
*/
void
vfs_unbusy(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REL(mp);
KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
mp->mnt_lockref--;
if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
CTR1(KTR_VFS, "%s: waking up waiters", __func__);
mp->mnt_kern_flag &= ~MNTK_DRAINING;
wakeup(&mp->mnt_lockref);
}
MNT_IUNLOCK(mp);
}
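/*
 * [Editor's sketch, not part of the original file] A minimal illustration
 * of the caller pattern the lock order above is designed for: busy the
 * mount before dereferencing it, take the covered root vnode, then
 * unbusy. Error handling and the MBF_MNTLSTLOCK variant are omitted;
 * this is illustrative only, not code from this change.
 */
#if 0
static int
busy_root_sketch(struct mount *mp, struct vnode **vpp)
{
int error;

error = vfs_busy(mp, 0); /* blocks out an unmount of mp */
if (error != 0)
return (error);
error = VFS_ROOT(mp, LK_EXCLUSIVE, vpp); /* returns a locked root vnode */
vfs_unbusy(mp); /* an unmount may proceed again */
return (error);
}
#endif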
/*
* Lookup a mount point by filesystem identifier.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
struct mount *mp;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
vfs_ref(mp);
mtx_unlock(&mountlist_mtx);
return (mp);
}
}
mtx_unlock(&mountlist_mtx);
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
return ((struct mount *) 0);
}
/*
* Lookup a mount point by filesystem identifier, busying it before
* returning.
*
* To avoid congestion on mountlist_mtx, implement a simple direct-mapped
* cache for popular filesystem identifiers. The cache is lockless, relying
* on the fact that struct mount's are never freed. In the worst case we
* may get a pointer to an unmounted or even a different filesystem, so we
* have to check what we got and take the slow path if so.
*/
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define FSID_CACHE_SIZE 256
typedef struct mount * volatile vmp_t;
static vmp_t cache[FSID_CACHE_SIZE];
struct mount *mp;
int error;
uint32_t hash;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
hash = fsid->val[0] ^ fsid->val[1];
hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
mp = cache[hash];
if (mp == NULL ||
mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
goto slow;
if (vfs_busy(mp, 0) != 0) {
cache[hash] = NULL;
goto slow;
}
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
return (mp);
else
vfs_unbusy(mp);
slow:
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
error = vfs_busy(mp, MBF_MNTLSTLOCK);
if (error) {
cache[hash] = NULL;
mtx_unlock(&mountlist_mtx);
return (NULL);
}
cache[hash] = mp;
return (mp);
}
}
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
mtx_unlock(&mountlist_mtx);
return ((struct mount *) 0);
}
/*
* Check if a user can access privileged mount options.
*/
int
vfs_suser(struct mount *mp, struct thread *td)
{
int error;
/*
* If the thread is jailed, but this is not a jail-friendly file
* system, deny immediately.
*/
if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
return (EPERM);
/*
* If the file system was mounted outside the jail of the calling
* thread, deny immediately.
*/
if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
return (EPERM);
/*
* If file system supports delegated administration, we don't check
* for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
* by the file system itself.
* If this is not the user that did original mount, we check for
* the PRIV_VFS_MOUNT_OWNER privilege.
*/
if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
return (error);
}
return (0);
}
/*
* Get a new unique fsid. Try to make its val[0] unique, since this value
* will be used to create fake device numbers for stat(). Also try (but
* not so hard) to make its val[0] unique mod 2^16, since some emulators only
* support 16-bit device numbers. We end up with unique val[0]'s for the
* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
*
* Keep in mind that several mounts may be running in parallel. Starting
* the search one past where the previous search terminated is both a
* micro-optimization and a defense against returning the same fsid to
* different mounts.
*/
void
vfs_getnewfsid(struct mount *mp)
{
static uint16_t mntid_base;
struct mount *nmp;
fsid_t tfsid;
int mtype;
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
mtx_lock(&mntid_mtx);
mtype = mp->mnt_vfc->vfc_typenum;
tfsid.val[1] = mtype;
mtype = (mtype & 0xFF) << 24;
for (;;) {
tfsid.val[0] = makedev(255,
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
mntid_base++;
if ((nmp = vfs_getvfs(&tfsid)) == NULL)
break;
vfs_rel(nmp);
}
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
mtx_unlock(&mntid_mtx);
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
&timestamp_precision, 0, "File timestamp precision (0: seconds, "
"1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
"3+: sec + ns (max. precision))");
/*
* Get a current timestamp.
*/
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
vap->va_type = VNON;
vap->va_size = VNOVAL;
vap->va_bytes = VNOVAL;
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_atime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec = VNOVAL;
vap->va_mtime.tv_sec = VNOVAL;
vap->va_mtime.tv_nsec = VNOVAL;
vap->va_ctime.tv_sec = VNOVAL;
vap->va_ctime.tv_nsec = VNOVAL;
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_vaflags = 0;
}
/*
* This routine is called when we have too many vnodes. It attempts
* to free <count> vnodes and will potentially free vnodes that still
* have VM backing store (VM backing store is typically the cause
* of a vnode blowout so we want to do this). Therefore, this operation
* is not considered cheap.
*
* A number of conditions may prevent a vnode from being reclaimed.
* the buffer cache may have references on the vnode, a directory
* vnode may still have references due to the namei cache representing
* underlying files, or the vnode may be in active use. It is not
* desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*/
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
{
struct vnode *vp;
int count, done, target;
done = 0;
vn_start_write(NULL, &mp, V_WAIT);
MNT_ILOCK(mp);
count = mp->mnt_nvnodelistsize;
target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
target = target / 10 + 1;
while (count != 0 && done < target) {
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
if (vp == NULL)
break;
/*
* XXX LRU is completely broken for non-free vnodes. First
* by calling here in mountpoint order, then by moving
* unselected vnodes to the end here, and most grossly by
* removing the vlruvp() function that was supposed to
* maintain the order. (This function was born broken
* since syncer problems prevented it doing anything.) The
* order is closer to LRC (C = Created).
*
* LRU reclaiming of vnodes seems to have last worked in
* FreeBSD-3 where LRU wasn't mentioned under any spelling.
* Then there was no hold count, and inactive vnodes were
* simply put on the free list in LRU order. The separate
* lists also break LRU. We prefer to reclaim from the
* free list for technical reasons. This tends to thrash
* the free list to keep very unrecently used held vnodes.
* The problem is mitigated by keeping the free list large.
*/
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
--count;
if (!VI_TRYLOCK(vp))
goto next_iter;
/*
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
* Also skip free vnodes. We are trying to make space
* to expand the free list, not reduce it.
*/
if (vp->v_usecount ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
((vp->v_iflag & VI_FREE) != 0) ||
(vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VI_UNLOCK(vp);
goto next_iter;
}
MNT_IUNLOCK(mp);
vholdl(vp);
if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
vdrop(vp);
goto next_iter_mntunlocked;
}
VI_LOCK(vp);
/*
* v_usecount may have been bumped after VOP_LOCK() dropped
* the vnode interlock and before it was locked again.
*
* It is not necessary to recheck VI_DOOMED because it can
* only be set by another thread that holds both the vnode
* lock and vnode interlock. If another thread has the
* vnode lock before we get to VOP_LOCK() and obtains the
* vnode interlock after VOP_LOCK() drops the vnode
* interlock, the other thread will be unable to drop the
* vnode lock before our VOP_LOCK() call fails.
*/
if (vp->v_usecount ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
(vp->v_iflag & VI_FREE) != 0 ||
(vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK);
vdrop(vp);
goto next_iter_mntunlocked;
}
KASSERT((vp->v_iflag & VI_DOOMED) == 0,
("VI_DOOMED unexpectedly detected in vlrureclaim()"));
counter_u64_add(recycles_count, 1);
vgonel(vp);
VOP_UNLOCK(vp, 0);
vdropl(vp);
done++;
next_iter_mntunlocked:
if (!should_yield())
goto relock_mnt;
goto yield;
next_iter:
if (!should_yield())
continue;
MNT_IUNLOCK(mp);
yield:
kern_yield(PRI_USER);
relock_mnt:
MNT_ILOCK(mp);
}
MNT_IUNLOCK(mp);
vn_finished_write(mp);
return done;
}
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
0,
"limit on vnode free requests per call to the vnlru_free routine");
/*
* Attempt to reduce the free list by the requested amount.
*/
static void
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
struct vnode *vp;
struct mount *mp;
bool tried_batches;
tried_batches = false;
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
if (count > max_vnlru_free)
count = max_vnlru_free;
for (; count > 0; count--) {
vp = TAILQ_FIRST(&vnode_free_list);
/*
* The list can be modified while the free_list_mtx
* has been dropped and vp could be NULL here.
*/
if (vp == NULL) {
if (tried_batches)
break;
mtx_unlock(&vnode_free_list_mtx);
vnlru_return_batches(mnt_op);
tried_batches = true;
mtx_lock(&vnode_free_list_mtx);
continue;
}
VNASSERT(vp->v_op != NULL, vp,
("vnlru_free: vnode already reclaimed."));
KASSERT((vp->v_iflag & VI_FREE) != 0,
("Removing vnode not on freelist"));
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Mangling active vnode"));
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
/*
* Don't recycle if our vnode is from a different type
* of mount point. Note that mp is type-safe, so the
* check does not reach an unmapped address even if the
* vnode is reclaimed.
* Don't recycle if we can't get the interlock without
* blocking.
*/
if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
continue;
}
VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
vp, ("vp inconsistent on freelist"));
/*
* The clear of VI_FREE prevents activation of the
* vnode. There is no sense in putting the vnode on
* the mount point active list, only to remove it
* later during recycling. Inline the relevant part
* of vholdl(), to avoid triggering assertions or
* activating.
*/
freevnodes--;
vp->v_iflag &= ~VI_FREE;
refcount_acquire(&vp->v_holdcnt);
mtx_unlock(&vnode_free_list_mtx);
VI_UNLOCK(vp);
vtryrecycle(vp);
/*
* If the recycle succeeded, this vdrop will actually free
* the vnode. If not it will simply place it back on
* the free list.
*/
vdrop(vp);
mtx_lock(&vnode_free_list_mtx);
}
}
void
vnlru_free(int count, struct vfsops *mnt_op)
{
mtx_lock(&vnode_free_list_mtx);
vnlru_free_locked(count, mnt_op);
mtx_unlock(&vnode_free_list_mtx);
}
/* XXX some names and initialization are bad for limits and watermarks. */
static int
vspace(void)
{
int space;
gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
vlowat = vhiwat / 2;
if (numvnodes > desiredvnodes)
return (0);
space = desiredvnodes - numvnodes;
if (freevnodes > wantfreevnodes)
space += freevnodes - wantfreevnodes;
return (space);
}
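/*
 * [Editor's sketch, not part of the original file] The arithmetic behind
 * the 4%/9% watermarks and the "more than 566000 vnodes" figure quoted in
 * the comment near the top of the file, using the defaults computed in
 * vspace() above (wantfreevnodes = desiredvnodes / 4) and desiredvnodes
 * at its MAXVNODES_MAX cap.
 */
#if 0
static long
watermark_sketch(void)
{
long maxvnodes = 512L * 1024 * 1024 / 64; /* MAXVNODES_MAX, 8M */
long wantfree = maxvnodes / 4; /* default 25% */
long gap = maxvnodes - wantfree; /* 75%, the "available space" */

/* vhiwat = gap / 11 (~9%), vlowat = vhiwat / 2 (~4.5%). */
return (gap / 11); /* 571950, i.e. "more than 566000" to reclaim */
}
#endif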
static void
vnlru_return_batch_locked(struct mount *mp)
{
struct vnode *vp;
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
if (mp->mnt_tmpfreevnodelistsize == 0)
return;
TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
}
mtx_lock(&vnode_free_list_mtx);
TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
freevnodes += mp->mnt_tmpfreevnodelistsize;
mtx_unlock(&vnode_free_list_mtx);
mp->mnt_tmpfreevnodelistsize = 0;
}
static void
vnlru_return_batch(struct mount *mp)
{
mtx_lock(&mp->mnt_listmtx);
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
}
static void
vnlru_return_batches(struct vfsops *mnt_op)
{
struct mount *mp, *nmp;
bool need_unbusy;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
need_unbusy = false;
if (mnt_op != NULL && mp->mnt_op != mnt_op)
goto next;
if (mp->mnt_tmpfreevnodelistsize == 0)
goto next;
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
vnlru_return_batch(mp);
need_unbusy = true;
mtx_lock(&mountlist_mtx);
}
next:
nmp = TAILQ_NEXT(mp, mnt_list);
if (need_unbusy)
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
}
/*
* Attempt to recycle vnodes in a context that is always safe to block.
* Calling vlrurecycle() from the bowels of filesystem code has some
* interesting deadlock problems.
*/
static struct proc *vnlruproc;
static int vnlruproc_sig;
static void
vnlru_proc(void)
{
struct mount *mp, *nmp;
- unsigned long ofreevnodes, onumvnodes;
+ unsigned long onumvnodes;
int done, force, reclaim_nc_src, trigger, usevnodes;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
SHUTDOWN_PRI_FIRST);
force = 0;
for (;;) {
kproc_suspend_check(vnlruproc);
mtx_lock(&vnode_free_list_mtx);
/*
* If numvnodes is too large (due to desiredvnodes being
* adjusted using its sysctl, or emergency growth), first
* try to reduce it by discarding from the free list.
*/
if (numvnodes > desiredvnodes)
vnlru_free_locked(numvnodes - desiredvnodes, NULL);
/*
* Sleep if the vnode cache is in a good state. This is
* when it is not over-full and has space for about a 4%
* or 9% expansion (by growing its size or moderately
* reducing its free list). Otherwise, try to reclaim
* space for a 10% expansion.
*/
if (vstir && force == 0) {
force = 1;
vstir = 0;
}
if (vspace() >= vlowat && force == 0) {
vnlruproc_sig = 0;
wakeup(&vnlruproc_sig);
msleep(vnlruproc, &vnode_free_list_mtx,
PVFS|PDROP, "vlruwt", hz);
continue;
}
mtx_unlock(&vnode_free_list_mtx);
done = 0;
- ofreevnodes = freevnodes;
onumvnodes = numvnodes;
/*
* Calculate parameters for recycling. These are the same
* throughout the loop to give some semblance of fairness.
* The trigger point is to avoid recycling vnodes with lots
* of resident pages. We aren't trying to free memory; we
* are trying to recycle or at least free vnodes.
*/
if (numvnodes <= desiredvnodes)
usevnodes = numvnodes - freevnodes;
else
usevnodes = numvnodes;
if (usevnodes <= 0)
usevnodes = 1;
/*
* The trigger value is chosen to give a conservatively
* large value to ensure that it alone doesn't prevent
* making progress. The value can easily be so large that
* it is effectively infinite in some congested and
* misconfigured cases, and this is necessary. Normally
* it is about 8 to 100 (pages), which is quite large.
*/
trigger = vm_cnt.v_page_count * 2 / usevnodes;
if (force < 2)
trigger = vsmalltrigger;
reclaim_nc_src = force >= 3;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
done += vlrureclaim(mp, reclaim_nc_src, trigger);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
uma_reclaim();
if (done == 0) {
if (force == 0 || force == 1) {
force = 2;
continue;
}
if (force == 2) {
force = 3;
continue;
}
force = 0;
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
kern_yield(PRI_USER);
/*
* After becoming active to expand above low water, keep
* active until above high water.
*/
force = vspace() < vhiwat;
}
}
static struct kproc_desc vnlru_kp = {
"vnlru",
vnlru_proc,
&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
&vnlru_kp);
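/*
 * [Editor's sketch, not part of the original file] A standalone
 * illustration of the trigger computation in vnlru_proc() above, for a
 * hypothetical machine with 2M physical pages (8 GiB with 4 KiB pages)
 * and roughly 200000 in-use vnodes; the result sits inside the "about 8
 * to 100 (pages)" range that the comment mentions.
 */
#if 0
static long
vlru_trigger_sketch(void)
{
long page_count = 2L * 1024 * 1024; /* hypothetical vm_cnt.v_page_count */
long usevnodes = 200000; /* hypothetical numvnodes - freevnodes */

return (page_count * 2 / usevnodes); /* = 20 resident pages per vnode */
}
#endif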
/*
* Routines having to do with the management of the vnode table.
*/
/*
* Try to recycle a freed vnode. We abort if anyone picks up a reference
* before we actually vgone(). This function must be called with the vnode
* held to prevent the vnode from being returned to the free list midway
* through vgone().
*/
static int
vtryrecycle(struct vnode *vp)
{
struct mount *vnmp;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
VNASSERT(vp->v_holdcnt, vp,
("vtryrecycle: Recycling vp %p without a reference.", vp));
/*
* This vnode may be found and locked via some other list; if so we
* can't recycle it yet.
*/
if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
CTR2(KTR_VFS,
"%s: impossible to recycle, vp %p lock is already held",
__func__, vp);
return (EWOULDBLOCK);
}
/*
* Don't recycle if its filesystem is being suspended.
*/
if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
VOP_UNLOCK(vp, 0);
CTR2(KTR_VFS,
"%s: impossible to recycle, cannot start the write for %p",
__func__, vp);
return (EBUSY);
}
/*
* If we got this far, we need to acquire the interlock and see if
* anyone picked up this vnode from another list. If not, we will
* mark it with DOOMED via vgonel() so that anyone who does find it
* will skip over it.
*/
VI_LOCK(vp);
if (vp->v_usecount) {
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
CTR2(KTR_VFS,
"%s: impossible to recycle, %p is already referenced",
__func__, vp);
return (EBUSY);
}
if ((vp->v_iflag & VI_DOOMED) == 0) {
counter_u64_add(recycles_count, 1);
vgonel(vp);
}
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
return (0);
}
static void
vcheckspace(void)
{
if (vspace() < vlowat && vnlruproc_sig == 0) {
vnlruproc_sig = 1;
wakeup(vnlruproc);
}
}
/*
* Wait if necessary for space for a new vnode.
*/
static int
getnewvnode_wait(int suspended)
{
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
if (numvnodes >= desiredvnodes) {
if (suspended) {
/*
* The file system is being suspended. We cannot
* risk a deadlock here, so allow allocation of
* another vnode even if this would give too many.
*/
return (0);
}
if (vnlruproc_sig == 0) {
vnlruproc_sig = 1; /* avoid unnecessary wakeups */
wakeup(vnlruproc);
}
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
"vlruwk", hz);
}
/* Post-adjust like the pre-adjust in getnewvnode(). */
if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
vnlru_free_locked(1, NULL);
return (numvnodes >= desiredvnodes ? ENFILE : 0);
}
/*
* This hack is fragile, and probably not needed any more now that the
* watermark handling works.
*/
void
getnewvnode_reserve(u_int count)
{
struct thread *td;
/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
/* XXX no longer so quick, but this part is not racy. */
mtx_lock(&vnode_free_list_mtx);
if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
freevnodes - wantfreevnodes), NULL);
mtx_unlock(&vnode_free_list_mtx);
td = curthread;
/* First try to be quick and racy. */
if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
td->td_vp_reserv += count;
vcheckspace(); /* XXX no longer so quick, but more racy */
return;
} else
atomic_subtract_long(&numvnodes, count);
mtx_lock(&vnode_free_list_mtx);
while (count > 0) {
if (getnewvnode_wait(0) == 0) {
count--;
td->td_vp_reserv++;
atomic_add_long(&numvnodes, 1);
}
}
vcheckspace();
mtx_unlock(&vnode_free_list_mtx);
}
/*
* This hack is fragile, especially if desiredvnodes or wantfreevnodes are
* misconfigured or changed significantly. Reducing desiredvnodes below
* the reserved amount should cause bizarre behaviour like reducing it
* below the number of active vnodes -- the system will try to reduce
* numvnodes to match, but should fail, so the subtraction below should
* not overflow.
*/
void
getnewvnode_drop_reserve(void)
{
struct thread *td;
td = curthread;
atomic_subtract_long(&numvnodes, td->td_vp_reserv);
td->td_vp_reserv = 0;
}
/*
* Return the next vnode from the free list.
*/
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
struct vnode **vpp)
{
struct vnode *vp;
struct thread *td;
struct lock_object *lo;
static int cyclecount;
int error;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
vp = NULL;
td = curthread;
if (td->td_vp_reserv > 0) {
td->td_vp_reserv -= 1;
goto alloc;
}
mtx_lock(&vnode_free_list_mtx);
if (numvnodes < desiredvnodes)
cyclecount = 0;
else if (cyclecount++ >= freevnodes) {
cyclecount = 0;
vstir = 1;
}
/*
* Grow the vnode cache if it will not be above its target max
* after growing. Otherwise, if the free list is nonempty, try
* to reclaim 1 item from it before growing the cache (possibly
* above its target max if the reclamation failed or is delayed).
* Otherwise, wait for some space. In all cases, schedule
* vnlru_proc() if we are getting short of space. The watermarks
* should be chosen so that we never wait or even reclaim from
* the free list to below its target minimum.
*/
if (numvnodes + 1 <= desiredvnodes)
;
else if (freevnodes > 0)
vnlru_free_locked(1, NULL);
else {
error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
MNTK_SUSPEND));
#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
if (error != 0) {
mtx_unlock(&vnode_free_list_mtx);
return (error);
}
#endif
}
vcheckspace();
atomic_add_long(&numvnodes, 1);
mtx_unlock(&vnode_free_list_mtx);
alloc:
counter_u64_add(vnodes_created, 1);
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
/*
* Locks are given the generic name "vnode" when created.
* Follow the historic practice of using the filesystem
* name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
*
* Locks live in a witness group keyed on their name. Thus,
* when a lock is renamed, it must also move from the witness
* group of its old name to the witness group of its new name.
*
* The change only needs to be made when the vnode moves
* from one filesystem type to another. We ensure that each
* filesystem uses a single static name pointer for its tag so
* that we can compare pointers rather than doing a strcmp().
*/
lo = &vp->v_vnlock->lock_object;
if (lo->lo_name != tag) {
lo->lo_name = tag;
WITNESS_DESTROY(lo);
WITNESS_INIT(lo, tag);
}
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
/*
* Finalize various vnode identity bits.
*/
KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
vp->v_type = VNON;
vp->v_tag = tag;
vp->v_op = vops;
v_init_counters(vp);
vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
if (mp == NULL && vops != &dead_vnodeops)
printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
mac_vnode_init(vp);
if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
mac_vnode_associate_singlelabel(mp, vp);
#endif
if (mp != NULL) {
vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
}
/*
* For the filesystems which do not use vfs_hash_insert(),
* still initialize v_hash to have vfs_hash_index() useful.
* E.g., nullfs uses vfs_hash_index() on the lower vnode for
* its own hashing.
*/
vp->v_hash = (uintptr_t)vp >> vnsz2log;
*vpp = vp;
return (0);
}
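/*
 * [Editor's sketch, not part of the original file] Why shifting the vnode
 * address by vnsz2log gives a usable v_hash: vnsz2log is
 * floor(log2(sizeof(struct vnode))) (computed in vntblinit() above), so
 * addresses of objects that are at least sizeof(struct vnode) apart still
 * shift to distinct values. The structure size below is hypothetical.
 */
#if 0
static int
vnsz2log_sketch(void)
{
unsigned long sz = 488; /* hypothetical sizeof(struct vnode) */
unsigned long i;
int log2floor = 0;

/* Same loop shape as vntblinit(): compute floor(log2(sz)). */
for (i = 1; i <= sz; i <<= 1)
log2floor++;
return (log2floor - 1); /* 8, since 256 <= 488 < 512 */
}
#endif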
/*
* Delete from old mount point vnode list, if on one.
*/
static void
delmntque(struct vnode *vp)
{
struct mount *mp;
int active;
mp = vp->v_mount;
if (mp == NULL)
return;
MNT_ILOCK(mp);
VI_LOCK(vp);
KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
("Active vnode list size %d > Vnode list size %d",
mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
active = vp->v_iflag & VI_ACTIVE;
vp->v_iflag &= ~VI_ACTIVE;
if (active) {
mtx_lock(&mp->mnt_listmtx);
TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
mtx_unlock(&mp->mnt_listmtx);
}
vp->v_mount = NULL;
VI_UNLOCK(vp);
VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
("bad mount point vnode list size"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
mp->mnt_nvnodelistsize--;
MNT_REL(mp);
MNT_IUNLOCK(mp);
}
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{
vp->v_data = NULL;
vp->v_op = &dead_vnodeops;
vgone(vp);
vput(vp);
}
/*
* Insert into list of vnodes for the new mount point, if available.
*/
int
insmntque1(struct vnode *vp, struct mount *mp,
void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
KASSERT(vp->v_mount == NULL,
("insmntque: vnode already on per mount vnode list"));
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
/*
* We acquire the vnode interlock early to ensure that the
* vnode cannot be recycled by another process releasing a
* holdcnt on it before we get it on both the vnode list
* and the active vnode list. The mount mutex protects only
* manipulation of the vnode list and the vnode freelist
* mutex protects only manipulation of the active vnode list.
* Hence the need to hold the vnode interlock throughout.
*/
MNT_ILOCK(mp);
VI_LOCK(vp);
if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
mp->mnt_nvnodelistsize == 0)) &&
(vp->v_vflag & VV_FORCEINSMQ) == 0) {
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
if (dtr != NULL)
dtr(vp, dtr_arg);
return (EBUSY);
}
vp->v_mount = mp;
MNT_REF(mp);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
("neg mount point vnode list size"));
mp->mnt_nvnodelistsize++;
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Activating already active vnode"));
vp->v_iflag |= VI_ACTIVE;
mtx_lock(&mp->mnt_listmtx);
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize++;
mtx_unlock(&mp->mnt_listmtx);
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
return (0);
}
int
insmntque(struct vnode *vp, struct mount *mp)
{
return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}
/*
* Flush out and invalidate all buffers associated with a bufobj
* Called with the underlying object locked.
*/
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
int error;
BO_LOCK(bo);
if (flags & V_SAVE) {
error = bufobj_wwait(bo, slpflag, slptimeo);
if (error) {
BO_UNLOCK(bo);
return (error);
}
if (bo->bo_dirty.bv_cnt > 0) {
BO_UNLOCK(bo);
if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
return (error);
/*
* XXX We could save a lock/unlock if this was only
* enabled under INVARIANTS
*/
BO_LOCK(bo);
if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
panic("vinvalbuf: dirty bufs");
}
}
/*
* If you alter this loop please notice that interlock is dropped and
* reacquired in flushbuflist. Special care is needed to ensure that
* no race conditions occur from this.
*/
do {
error = flushbuflist(&bo->bo_clean,
flags, bo, slpflag, slptimeo);
if (error == 0 && !(flags & V_CLEANONLY))
error = flushbuflist(&bo->bo_dirty,
flags, bo, slpflag, slptimeo);
if (error != 0 && error != EAGAIN) {
BO_UNLOCK(bo);
return (error);
}
} while (error != 0);
/*
* Wait for I/O to complete. XXX needs cleaning up. The vnode can
* have write I/O in-progress but if there is a VM object then the
* VM object can also have read-I/O in-progress.
*/
do {
bufobj_wwait(bo, 0, 0);
if ((flags & V_VMIO) == 0) {
BO_UNLOCK(bo);
if (bo->bo_object != NULL) {
VM_OBJECT_WLOCK(bo->bo_object);
vm_object_pip_wait(bo->bo_object, "bovlbx");
VM_OBJECT_WUNLOCK(bo->bo_object);
}
BO_LOCK(bo);
}
} while (bo->bo_numoutput > 0);
BO_UNLOCK(bo);
/*
* Destroy the copy in the VM cache, too.
*/
if (bo->bo_object != NULL &&
(flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
VM_OBJECT_WLOCK(bo->bo_object);
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
OBJPR_CLEANONLY : 0);
VM_OBJECT_WUNLOCK(bo->bo_object);
}
#ifdef INVARIANTS
BO_LOCK(bo);
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
bo->bo_clean.bv_cnt > 0))
panic("vinvalbuf: flush failed");
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
bo->bo_dirty.bv_cnt > 0)
panic("vinvalbuf: flush dirty failed");
BO_UNLOCK(bo);
#endif
return (0);
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying object locked.
*/
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
ASSERT_VOP_LOCKED(vp, "vinvalbuf");
if (vp->v_object != NULL && vp->v_object->handle != vp)
return (0);
return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}
/*
* Flush out buffers on the specified list.
*
*/
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
int slptimeo)
{
struct buf *bp, *nbp;
int retval, error;
daddr_t lblkno;
b_xflags_t xflags;
ASSERT_BO_WLOCKED(bo);
retval = 0;
TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
continue;
}
if (nbp != NULL) {
lblkno = nbp->b_lblkno;
xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
}
retval = EAGAIN;
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
"flushbuf", slpflag, slptimeo);
if (error) {
BO_LOCK(bo);
return (error != ENOLCK ? error : EAGAIN);
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
/*
* XXX Since there are no node locks for NFS, I
* believe there is a slight chance that a delayed
* write will occur while sleeping just above, so
* check for it.
*/
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
(flags & V_SAVE)) {
bremfree(bp);
bp->b_flags |= B_ASYNC;
bwrite(bp);
BO_LOCK(bo);
return (EAGAIN); /* XXX: why not loop ? */
}
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
BO_LOCK(bo);
if (nbp == NULL)
break;
nbp = gbincore(bo, lblkno);
if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
!= xflags)
break; /* nbp invalid */
}
return (retval);
}
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
struct buf *bp;
int error;
daddr_t lblkno;
ASSERT_BO_LOCKED(bo);
for (lblkno = startn;;) {
again:
bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
if (bp == NULL || bp->b_lblkno >= endn ||
bp->b_lblkno < startn)
break;
error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
if (error != 0) {
BO_RLOCK(bo);
if (error == ENOLCK)
goto again;
return (error);
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
lblkno = bp->b_lblkno + 1;
if ((bp->b_flags & B_MANAGED) == 0)
bremfree(bp);
bp->b_flags |= B_RELBUF;
/*
* In the VMIO case, use the B_NOREUSE flag to hint that the
* pages backing each buffer in the range are unlikely to be
* reused. Dirty buffers will have the hint applied once
* they've been written.
*/
if (bp->b_vp->v_object != NULL)
bp->b_flags |= B_NOREUSE;
brelse(bp);
BO_RLOCK(bo);
}
return (0);
}
/*
* Truncate a file's buffer and pages to a specified length. This
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
* sync activity.
*/
int
vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
{
struct buf *bp, *nbp;
int anyfreed;
int trunclbn;
struct bufobj *bo;
CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
vp, cred, blksize, (uintmax_t)length);
/*
* Round up to the *next* lbn.
*/
trunclbn = howmany(length, blksize);
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
bo = &vp->v_bufobj;
BO_LOCK(bo);
anyfreed = 1;
for (;anyfreed;) {
anyfreed = 0;
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = 1;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNCLEAN) == 0) ||
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI))) {
BO_UNLOCK(bo);
goto restart;
}
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = 1;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNDIRTY) == 0) ||
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI) == 0)) {
BO_UNLOCK(bo);
goto restart;
}
}
}
if (length > 0) {
restartsync:
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno > 0)
continue;
/*
* Since we hold the vnode lock this should only
* fail if we're racing with the buf daemon.
*/
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK) {
goto restart;
}
VNASSERT((bp->b_flags & B_DELWRI), vp,
("buf(%p) on dirty queue without DELWRI", bp));
bremfree(bp);
bawrite(bp);
BO_LOCK(bo);
goto restartsync;
}
}
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
vnode_pager_setsize(vp, length);
return (0);
}
static void
buf_vlist_remove(struct buf *bp)
{
struct bufv *bv;
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
ASSERT_BO_WLOCKED(bp->b_bufobj);
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
(BX_VNDIRTY|BX_VNCLEAN),
("buf_vlist_remove: Buf %p is on two lists", bp));
if (bp->b_xflags & BX_VNDIRTY)
bv = &bp->b_bufobj->bo_dirty;
else
bv = &bp->b_bufobj->bo_clean;
BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
bv->bv_cnt--;
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
/*
* Add the buffer to the sorted clean or dirty block list.
*
* NOTE: xflags is passed as a constant, optimizing this inline function!
*/
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
struct bufv *bv;
struct buf *n;
int error;
ASSERT_BO_WLOCKED(bo);
KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
("dead bo %p", bo));
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
bp->b_xflags |= xflags;
if (xflags & BX_VNDIRTY)
bv = &bo->bo_dirty;
else
bv = &bo->bo_clean;
/*
* Keep the list ordered. Optimize empty list insertion. Assume
* we tend to grow at the tail so lookup_le should usually be cheaper
* than _ge.
*/
if (bv->bv_cnt == 0 ||
bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
else
TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
if (error)
panic("buf_vlist_add: Preallocated nodes insufficient.");
bv->bv_cnt++;
}
/*
* Look up a buffer using the buffer tries.
*/
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
struct buf *bp;
ASSERT_BO_LOCKED(bo);
bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
if (bp != NULL)
return (bp);
return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
}
/*
* Associate a buffer with a vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
struct bufobj *bo;
bo = &vp->v_bufobj;
ASSERT_BO_WLOCKED(bo);
VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
("bgetvp: bp already attached! %p", bp));
vhold(vp);
bp->b_vp = vp;
bp->b_bufobj = bo;
/*
* Insert onto list for new vnode.
*/
buf_vlist_add(bp, bo, BX_VNCLEAN);
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
{
struct bufobj *bo;
struct vnode *vp;
CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
/*
* Delete from old vnode list, if on one.
*/
vp = bp->b_vp; /* XXX */
bo = bp->b_bufobj;
BO_LOCK(bo);
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic("brelvp: Buffer %p not on queue.", bp);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
bo->bo_flag &= ~BO_ONWORKLST;
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
}
bp->b_vp = NULL;
bp->b_bufobj = NULL;
BO_UNLOCK(bo);
vdrop(vp);
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
int slot;
ASSERT_BO_WLOCKED(bo);
mtx_lock(&sync_mtx);
if (bo->bo_flag & BO_ONWORKLST)
LIST_REMOVE(bo, bo_synclist);
else {
bo->bo_flag |= BO_ONWORKLST;
syncer_worklist_len++;
}
if (delay > syncer_maxdelay - 2)
delay = syncer_maxdelay - 2;
slot = (syncer_delayno + delay) & syncer_mask;
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
mtx_unlock(&sync_mtx);
}
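/*
 * Worked example (illustrative, assuming syncer_mask == syncer_maxdelay - 1
 * with syncer_maxdelay a power of two): the pending table is a ring of
 * one-second buckets.  A request made at syncer_delayno == 30 with
 * delay == 5 and syncer_maxdelay == 32 lands in slot (30 + 5) & 0x1f == 3,
 * i.e. it wraps past the end of the table and still fires about five
 * passes later.
 */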
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
int error, len;
mtx_lock(&sync_mtx);
len = syncer_worklist_len - sync_vnode_count;
mtx_unlock(&sync_mtx);
error = SYSCTL_OUT(req, &len, sizeof(len));
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
"syncer",
sched_sync,
&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
struct vnode *vp;
struct mount *mp;
*bo = LIST_FIRST(slp);
if (*bo == NULL)
return (0);
vp = bo2vnode(*bo);
if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
return (1);
/*
* We use vhold in case the vnode does not
* successfully sync. vhold prevents the vnode from
* going away when we unlock the sync_mtx so that
* we can acquire the vnode interlock.
*/
vholdl(vp);
mtx_unlock(&sync_mtx);
VI_UNLOCK(vp);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
vdrop(vp);
mtx_lock(&sync_mtx);
return (*bo == LIST_FIRST(slp));
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
(void) VOP_FSYNC(vp, MNT_LAZY, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
BO_LOCK(*bo);
if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/
vn_syncer_add_to_worklist(*bo, syncdelay);
}
BO_UNLOCK(*bo);
vdrop(vp);
mtx_lock(&sync_mtx);
return (0);
}
static int first_printf = 1;
/*
* System filesystem synchronizer daemon.
*/
static void
sched_sync(void)
{
struct synclist *next, *slp;
struct bufobj *bo;
long starttime;
struct thread *td = curthread;
int last_work_seen;
int net_worklist_len;
int syncer_final_iter;
int error;
last_work_seen = 0;
syncer_final_iter = 0;
syncer_state = SYNCER_RUNNING;
starttime = time_uptime;
td->td_pflags |= TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
SHUTDOWN_PRI_LAST);
mtx_lock(&sync_mtx);
for (;;) {
if (syncer_state == SYNCER_FINAL_DELAY &&
syncer_final_iter == 0) {
mtx_unlock(&sync_mtx);
kproc_suspend_check(td->td_proc);
mtx_lock(&sync_mtx);
}
net_worklist_len = syncer_worklist_len - sync_vnode_count;
if (syncer_state != SYNCER_RUNNING &&
starttime != time_uptime) {
if (first_printf) {
printf("\nSyncing disks, vnodes remaining... ");
first_printf = 0;
}
printf("%d ", net_worklist_len);
}
starttime = time_uptime;
/*
* Push files whose dirty time has expired. Be careful
* of interrupt race on slp queue.
*
* Skip over empty worklist slots when shutting down.
*/
do {
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno == syncer_maxdelay)
syncer_delayno = 0;
next = &syncer_workitem_pending[syncer_delayno];
/*
* If the worklist has wrapped since it was
* emptied of all but syncer vnodes,
* switch to the FINAL_DELAY state and run
* for one more second.
*/
if (syncer_state == SYNCER_SHUTTING_DOWN &&
net_worklist_len == 0 &&
last_work_seen == syncer_delayno) {
syncer_state = SYNCER_FINAL_DELAY;
syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
}
} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
syncer_worklist_len > 0);
/*
* Keep track of the last time there was anything
* on the worklist other than syncer vnodes.
* Return to the SHUTTING_DOWN state if any
* new work appears.
*/
if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
last_work_seen = syncer_delayno;
if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
syncer_state = SYNCER_SHUTTING_DOWN;
while (!LIST_EMPTY(slp)) {
error = sync_vnode(slp, &bo, td);
if (error == 1) {
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(next, bo, bo_synclist);
continue;
}
if (first_printf == 0) {
/*
* Drop the sync mutex, because some watchdog
* drivers need to sleep while patting the watchdog.
*/
mtx_unlock(&sync_mtx);
wdog_kern_pat(WD_LASTVAL);
mtx_lock(&sync_mtx);
}
}
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
syncer_final_iter--;
/*
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
* value of N tells the filesystem syncer to process the next
* N seconds worth of work on its queue ASAP. Currently rushjob
* is used by the soft update code to speed up the filesystem
* syncer process when the incore state is getting so far
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
*/
if (rushjob > 0) {
rushjob -= 1;
continue;
}
/*
* Just sleep for a short period of time between
* iterations when shutting down to allow some I/O
* to happen.
*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (syncer_state != SYNCER_RUNNING ||
time_uptime == starttime) {
thread_lock(td);
sched_prio(td, PPAUSE);
thread_unlock(td);
}
if (syncer_state != SYNCER_RUNNING)
cv_timedwait(&sync_wakeup, &sync_mtx,
hz / SYNCER_SHUTDOWN_SPEEDUP);
else if (time_uptime == starttime)
cv_timedwait(&sync_wakeup, &sync_mtx, hz);
}
}
/*
* Request the syncer daemon to speed up its work.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
int
speedup_syncer(void)
{
int ret = 0;
mtx_lock(&sync_mtx);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;
ret = 1;
}
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
return (ret);
}
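/*
 * Usage sketch: a subsystem that sees dirty state accumulating faster
 * than the syncer drains it can nudge the syncer along, guarded by a
 * hypothetical check of its own, e.g.
 *
 *	if (dirty_backlog_is_large())
 *		(void)speedup_syncer();
 *
 * Each granted request makes the syncer process one extra second of its
 * worklist without pausing, and at most syncdelay / 2 requests may be
 * outstanding at once.
 */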
/*
* Tell the syncer to speed up its work and run though its work
* list several times, then tell it to shut down.
*/
static void
syncer_shutdown(void *arg, int howto)
{
if (howto & RB_NOSYNC)
return;
mtx_lock(&sync_mtx);
syncer_state = SYNCER_SHUTTING_DOWN;
rushjob = 0;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_shutdown(arg, howto);
}
void
syncer_suspend(void)
{
syncer_shutdown(updateproc, 0);
}
void
syncer_resume(void)
{
mtx_lock(&sync_mtx);
first_printf = 1;
syncer_state = SYNCER_RUNNING;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_resume(updateproc);
}
/*
* Reassign a buffer from one vnode to another.
* Used to assign file specific control information
* (indirect blocks) to the vnode to which they belong.
*/
void
reassignbuf(struct buf *bp)
{
struct vnode *vp;
struct bufobj *bo;
int delay;
#ifdef INVARIANTS
struct bufv *bv;
#endif
vp = bp->b_vp;
bo = bp->b_bufobj;
++reassignbufcalls;
CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
/*
* B_PAGING flagged buffers cannot be reassigned because their vp
* is not fully linked in.
*/
if (bp->b_flags & B_PAGING)
panic("cannot reassign paging buffer");
/*
* Delete from old vnode list, if on one.
*/
BO_LOCK(bo);
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic("reassignbuf: Buffer %p not on queue.", bp);
/*
* If dirty, put on list of dirty buffers; otherwise insert onto list
* of clean buffers.
*/
if (bp->b_flags & B_DELWRI) {
if ((bo->bo_flag & BO_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delay = dirdelay;
break;
case VCHR:
delay = metadelay;
break;
default:
delay = filedelay;
}
vn_syncer_add_to_worklist(bo, delay);
}
buf_vlist_add(bp, bo, BX_VNDIRTY);
} else {
buf_vlist_add(bp, bo, BX_VNCLEAN);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
bo->bo_flag &= ~BO_ONWORKLST;
}
}
#ifdef INVARIANTS
bv = &bo->bo_clean;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bv = &bo->bo_dirty;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
#endif
BO_UNLOCK(bo);
}
/*
* A temporary hack until refcount_* APIs are sorted out.
*/
static __inline int
vfs_refcount_acquire_if_not_zero(volatile u_int *count)
{
u_int old;
old = *count;
for (;;) {
if (old == 0)
return (0);
if (atomic_fcmpset_int(count, &old, old + 1))
return (1);
}
}
static __inline int
vfs_refcount_release_if_not_last(volatile u_int *count)
{
u_int old;
old = *count;
for (;;) {
if (old == 1)
return (0);
if (atomic_fcmpset_int(count, &old, old - 1))
return (1);
}
}
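/*
 * Sketch (userland analogue, not kernel code): both helpers above are
 * conditional reference-count updates built on a compare-and-swap loop.
 * The same acquire-if-not-zero idea expressed with C11 atomics:
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static bool
acquire_if_not_zero(atomic_uint *count)
{
	unsigned int old;

	old = atomic_load(count);
	while (old != 0) {
		/* On failure the current value is reloaded into 'old'. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return (true);
	}
	return (false);
}
#endif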
static void
v_init_counters(struct vnode *vp)
{
VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
vp, ("%s called for an initialized vnode", __FUNCTION__));
ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
refcount_init(&vp->v_holdcnt, 1);
refcount_init(&vp->v_usecount, 1);
}
static void
v_incr_usecount_locked(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
if ((vp->v_iflag & VI_OWEINACT) != 0) {
VNASSERT(vp->v_usecount == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
vp->v_iflag &= ~VI_OWEINACT;
}
refcount_acquire(&vp->v_usecount);
v_incr_devcount(vp);
}
/*
* Increment the use count on the vnode, taking care to reference
* the driver's usecount if this is a chardev.
*/
static void
v_incr_usecount(struct vnode *vp)
{
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (vp->v_type != VCHR &&
vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
} else {
VI_LOCK(vp);
v_incr_usecount_locked(vp);
VI_UNLOCK(vp);
}
}
/*
* Increment si_usecount of the associated device, if any.
*/
static void
v_incr_devcount(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __FUNCTION__);
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
vp->v_rdev->si_usecount++;
dev_unlock();
}
}
/*
* Decrement si_usecount of the associated device, if any.
*/
static void
v_decr_devcount(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __FUNCTION__);
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
vp->v_rdev->si_usecount--;
dev_unlock();
}
}
/*
* Grab a particular vnode from the free list, increment its
* reference count and lock it. VI_DOOMED is set if the vnode
* is being destroyed. Only callers who specify LK_RETRY will
* see doomed vnodes. If inactive processing was delayed in
* vput try to do it here.
*
* Notes on lockless counter manipulation:
* _vhold, vputx and other routines make various decisions based
* on either holdcnt or usecount being 0. As long as either counter
* is not transitioning 0->1 nor 1->0, the manipulation can be done
* with atomic operations. Otherwise the interlock is taken covering
* both the atomic and additional actions.
*/
int
vget(struct vnode *vp, int flags, struct thread *td)
{
int error, oweinact;
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vget: invalid lock operation"));
if ((flags & LK_INTERLOCK) != 0)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
if ((flags & LK_VNHELD) != 0)
VNASSERT((vp->v_holdcnt > 0), vp,
("vget: LK_VNHELD passed but vnode not held"));
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
if ((flags & LK_VNHELD) == 0)
_vhold(vp, (flags & LK_INTERLOCK) != 0);
if ((error = vn_lock(vp, flags)) != 0) {
vdrop(vp);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
}
if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
panic("vget: vn_lock failed to return ENOENT\n");
/*
* We don't guarantee that any particular close will
* trigger inactive processing so just make a best effort
* here at preventing a reference to a removed file. If
* we don't succeed no harm is done.
*
* Upgrade our holdcnt to a usecount.
*/
if (vp->v_type == VCHR ||
!vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_OWEINACT) == 0) {
oweinact = 0;
} else {
oweinact = 1;
vp->v_iflag &= ~VI_OWEINACT;
}
refcount_acquire(&vp->v_usecount);
v_incr_devcount(vp);
if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
(flags & LK_NOWAIT) == 0)
vinactive(vp, td);
VI_UNLOCK(vp);
}
return (0);
}
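/*
 * Usage sketch (hypothetical caller): the common pattern is to pair
 * vget() with vput(), which unlocks and drops the use reference in one
 * step.  The sketch assumes the caller already has some guarantee (a
 * hold, the interlock, or a lookup result) that vp remains valid.
 */
#if 0
static int
example_with_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (error != 0)
		return (error);
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
	return (0);
}
#endif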
/*
* Increase the reference (use) and hold count of a vnode.
* This will also remove the vnode from the free list if it is presently free.
*/
void
vref(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
_vhold(vp, false);
v_incr_usecount(vp);
}
void
vrefl(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
_vhold(vp, true);
v_incr_usecount_locked(vp);
}
void
vrefact(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (__predict_false(vp->v_type == VCHR)) {
VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
("%s: wrong ref counts", __func__));
vref(vp);
return;
}
#ifdef INVARIANTS
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
old = atomic_fetchadd_int(&vp->v_usecount, 1);
VNASSERT(old > 0, vp, ("%s: wrong use count", __func__));
#else
refcount_acquire(&vp->v_holdcnt);
refcount_acquire(&vp->v_usecount);
#endif
}
/*
* Return reference count of a vnode.
*
* The results of this call are only guaranteed when some mechanism is used to
* stop other processes from gaining references to the vnode. This may be the
* case if the caller holds the only reference. This is also useful when stale
* data is acceptable as race conditions may be accounted for by some other
* means.
*/
int
vrefcnt(struct vnode *vp)
{
return (vp->v_usecount);
}
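/*
 * Usage sketch: the value is only meaningful when the caller can rule
 * out concurrent references, for example when deciding whether it is
 * the sole user of a vnode it already references:
 *
 *	if (vrefcnt(vp) > 1)
 *		error = EBUSY;
 */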
#define VPUTX_VRELE 1
#define VPUTX_VPUT 2
#define VPUTX_VUNREF 3
/*
* Decrement the use and hold counts for a vnode.
*
* See an explanation near vget() as to why atomic operation is safe.
*/
static void
vputx(struct vnode *vp, int func)
{
int error;
KASSERT(vp != NULL, ("vputx: null vp"));
if (func == VPUTX_VUNREF)
ASSERT_VOP_LOCKED(vp, "vunref");
else if (func == VPUTX_VPUT)
ASSERT_VOP_LOCKED(vp, "vput");
else
KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (vp->v_type != VCHR &&
vfs_refcount_release_if_not_last(&vp->v_usecount)) {
if (func == VPUTX_VPUT)
VOP_UNLOCK(vp, 0);
vdrop(vp);
return;
}
VI_LOCK(vp);
/*
* We want to hold the vnode until the inactive finishes to
* prevent vgone() races. We drop the use count here and the
* hold count below when we're done.
*/
if (!refcount_release(&vp->v_usecount) ||
(vp->v_iflag & VI_DOINGINACT)) {
if (func == VPUTX_VPUT)
VOP_UNLOCK(vp, 0);
v_decr_devcount(vp);
vdropl(vp);
return;
}
v_decr_devcount(vp);
error = 0;
if (vp->v_usecount != 0) {
vn_printf(vp, "vputx: usecount not zero for vnode ");
panic("vputx: usecount not zero");
}
CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
/*
* We must call VOP_INACTIVE with the node locked. Mark
* as VI_DOINGINACT to avoid recursion.
*/
vp->v_iflag |= VI_OWEINACT;
switch (func) {
case VPUTX_VRELE:
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
VI_LOCK(vp);
break;
case VPUTX_VPUT:
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
LK_NOWAIT);
VI_LOCK(vp);
}
break;
case VPUTX_VUNREF:
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
VI_LOCK(vp);
}
break;
}
VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
("vnode with usecount and VI_OWEINACT set"));
if (error == 0) {
if (vp->v_iflag & VI_OWEINACT)
vinactive(vp, curthread);
if (func != VPUTX_VUNREF)
VOP_UNLOCK(vp, 0);
}
vdropl(vp);
}
/*
* Vnode put/release.
* If count drops to zero, call inactive routine and return to freelist.
*/
void
vrele(struct vnode *vp)
{
vputx(vp, VPUTX_VRELE);
}
/*
* Release an already locked vnode. This gives the same effects as
* unlock+vrele(), but takes less time and avoids releasing and
* re-acquiring the lock (as vrele() acquires the lock internally).
*/
void
vput(struct vnode *vp)
{
vputx(vp, VPUTX_VPUT);
}
/*
* Release an exclusively locked vnode. Do not unlock the vnode lock.
*/
void
vunref(struct vnode *vp)
{
vputx(vp, VPUTX_VUNREF);
}
/*
* Increase the hold count and activate if this is the first reference.
*/
void
_vhold(struct vnode *vp, bool locked)
{
struct mount *mp;
if (locked)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("_vhold: vnode with holdcnt is free"));
return;
}
if (!locked)
VI_LOCK(vp);
if ((vp->v_iflag & VI_FREE) == 0) {
refcount_acquire(&vp->v_holdcnt);
if (!locked)
VI_UNLOCK(vp);
return;
}
VNASSERT(vp->v_holdcnt == 0, vp,
("%s: wrong hold count", __func__));
VNASSERT(vp->v_op != NULL, vp,
("%s: vnode already reclaimed.", __func__));
/*
* Remove a vnode from the free list, mark it as in use,
* and put it on the active list.
*/
VNASSERT(vp->v_mount != NULL, vp,
("_vhold: vnode not on per mount vnode list"));
mp = vp->v_mount;
mtx_lock(&mp->mnt_listmtx);
if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize--;
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
} else {
mtx_lock(&vnode_free_list_mtx);
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
freevnodes--;
mtx_unlock(&vnode_free_list_mtx);
}
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Activating already active vnode"));
vp->v_iflag &= ~VI_FREE;
vp->v_iflag |= VI_ACTIVE;
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize++;
mtx_unlock(&mp->mnt_listmtx);
refcount_acquire(&vp->v_holdcnt);
if (!locked)
VI_UNLOCK(vp);
}
/*
* Drop the hold count of the vnode. If this is the last reference to
* the vnode we place it on the free list unless it has been vgone'd
* (marked VI_DOOMED) in which case we will free it.
*
* Because the vnode vm object keeps a hold reference on the vnode if
* there is at least one resident non-cached page, the vnode cannot
* leave the active list without the page cleanup done.
*/
void
_vdrop(struct vnode *vp, bool locked)
{
struct bufobj *bo;
struct mount *mp;
int active;
if (locked)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if ((int)vp->v_holdcnt <= 0)
panic("vdrop: holdcnt %d", vp->v_holdcnt);
if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
if (locked)
VI_UNLOCK(vp);
return;
}
if (!locked)
VI_LOCK(vp);
if (refcount_release(&vp->v_holdcnt) == 0) {
VI_UNLOCK(vp);
return;
}
if ((vp->v_iflag & VI_DOOMED) == 0) {
/*
* Mark a vnode as free: remove it from its active list
* and put it up for recycling on the freelist.
*/
VNASSERT(vp->v_op != NULL, vp,
("vdropl: vnode already reclaimed."));
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("vnode already free"));
VNASSERT(vp->v_holdcnt == 0, vp,
("vdropl: freeing when we shouldn't"));
active = vp->v_iflag & VI_ACTIVE;
if ((vp->v_iflag & VI_OWEINACT) == 0) {
vp->v_iflag &= ~VI_ACTIVE;
mp = vp->v_mount;
if (mp != NULL) {
mtx_lock(&mp->mnt_listmtx);
if (active) {
TAILQ_REMOVE(&mp->mnt_activevnodelist,
vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
}
TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist,
vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize++;
vp->v_iflag |= VI_FREE;
vp->v_mflag |= VMP_TMPMNTFREELIST;
VI_UNLOCK(vp);
if (mp->mnt_tmpfreevnodelistsize >=
mnt_free_list_batch)
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
} else {
VNASSERT(active == 0, vp,
("vdropl: active vnode not on per mount "
"vnode list"));
mtx_lock(&vnode_free_list_mtx);
TAILQ_INSERT_TAIL(&vnode_free_list, vp,
v_actfreelist);
freevnodes++;
vp->v_iflag |= VI_FREE;
VI_UNLOCK(vp);
mtx_unlock(&vnode_free_list_mtx);
}
} else {
VI_UNLOCK(vp);
counter_u64_add(free_owe_inact, 1);
}
return;
}
/*
* The vnode has been marked for destruction, so free it.
*
* The vnode will be returned to the zone where it will
* normally remain until it is needed for another vnode. We
* need to cleanup (or verify that the cleanup has already
* been done) any residual data left from its current use
* so as not to contaminate the freshly allocated vnode.
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
atomic_subtract_long(&numvnodes, 1);
bo = &vp->v_bufobj;
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("cleaned vnode still on the free list."));
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
("clean blk trie not empty"));
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
("dirty blk trie not empty"));
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
("Dangling rangelock waiters"));
VI_UNLOCK(vp);
#ifdef MAC
mac_vnode_destroy(vp);
#endif
if (vp->v_pollinfo != NULL) {
destroy_vpollinfo(vp->v_pollinfo);
vp->v_pollinfo = NULL;
}
#ifdef INVARIANTS
/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
vp->v_op = NULL;
#endif
vp->v_mountedhere = NULL;
vp->v_unpcb = NULL;
vp->v_rdev = NULL;
vp->v_fifoinfo = NULL;
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
vp->v_iflag = 0;
vp->v_vflag = 0;
bo->bo_flag = 0;
uma_zfree(vnode_zone, vp);
}
/*
* Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
* flags. DOINGINACT prevents us from recursing in calls to vinactive.
* OWEINACT tracks whether a vnode missed a call to inactive due to a
* failed lock upgrade.
*/
void
vinactive(struct vnode *vp, struct thread *td)
{
struct vm_object *obj;
ASSERT_VOP_ELOCKED(vp, "vinactive");
ASSERT_VI_LOCKED(vp, "vinactive");
VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
("vinactive: recursed on VI_DOINGINACT"));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
vp->v_iflag |= VI_DOINGINACT;
vp->v_iflag &= ~VI_OWEINACT;
VI_UNLOCK(vp);
/*
* Before moving off the active list, we must be sure that any
* modified pages are converted into the vnode's dirty
* buffers, since these will no longer be checked once the
* vnode is on the inactive list.
*
* The write-out of the dirty pages is asynchronous. At the
* point that VOP_INACTIVE() is called, there could still be
* pending I/O and dirty pages in the object.
*/
if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
(obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0, 0);
VM_OBJECT_WUNLOCK(obj);
}
VOP_INACTIVE(vp, td);
VI_LOCK(vp);
VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
("vinactive: lost VI_DOINGINACT"));
vp->v_iflag &= ~VI_DOINGINACT;
}
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active ones,
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*
* `rootrefs' specifies the base reference count for the root vnode
* of this filesystem. The root vnode is considered busy if its
* v_usecount exceeds this value. On a successful return, vflush()
* will call vrele() on the root vnode exactly rootrefs times.
* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
* be zero.
*/
#ifdef DIAGNOSTIC
static int busyprt = 0; /* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
#endif
int
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
{
struct vnode *vp, *mvp, *rootvp = NULL;
struct vattr vattr;
int busy = 0, error;
CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
rootrefs, flags);
if (rootrefs > 0) {
KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
("vflush: bad args"));
/*
* Get the filesystem root vnode. We can vput() it
* immediately, since with rootrefs > 0, it won't go away.
*/
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
__func__, error);
return (error);
}
vput(rootvp);
}
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
vholdl(vp);
error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
if (error) {
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
/*
* Skip over vnodes marked VV_SYSTEM.
*/
if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
VOP_UNLOCK(vp, 0);
vdrop(vp);
continue;
}
/*
* If WRITECLOSE is set, flush out unlinked but still open
* files (even if open only for reading) and regular file
* vnodes open for writing.
*/
if (flags & WRITECLOSE) {
if (vp->v_object != NULL) {
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
if (error != 0) {
VOP_UNLOCK(vp, 0);
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
VI_LOCK(vp);
if ((vp->v_type == VNON ||
(error == 0 && vattr.va_nlink > 0)) &&
(vp->v_writecount == 0 || vp->v_type != VREG)) {
VOP_UNLOCK(vp, 0);
vdropl(vp);
continue;
}
} else
VI_LOCK(vp);
/*
* With v_usecount == 0, all we need to do is clear out the
* vnode data structures and we are done.
*
* If FORCECLOSE is set, forcibly close the vnode.
*/
if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
vgonel(vp);
} else {
busy++;
#ifdef DIAGNOSTIC
if (busyprt)
vn_printf(vp, "vflush: busy vnode ");
#endif
}
VOP_UNLOCK(vp, 0);
vdropl(vp);
}
if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
/*
* If just the root vnode is busy, and if its refcount
* is equal to `rootrefs', then go ahead and kill it.
*/
VI_LOCK(rootvp);
KASSERT(busy > 0, ("vflush: not busy"));
VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
("vflush: usecount %d < rootrefs %d",
rootvp->v_usecount, rootrefs));
if (busy == 1 && rootvp->v_usecount == rootrefs) {
VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
vgone(rootvp);
VOP_UNLOCK(rootvp, 0);
busy = 0;
} else
VI_UNLOCK(rootvp);
}
if (busy) {
CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
busy);
return (EBUSY);
}
for (; rootrefs > 0; rootrefs--)
vrele(rootvp);
return (0);
}
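/*
 * Usage sketch (hypothetical, simplified): a filesystem's unmount path
 * typically flushes its vnodes with a call along these lines, passing
 * FORCECLOSE only for forced unmounts.
 */
#if 0
static int
example_flushfiles(struct mount *mp, int mntflags, struct thread *td)
{

	return (vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td));
}
#endif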
/*
* Recycle an unused vnode to the front of the free list.
*/
int
vrecycle(struct vnode *vp)
{
int recycled;
VI_LOCK(vp);
recycled = vrecyclel(vp);
VI_UNLOCK(vp);
return (recycled);
}
/*
* vrecycle, with the vp interlock held.
*/
int
vrecyclel(struct vnode *vp)
{
int recycled;
ASSERT_VOP_ELOCKED(vp, __func__);
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
recycled = 0;
if (vp->v_usecount == 0) {
recycled = 1;
vgonel(vp);
}
return (recycled);
}
/*
* Eliminate all activity associated with a vnode
* in preparation for reuse.
*/
void
vgone(struct vnode *vp)
{
VI_LOCK(vp);
vgonel(vp);
VI_UNLOCK(vp);
}
static void
notify_lowervp_vfs_dummy(struct mount *mp __unused,
struct vnode *lowervp __unused)
{
}
/*
* Notify upper mounts about reclaimed or unlinked vnode.
*/
void
vfs_notify_upper(struct vnode *vp, int event)
{
static struct vfsops vgonel_vfsops = {
.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
};
struct mount *mp, *ump, *mmp;
mp = vp->v_mount;
if (mp == NULL)
return;
MNT_ILOCK(mp);
if (TAILQ_EMPTY(&mp->mnt_uppers))
goto unlock;
MNT_IUNLOCK(mp);
mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
mmp->mnt_op = &vgonel_vfsops;
mmp->mnt_kern_flag |= MNTK_MARKER;
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
ump = TAILQ_NEXT(ump, mnt_upper_link);
continue;
}
TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
MNT_IUNLOCK(mp);
switch (event) {
case VFS_NOTIFY_UPPER_RECLAIM:
VFS_RECLAIM_LOWERVP(ump, vp);
break;
case VFS_NOTIFY_UPPER_UNLINK:
VFS_UNLINK_LOWERVP(ump, vp);
break;
default:
KASSERT(0, ("invalid event %d", event));
break;
}
MNT_ILOCK(mp);
ump = TAILQ_NEXT(mmp, mnt_upper_link);
TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
}
free(mmp, M_TEMP);
mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
wakeup(&mp->mnt_uppers);
}
unlock:
MNT_IUNLOCK(mp);
}
/*
* vgone, with the vp interlock held.
*/
static void
vgonel(struct vnode *vp)
{
struct thread *td;
int oweinact;
int active;
struct mount *mp;
ASSERT_VOP_ELOCKED(vp, "vgonel");
ASSERT_VI_LOCKED(vp, "vgonel");
VNASSERT(vp->v_holdcnt, vp,
("vgonel: vp %p has no reference.", vp));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
td = curthread;
/*
* Don't vgonel if we're already doomed.
*/
if (vp->v_iflag & VI_DOOMED)
return;
vp->v_iflag |= VI_DOOMED;
/*
* Check to see if the vnode is in use. If so, we have to call
* VOP_CLOSE() and VOP_INACTIVE().
*/
active = vp->v_usecount;
oweinact = (vp->v_iflag & VI_OWEINACT);
VI_UNLOCK(vp);
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
/*
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed.
*/
if (active)
VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
if (oweinact || active) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOINGINACT) == 0)
vinactive(vp, td);
VI_UNLOCK(vp);
}
if (vp->v_type == VSOCK)
vfs_unp_reclaim(vp);
/*
* Clean out any buffers associated with the vnode.
* If the flush fails, just toss the buffers.
*/
mp = NULL;
if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
(void) vn_start_secondary_write(vp, &mp, V_WAIT);
if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
while (vinvalbuf(vp, 0, 0, 0) != 0)
;
}
BO_LOCK(&vp->v_bufobj);
KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
vp->v_bufobj.bo_clean.bv_cnt == 0,
("vp %p bufobj not invalidated", vp));
/*
* For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
* after the object's page queue is flushed.
*/
if (vp->v_bufobj.bo_object == NULL)
vp->v_bufobj.bo_flag |= BO_DEAD;
BO_UNLOCK(&vp->v_bufobj);
/*
* Reclaim the vnode.
*/
if (VOP_RECLAIM(vp, td))
panic("vgone: cannot reclaim");
if (mp != NULL)
vn_finished_secondary_write(mp);
VNASSERT(vp->v_object == NULL, vp,
("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
/*
* Clear the advisory locks and wake up waiting threads.
*/
(void)VOP_ADVLOCKPURGE(vp);
vp->v_lockf = NULL;
/*
* Delete from old mount point vnode list.
*/
delmntque(vp);
cache_purge(vp);
/*
* Done with purge, reset to the standard lock and invalidate
* the vnode.
*/
VI_LOCK(vp);
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_tag = "none";
vp->v_type = VBAD;
}
/*
* Calculate the total number of references to a special device.
*/
int
vcount(struct vnode *vp)
{
int count;
dev_lock();
count = vp->v_rdev->si_usecount;
dev_unlock();
return (count);
}
/*
* Same as above, but using the struct cdev * as the argument.
*/
int
count_dev(struct cdev *dev)
{
int count;
dev_lock();
count = dev->si_usecount;
dev_unlock();
return (count);
}
/*
* Print out a description of a vnode.
*/
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
"VMARKER"};
void
vn_printf(struct vnode *vp, const char *fmt, ...)
{
va_list ap;
char buf[256], buf2[16];
u_long flags;
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("%p: ", (void *)vp);
printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
buf[0] = '\0';
buf[1] = '\0';
if (vp->v_vflag & VV_ROOT)
strlcat(buf, "|VV_ROOT", sizeof(buf));
if (vp->v_vflag & VV_ISTTY)
strlcat(buf, "|VV_ISTTY", sizeof(buf));
if (vp->v_vflag & VV_NOSYNC)
strlcat(buf, "|VV_NOSYNC", sizeof(buf));
if (vp->v_vflag & VV_ETERNALDEV)
strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
if (vp->v_vflag & VV_CACHEDLABEL)
strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
if (vp->v_vflag & VV_TEXT)
strlcat(buf, "|VV_TEXT", sizeof(buf));
if (vp->v_vflag & VV_COPYONWRITE)
strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
if (vp->v_vflag & VV_SYSTEM)
strlcat(buf, "|VV_SYSTEM", sizeof(buf));
if (vp->v_vflag & VV_PROCDEP)
strlcat(buf, "|VV_PROCDEP", sizeof(buf));
if (vp->v_vflag & VV_NOKNOTE)
strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
if (vp->v_vflag & VV_DELETED)
strlcat(buf, "|VV_DELETED", sizeof(buf));
if (vp->v_vflag & VV_MD)
strlcat(buf, "|VV_MD", sizeof(buf));
if (vp->v_vflag & VV_FORCEINSMQ)
strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
if (vp->v_iflag & VI_MOUNT)
strlcat(buf, "|VI_MOUNT", sizeof(buf));
if (vp->v_iflag & VI_DOOMED)
strlcat(buf, "|VI_DOOMED", sizeof(buf));
if (vp->v_iflag & VI_FREE)
strlcat(buf, "|VI_FREE", sizeof(buf));
if (vp->v_iflag & VI_ACTIVE)
strlcat(buf, "|VI_ACTIVE", sizeof(buf));
if (vp->v_iflag & VI_DOINGINACT)
strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
if (vp->v_iflag & VI_OWEINACT)
strlcat(buf, "|VI_OWEINACT", sizeof(buf));
flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
printf(" flags (%s)\n", buf + 1);
if (mtx_owned(VI_MTX(vp)))
printf(" VI_LOCKed");
if (vp->v_object != NULL)
printf(" v_object %p ref %d pages %d "
"cleanbuf %d dirtybuf %d\n",
vp->v_object, vp->v_object->ref_count,
vp->v_object->resident_page_count,
vp->v_bufobj.bo_clean.bv_cnt,
vp->v_bufobj.bo_dirty.bv_cnt);
printf(" ");
lockmgr_printinfo(vp->v_vnlock);
if (vp->v_data != NULL)
VOP_PRINT(vp);
}
#ifdef DDB
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
*/
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
struct mount *mp;
struct vnode *vp;
/*
* Note: because this is DDB, we can't obey the locking semantics
* for these structures, which means we could catch an inconsistent
* state and dereference a nasty pointer. Not much to be done
* about that.
*/
db_printf("Locked vnodes\n");
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
vn_printf(vp, "vnode ");
}
}
}
/*
* Show details about the given vnode.
*/
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
struct vnode *vp;
if (!have_addr)
return;
vp = (struct vnode *)addr;
vn_printf(vp, "vnode ");
}
/*
* Show details about the given mount point.
*/
DB_SHOW_COMMAND(mount, db_show_mount)
{
struct mount *mp;
struct vfsopt *opt;
struct statfs *sp;
struct vnode *vp;
char buf[512];
uint64_t mflags;
u_int flags;
if (!have_addr) {
/* No address given, print short info about all mount points. */
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
db_printf("%p %s on %s (%s)\n", mp,
mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
if (db_pager_quit)
break;
}
db_printf("\nMore info: show mount <addr>\n");
return;
}
mp = (struct mount *)addr;
db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
buf[0] = '\0';
mflags = mp->mnt_flag;
#define MNT_FLAG(flag) do { \
if (mflags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 4, sizeof(buf)); \
mflags &= ~(flag); \
} \
} while (0)
MNT_FLAG(MNT_RDONLY);
MNT_FLAG(MNT_SYNCHRONOUS);
MNT_FLAG(MNT_NOEXEC);
MNT_FLAG(MNT_NOSUID);
MNT_FLAG(MNT_NFS4ACLS);
MNT_FLAG(MNT_UNION);
MNT_FLAG(MNT_ASYNC);
MNT_FLAG(MNT_SUIDDIR);
MNT_FLAG(MNT_SOFTDEP);
MNT_FLAG(MNT_NOSYMFOLLOW);
MNT_FLAG(MNT_GJOURNAL);
MNT_FLAG(MNT_MULTILABEL);
MNT_FLAG(MNT_ACLS);
MNT_FLAG(MNT_NOATIME);
MNT_FLAG(MNT_NOCLUSTERR);
MNT_FLAG(MNT_NOCLUSTERW);
MNT_FLAG(MNT_SUJ);
MNT_FLAG(MNT_EXRDONLY);
MNT_FLAG(MNT_EXPORTED);
MNT_FLAG(MNT_DEFEXPORTED);
MNT_FLAG(MNT_EXPORTANON);
MNT_FLAG(MNT_EXKERB);
MNT_FLAG(MNT_EXPUBLIC);
MNT_FLAG(MNT_LOCAL);
MNT_FLAG(MNT_QUOTA);
MNT_FLAG(MNT_ROOTFS);
MNT_FLAG(MNT_USER);
MNT_FLAG(MNT_IGNORE);
MNT_FLAG(MNT_UPDATE);
MNT_FLAG(MNT_DELEXPORT);
MNT_FLAG(MNT_RELOAD);
MNT_FLAG(MNT_FORCE);
MNT_FLAG(MNT_SNAPSHOT);
MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
if (mflags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%016jx", mflags);
}
db_printf(" mnt_flag = %s\n", buf);
buf[0] = '\0';
flags = mp->mnt_kern_flag;
#define MNT_KERN_FLAG(flag) do { \
if (flags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 5, sizeof(buf)); \
flags &= ~(flag); \
} \
} while (0)
MNT_KERN_FLAG(MNTK_UNMOUNTF);
MNT_KERN_FLAG(MNTK_ASYNC);
MNT_KERN_FLAG(MNTK_SOFTDEP);
MNT_KERN_FLAG(MNTK_NOINSMNTQ);
MNT_KERN_FLAG(MNTK_DRAINING);
MNT_KERN_FLAG(MNTK_REFEXPIRE);
MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
MNT_KERN_FLAG(MNTK_SHARED_WRITES);
MNT_KERN_FLAG(MNTK_NO_IOPF);
MNT_KERN_FLAG(MNTK_VGONE_UPPER);
MNT_KERN_FLAG(MNTK_VGONE_WAITER);
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
MNT_KERN_FLAG(MNTK_SUSPEND);
MNT_KERN_FLAG(MNTK_SUSPEND2);
MNT_KERN_FLAG(MNTK_SUSPENDED);
MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
MNT_KERN_FLAG(MNTK_NOKNOTE);
#undef MNT_KERN_FLAG
if (flags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%08x", flags);
}
db_printf(" mnt_kern_flag = %s\n", buf);
db_printf(" mnt_opt = ");
opt = TAILQ_FIRST(mp->mnt_opt);
if (opt != NULL) {
db_printf("%s", opt->name);
opt = TAILQ_NEXT(opt, link);
while (opt != NULL) {
db_printf(", %s", opt->name);
opt = TAILQ_NEXT(opt, link);
}
}
db_printf("\n");
sp = &mp->mnt_stat;
db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
"bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
"ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
"asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
(u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
(uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
(uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
(intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
(intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
(uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
(uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
(u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
db_printf(" mnt_cred = { uid=%u ruid=%u",
(u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
if (jailed(mp->mnt_cred))
db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
db_printf(" }\n");
db_printf(" mnt_ref = %d\n", mp->mnt_ref);
db_printf(" mnt_gen = %d\n", mp->mnt_gen);
db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
db_printf(" mnt_activevnodelistsize = %d\n",
mp->mnt_activevnodelistsize);
db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
db_printf(" mnt_secondary_accwrites = %d\n",
mp->mnt_secondary_accwrites);
db_printf(" mnt_gjprovider = %s\n",
mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
db_printf("\n\nList of active vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
if (vp->v_type != VMARKER) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
db_printf("\n\nList of inactive vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
}
#endif /* DDB */
/*
* Fill in a struct xvfsconf based on a struct vfsconf.
*/
static int
vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
/*
* These are unused in userland; we keep them
* to avoid breaking binary compatibility.
*/
xvfsp.vfc_vfsops = NULL;
xvfsp.vfc_next = NULL;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#ifdef COMPAT_FREEBSD32
struct xvfsconf32 {
uint32_t vfc_vfsops;
char vfc_name[MFSNAMELEN];
int32_t vfc_typenum;
int32_t vfc_refcount;
int32_t vfc_flags;
uint32_t vfc_next;
};
static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf32 xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif
/*
* Top level filesystem related information gathering.
*/
static int
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
{
struct vfsconf *vfsp;
int error;
error = 0;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
error = vfsconf2x32(req, vfsp);
else
#endif
error = vfsconf2x(req, vfsp);
if (error)
break;
}
vfsconf_sunlock();
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
"S,xvfsconf", "List of all configured filesystems");
#ifndef BURN_BRIDGES
static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1 - 1; /* XXX */
u_int namelen = arg2 + 1; /* XXX */
struct vfsconf *vfsp;
log(LOG_WARNING, "userland calling deprecated sysctl, "
"please rebuild world\n");
#if 1 || defined(COMPAT_PRELITE2)
/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
if (namelen == 1)
return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif
switch (name[1]) {
case VFS_MAXTYPENUM:
if (namelen != 2)
return (ENOTDIR);
return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
case VFS_CONF:
if (namelen != 3)
return (ENOTDIR); /* overloaded */
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (vfsp->vfc_typenum == name[2])
break;
}
vfsconf_sunlock();
if (vfsp == NULL)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
return (vfsconf2x32(req, vfsp));
else
#endif
return (vfsconf2x(req, vfsp));
}
return (EOPNOTSUPP);
}
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
CTLFLAG_MPSAFE, vfs_sysctl,
"Generic filesystem");
#if 1 || defined(COMPAT_PRELITE2)
static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
int error;
struct vfsconf *vfsp;
struct ovfsconf ovfs;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
bzero(&ovfs, sizeof(ovfs));
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
strcpy(ovfs.vfc_name, vfsp->vfc_name);
ovfs.vfc_index = vfsp->vfc_typenum;
ovfs.vfc_refcount = vfsp->vfc_refcount;
ovfs.vfc_flags = vfsp->vfc_flags;
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
if (error != 0) {
vfsconf_sunlock();
return (error);
}
}
vfsconf_sunlock();
return (0);
}
#endif /* 1 || COMPAT_PRELITE2 */
#endif /* !BURN_BRIDGES */
#define KINFO_VNODESLOP 10
#ifdef notyet
/*
* Dump vnode list (via sysctl).
*/
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
struct xvnode *xvn;
struct mount *mp;
struct vnode *vp;
int error, len, n;
/*
* Stale numvnodes access is not fatal here.
*/
req->lock = 0;
len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
if (!req->oldptr)
/* Make an estimate */
return (SYSCTL_OUT(req, 0, len));
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
n = 0;
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
continue;
MNT_ILOCK(mp);
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (n == len)
break;
vref(vp);
xvn[n].xv_size = sizeof *xvn;
xvn[n].xv_vnode = vp;
xvn[n].xv_id = 0; /* XXX compat */
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
XV_COPY(usecount);
XV_COPY(writecount);
XV_COPY(holdcnt);
XV_COPY(mount);
XV_COPY(numoutput);
XV_COPY(type);
#undef XV_COPY
xvn[n].xv_flag = vp->v_vflag;
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
break;
case VBLK:
case VCHR:
if (vp->v_rdev == NULL) {
vrele(vp);
continue;
}
xvn[n].xv_dev = dev2udev(vp->v_rdev);
break;
case VSOCK:
xvn[n].xv_socket = vp->v_socket;
break;
case VFIFO:
xvn[n].xv_fifo = vp->v_fifoinfo;
break;
case VNON:
case VBAD:
default:
/* shouldn't happen? */
vrele(vp);
continue;
}
vrele(vp);
++n;
}
MNT_IUNLOCK(mp);
mtx_lock(&mountlist_mtx);
vfs_unbusy(mp);
if (n == len)
break;
}
mtx_unlock(&mountlist_mtx);
error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
free(xvn, M_TEMP);
return (error);
}
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
"");
#endif
static void
unmount_or_warn(struct mount *mp)
{
int error;
error = dounmount(mp, MNT_FORCE, curthread);
if (error != 0) {
printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
if (error == EBUSY)
printf("BUSY)\n");
else
printf("%d)\n", error);
}
}
/*
* Unmount all filesystems. The list is traversed in reverse order
* of mounting to avoid dependencies.
*/
void
vfs_unmountall(void)
{
struct mount *mp, *tmp;
CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
/*
* Since this only runs when rebooting, it is not interlocked.
*/
TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
vfs_ref(mp);
/*
* Forcibly unmounting "/dev" before "/" would prevent clean
* unmount of the latter.
*/
if (mp == rootdevmp)
continue;
unmount_or_warn(mp);
}
if (rootdevmp != NULL)
unmount_or_warn(rootdevmp);
}
/*
* Perform msync on all vnodes under a mount point.
* The mount point must be locked.
*/
void
vfs_msync(struct mount *mp, int flags)
{
struct vnode *vp, *mvp;
struct vm_object *obj;
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
vnlru_return_batch(mp);
MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
obj = vp->v_object;
if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
(flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
if (!vget(vp,
LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
curthread)) {
if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
vput(vp);
continue;
}
obj = vp->v_object;
if (obj != NULL) {
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0,
flags == MNT_WAIT ?
OBJPC_SYNC : OBJPC_NOSYNC);
VM_OBJECT_WUNLOCK(obj);
}
vput(vp);
}
} else
VI_UNLOCK(vp);
}
}
static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{
knlist_destroy(&vi->vpi_selinfo.si_note);
mtx_destroy(&vi->vpi_lock);
uma_zfree(vnodepoll_zone, vi);
}
static void
destroy_vpollinfo(struct vpollinfo *vi)
{
knlist_clear(&vi->vpi_selinfo.si_note, 1);
seldrain(&vi->vpi_selinfo);
destroy_vpollinfo_free(vi);
}
/*
* Initialize per-vnode helper structure to hold poll-related state.
*/
void
v_addpollinfo(struct vnode *vp)
{
struct vpollinfo *vi;
if (vp->v_pollinfo != NULL)
return;
vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
destroy_vpollinfo_free(vi);
return;
}
vp->v_pollinfo = vi;
VI_UNLOCK(vp);
}
/*
* Record a process's interest in events which might happen to
* a vnode. Because poll uses the historic select-style interface
* internally, this routine serves as both the ``check for any
* pending events'' and the ``record my interest in future events''
* functions. (These are done together, while the lock is held,
* to avoid race conditions.)
*/
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
v_addpollinfo(vp);
mtx_lock(&vp->v_pollinfo->vpi_lock);
if (vp->v_pollinfo->vpi_revents & events) {
/*
* This leaves events we are not interested
* in available for the other process which
* presumably had requested them
* (otherwise they would never have been
* recorded).
*/
events &= vp->v_pollinfo->vpi_revents;
vp->v_pollinfo->vpi_revents &= ~events;
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (events);
}
vp->v_pollinfo->vpi_events |= events;
selrecord(td, &vp->v_pollinfo->vpi_selinfo);
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (0);
}
/*
* Routine to create and manage a filesystem syncer vnode.
*/
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);
static struct vop_vector sync_vnodeops = {
.vop_bypass = VOP_EOPNOTSUPP,
.vop_close = sync_close, /* close */
.vop_fsync = sync_fsync, /* fsync */
.vop_inactive = sync_inactive, /* inactive */
.vop_reclaim = sync_reclaim, /* reclaim */
.vop_lock1 = vop_stdlock, /* lock */
.vop_unlock = vop_stdunlock, /* unlock */
.vop_islocked = vop_stdislocked, /* islocked */
};
/*
* Create a new filesystem syncer vnode for the specified mount point.
*/
void
vfs_allocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
struct bufobj *bo;
static long start, incr, next;
int error;
/* Allocate a new vnode */
error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
if (error != 0)
panic("vfs_allocate_syncvnode: getnewvnode() failed");
vp->v_type = VNON;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_FORCEINSMQ;
error = insmntque(vp, mp);
if (error != 0)
panic("vfs_allocate_syncvnode: insmntque() failed");
vp->v_vflag &= ~VV_FORCEINSMQ;
VOP_UNLOCK(vp, 0);
/*
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
* at evenly distributed times even if all the filesystems
* are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
mtx_lock(&sync_mtx);
sync_vnode_count++;
if (mp->mnt_syncer == NULL) {
mp->mnt_syncer = vp;
vp = NULL;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
if (vp != NULL) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vgone(vp);
vput(vp);
}
}
void
vfs_deallocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
mtx_lock(&sync_mtx);
vp = mp->mnt_syncer;
if (vp != NULL)
mp->mnt_syncer = NULL;
mtx_unlock(&sync_mtx);
if (vp != NULL)
vrele(vp);
}
/*
* Do a lazy sync of the filesystem.
*/
static int
sync_fsync(struct vop_fsync_args *ap)
{
struct vnode *syncvp = ap->a_vp;
struct mount *mp = syncvp->v_mount;
int error, save;
struct bufobj *bo;
/*
* We only need to do something if this is a lazy evaluation.
*/
if (ap->a_waitfor != MNT_LAZY)
return (0);
/*
* Move ourselves to the back of the sync list.
*/
bo = &syncvp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay);
BO_UNLOCK(bo);
/*
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
*/
if (vfs_busy(mp, MBF_NOWAIT) != 0)
return (0);
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
vfs_unbusy(mp);
return (0);
}
save = curthread_pflags_set(TDP_SYNCIO);
vfs_msync(mp, MNT_NOWAIT);
error = VFS_SYNC(mp, MNT_LAZY);
curthread_pflags_restore(save);
vn_finished_write(mp);
vfs_unbusy(mp);
return (error);
}
/*
* The syncer vnode is no longer referenced.
*/
static int
sync_inactive(struct vop_inactive_args *ap)
{
vgone(ap->a_vp);
return (0);
}
/*
* The syncer vnode is no longer needed and is being decommissioned.
*
* Modifications to the worklist must be protected by sync_mtx.
*/
static int
sync_reclaim(struct vop_reclaim_args *ap)
{
struct vnode *vp = ap->a_vp;
struct bufobj *bo;
bo = &vp->v_bufobj;
BO_LOCK(bo);
mtx_lock(&sync_mtx);
if (vp->v_mount->mnt_syncer == vp)
vp->v_mount->mnt_syncer = NULL;
if (bo->bo_flag & BO_ONWORKLST) {
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
sync_vnode_count--;
bo->bo_flag &= ~BO_ONWORKLST;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
return (0);
}
/*
* Check if vnode represents a disk device
*/
int
vn_isdisk(struct vnode *vp, int *errp)
{
int error;
if (vp->v_type != VCHR) {
error = ENOTBLK;
goto out;
}
error = 0;
dev_lock();
if (vp->v_rdev == NULL)
error = ENXIO;
else if (vp->v_rdev->si_devsw == NULL)
error = ENXIO;
else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
error = ENOTBLK;
dev_unlock();
out:
if (errp != NULL)
*errp = error;
return (error == 0);
}
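/*
 * Usage sketch: callers typically reject non-disk vnodes up front and
 * propagate the error that vn_isdisk() fills in:
 *
 *	if (!vn_isdisk(vp, &error))
 *		return (error);
 */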
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
* and optional call-by-reference privused argument allowing vaccess()
* to indicate to the caller whether privilege was used to satisfy the
* request (obsoleted). Returns 0 on success, or an errno on failure.
*/
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
accmode_t accmode, struct ucred *cred, int *privused)
{
accmode_t dac_granted;
accmode_t priv_granted;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
("invalid bit in accmode"));
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
("VAPPEND without VWRITE"));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
if (privused != NULL)
*privused = 0;
dac_granted = 0;
/* Check the owner. */
if (cred->cr_uid == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check the groups (first match) */
if (groupmember(file_gid, cred)) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
privcheck:
/*
* Build a privilege mask to determine if the set of privileges
* satisfies the requirements when combined with the granted mask
* from above. For each privilege, if the privilege is required,
* bitwise or the request type onto the priv_granted mask.
*/
priv_granted = 0;
if (type == VDIR) {
/*
* For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
* requests, instead of PRIV_VFS_EXEC.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
!priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
priv_granted |= VEXEC;
} else {
/*
* Ensure that at least one execute bit is on. Otherwise,
* a privileged user will always succeed, and we don't want
* this to happen unless the file really is executable.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
!priv_check_cred(cred, PRIV_VFS_EXEC, 0))
priv_granted |= VEXEC;
}
if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
!priv_check_cred(cred, PRIV_VFS_READ, 0))
priv_granted |= VREAD;
if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
!priv_check_cred(cred, PRIV_VFS_WRITE, 0))
priv_granted |= (VWRITE | VAPPEND);
if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
!priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
priv_granted |= VADMIN;
if ((accmode & (priv_granted | dac_granted)) == accmode) {
/* XXX audit: privilege used */
if (privused != NULL)
*privused = 1;
return (0);
}
return ((accmode & VADMIN) ? EPERM : EACCES);
}
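/*
 * Illustrative sketch (not part of the original change): a typical
 * filesystem VOP_ACCESS method simply translates its own inode fields and
 * defers the DAC and privilege logic to vaccess().  The "myfs_node"
 * structure and its n_mode/n_uid/n_gid fields are hypothetical.
 */
#if 0
static int
myfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct myfs_node *np = vp->v_data;

	/* Delegate the owner/group/other and privilege checks. */
	return (vaccess(vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
#endif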
/*
* Credential check based on process requesting service, and per-attribute
* permissions.
*/
int
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
struct thread *td, accmode_t accmode)
{
/*
* Kernel-invoked always succeeds.
*/
if (cred == NOCRED)
return (0);
/*
* Do not allow privileged processes in jail to directly manipulate
* system attributes.
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
/* Potentially should be: return (EPERM); */
return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
case EXTATTR_NAMESPACE_USER:
return (VOP_ACCESS(vp, accmode, cred, td));
default:
return (EPERM);
}
}
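/*
 * Illustrative sketch (not part of the original change): extended
 * attribute VOPs typically gate on this helper before touching attribute
 * data, passing VREAD for retrieval and VWRITE for set/delete.  "error"
 * is assumed to be declared by the surrounding (hypothetical) VOP.
 */
#if 0
	/* Fragment of a hypothetical VOP_GETEXTATTR implementation. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
#endif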
#ifdef DEBUG_VFS_LOCKS
/*
* This only exists to suppress warnings from unlocked specfs accesses. It is
* no longer ok to have an unlocked VFS.
*/
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
(vp)->v_type == VCHR || (vp)->v_type == VBAD)
int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
"Drop into debugger on lock violation");
int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
0, "Check for interlock across VOPs");
int vfs_badlock_print = 1; /* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
0, "Print lock violations");
int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
0, "Print vnode details on lock violations");
#ifdef KDB
int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
&vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{
#ifdef KDB
if (vfs_badlock_backtrace)
kdb_backtrace();
#endif
if (vfs_badlock_vnode)
vn_printf(vp, "vnode ");
if (vfs_badlock_print)
printf("%s: %p %s\n", str, (void *)vp, msg);
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
void
assert_vi_locked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is not locked but should be", str, vp);
}
void
assert_vi_unlocked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is locked but should not be", str, vp);
}
void
assert_vop_locked(struct vnode *vp, const char *str)
{
int locked;
if (!IGNORE_LOCK(vp)) {
locked = VOP_ISLOCKED(vp);
if (locked == 0 || locked == LK_EXCLOTHER)
vfs_badlock("is not locked but should be", str, vp);
}
}
void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
vfs_badlock("is locked but should not be", str, vp);
}
void
assert_vop_elocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
vfs_badlock("is not exclusive locked but should be", str, vp);
}
#endif /* DEBUG_VFS_LOCKS */
void
vop_rename_fail(struct vop_rename_args *ap)
{
if (ap->a_tvp != NULL)
vput(ap->a_tvp);
if (ap->a_tdvp == ap->a_tvp)
vrele(ap->a_tdvp);
else
vput(ap->a_tdvp);
vrele(ap->a_fdvp);
vrele(ap->a_fvp);
}
void
vop_rename_pre(void *ap)
{
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
/* Check the source (from). */
if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
(a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
/* Check the target. */
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vhold(a->a_fvp);
vhold(a->a_tdvp);
if (a->a_tvp)
vhold(a->a_tvp);
}
#ifdef DEBUG_VFS_LOCKS
void
vop_strategy_pre(void *ap)
{
struct vop_strategy_args *a;
struct buf *bp;
a = ap;
bp = a->a_bp;
/*
* Cluster ops lock their component buffers but not the IO container.
*/
if ((bp->b_flags & B_CLUSTER) != 0)
return;
if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
if (vfs_badlock_print)
printf(
"VOP_STRATEGY: bp is not locked but should be\n");
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
}
void
vop_lock_pre(void *ap)
{
struct vop_lock1_args *a = ap;
if ((a->a_flags & LK_INTERLOCK) == 0)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
else
ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_lock_post(void *ap, int rc)
{
struct vop_lock1_args *a = ap;
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_unlock_pre(void *ap)
{
struct vop_unlock_args *a = ap;
if (a->a_flags & LK_INTERLOCK)
ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
}
void
vop_unlock_post(void *ap, int rc)
{
struct vop_unlock_args *a = ap;
if (a->a_flags & LK_INTERLOCK)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
}
#endif
void
vop_create_post(void *ap, int rc)
{
struct vop_create_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_deleteextattr_post(void *ap, int rc)
{
struct vop_deleteextattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_link_post(void *ap, int rc)
{
struct vop_link_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
}
}
void
vop_mkdir_post(void *ap, int rc)
{
struct vop_mkdir_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
}
void
vop_mknod_post(void *ap, int rc)
{
struct vop_mknod_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_reclaim_post(void *ap, int rc)
{
struct vop_reclaim_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
}
void
vop_remove_post(void *ap, int rc)
{
struct vop_remove_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
}
}
void
vop_rename_post(void *ap, int rc)
{
struct vop_rename_args *a = ap;
long hint;
if (!rc) {
hint = NOTE_WRITE;
if (a->a_fdvp == a->a_tdvp) {
if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
} else {
hint |= NOTE_EXTEND;
if (a->a_fvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
a->a_tvp->v_type == VDIR)
hint &= ~NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
}
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
}
if (a->a_tdvp != a->a_fdvp)
vdrop(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vdrop(a->a_fvp);
vdrop(a->a_tdvp);
if (a->a_tvp)
vdrop(a->a_tvp);
}
void
vop_rmdir_post(void *ap, int rc)
{
struct vop_rmdir_args *a = ap;
if (!rc) {
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
}
}
void
vop_setattr_post(void *ap, int rc)
{
struct vop_setattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_setextattr_post(void *ap, int rc)
{
struct vop_setextattr_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_symlink_post(void *ap, int rc)
{
struct vop_symlink_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
void
vop_open_post(void *ap, int rc)
{
struct vop_open_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
}
void
vop_close_post(void *ap, int rc)
{
struct vop_close_args *a = ap;
if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
(a->a_vp->v_iflag & VI_DOOMED) == 0)) {
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
NOTE_CLOSE_WRITE : NOTE_CLOSE);
}
}
void
vop_read_post(void *ap, int rc)
{
struct vop_read_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
void
vop_readdir_post(void *ap, int rc)
{
struct vop_readdir_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
static struct knlist fs_knlist;
static void
vfs_event_init(void *arg)
{
knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{
KNOTE_UNLOCKED(&fs_knlist, event);
}
static int filt_fsattach(struct knote *kn);
static void filt_fsdetach(struct knote *kn);
static int filt_fsevent(struct knote *kn, long hint);
struct filterops fs_filtops = {
.f_isfd = 0,
.f_attach = filt_fsattach,
.f_detach = filt_fsdetach,
.f_event = filt_fsevent
};
static int
filt_fsattach(struct knote *kn)
{
kn->kn_flags |= EV_CLEAR;
knlist_add(&fs_knlist, kn, 0);
return (0);
}
static void
filt_fsdetach(struct knote *kn)
{
knlist_remove(&fs_knlist, kn, 0);
}
static int
filt_fsevent(struct knote *kn, long hint)
{
kn->kn_fflags |= hint;
return (kn->kn_fflags != 0);
}
static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
struct vfsidctl vc;
int error;
struct mount *mp;
error = SYSCTL_IN(req, &vc, sizeof(vc));
if (error)
return (error);
if (vc.vc_vers != VFS_CTL_VERS1)
return (EINVAL);
mp = vfs_getvfs(&vc.vc_fsid);
if (mp == NULL)
return (ENOENT);
/* ensure that a specific sysctl goes to the right filesystem. */
if (strcmp(vc.vc_fstypename, "*") != 0 &&
strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
vfs_rel(mp);
return (EINVAL);
}
VCTLTOREQ(&vc, req);
error = VFS_SYSCTL(mp, vc.vc_op, req);
vfs_rel(mp);
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
NULL, 0, sysctl_vfs_ctl, "",
"Sysctl by fsid");
/*
* Function to initialize a va_filerev field sensibly.
* XXX: Wouldn't a random number make a lot more sense ??
*/
u_quad_t
init_va_filerev(void)
{
struct bintime bt;
getbinuptime(&bt);
return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}
static int filt_vfsread(struct knote *kn, long hint);
static int filt_vfswrite(struct knote *kn, long hint);
static int filt_vfsvnode(struct knote *kn, long hint);
static void filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsvnode
};
static void
vfs_knllock(void *arg)
{
struct vnode *vp = arg;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
static void
vfs_knlunlock(void *arg)
{
struct vnode *vp = arg;
VOP_UNLOCK(vp, 0);
}
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
struct vnode *vp = arg;
ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
struct vnode *vp = arg;
ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
struct vnode *vp = ap->a_vp;
struct knote *kn = ap->a_kn;
struct knlist *knl;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &vfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &vfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &vfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = (caddr_t)vp;
v_addpollinfo(vp);
if (vp->v_pollinfo == NULL)
return (ENOMEM);
knl = &vp->v_pollinfo->vpi_selinfo.si_note;
vhold(vp);
knlist_add(knl, kn, 0);
return (0);
}
/*
* Detach knote from vnode
*/
static void
filt_vfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
vdrop(vp);
}
/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
struct vattr va;
int res;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
VI_LOCK(vp);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
VI_UNLOCK(vp);
return (1);
}
if (VOP_GETATTR(vp, &va, curthread->td_ucred))
return (0);
VI_LOCK(vp);
kn->kn_data = va.va_size - kn->kn_fp->f_offset;
res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
VI_UNLOCK(vp);
return (res);
}
/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
VI_LOCK(vp);
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = 0;
VI_UNLOCK(vp);
return (1);
}
static int
filt_vfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int res;
VI_LOCK(vp);
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
kn->kn_flags |= EV_EOF;
VI_UNLOCK(vp);
return (1);
}
res = (kn->kn_fflags != 0);
VI_UNLOCK(vp);
return (res);
}
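/*
 * Illustrative sketch (not part of the original change): the NOTE_* hints
 * posted by the vop_*_post() handlers above reach userland through
 * EVFILT_VNODE.  This is a minimal userland monitor, assuming "fd" is an
 * already-opened descriptor for the file of interest.
 */
#if 0
	struct kevent kev;
	int kq;

	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_ATTRIB | NOTE_DELETE | NOTE_RENAME, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	/* Later: kevent(kq, NULL, 0, &kev, 1, NULL) blocks for a change. */
#endif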
int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
int error;
if (dp->d_reclen > ap->a_uio->uio_resid)
return (ENAMETOOLONG);
error = uiomove(dp, dp->d_reclen, ap->a_uio);
if (error) {
if (ap->a_ncookies != NULL) {
if (ap->a_cookies != NULL)
free(ap->a_cookies, M_TEMP);
ap->a_cookies = NULL;
*ap->a_ncookies = 0;
}
return (error);
}
if (ap->a_ncookies == NULL)
return (0);
KASSERT(ap->a_cookies,
("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
*ap->a_cookies = realloc(*ap->a_cookies,
(*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
(*ap->a_cookies)[*ap->a_ncookies] = off;
*ap->a_ncookies += 1;
return (0);
}
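/*
 * Illustrative sketch (not part of the original change): a VOP_READDIR
 * implementation typically fills one struct dirent per entry and lets
 * vfs_read_dirent() copy it out and record the seek cookie.  The "name",
 * "fileno" and "next_off" variables are hypothetical.
 */
#if 0
	struct dirent dp;

	dp.d_fileno = fileno;
	dp.d_type = DT_REG;
	dp.d_namlen = strlen(name);
	strlcpy(dp.d_name, name, sizeof(dp.d_name));
	dp.d_reclen = GENERIC_DIRSIZ(&dp);
	error = vfs_read_dirent(ap, &dp, next_off);
	if (error != 0)		/* uio full (ENAMETOOLONG) or copy error */
		return (error == ENAMETOOLONG ? 0 : error);
#endif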
/*
* Mark for update the access time of the file if the filesystem
* supports VOP_MARKATIME. This functionality is used by execve and
* mmap, so we want to avoid the I/O implied by directly setting
* va_atime for the sake of efficiency.
*/
void
vfs_mark_atime(struct vnode *vp, struct ucred *cred)
{
struct mount *mp;
mp = vp->v_mount;
ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
(void)VOP_MARKATIME(vp);
}
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
 * There is no way to specify an explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
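/*
 * Illustrative sketch (not part of the original change): a caller holding
 * NFSv4-style accmode bits coarsens them first and only then performs the
 * usual check; the request may legitimately collapse to nothing.  The
 * input value is just an example, and "vp" and "cred" are assumed to be
 * in scope.
 */
#if 0
	accmode_t accmode = VREAD | VREAD_ATTRIBUTES;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error == 0 && accmode != 0)
		error = VOP_ACCESS(vp, accmode, cred, curthread);
#endif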
/*
* These are helper functions for filesystems to traverse all
* their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
*
* This interface replaces MNT_VNODE_FOREACH.
*/
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
if (should_yield())
kern_yield(PRI_USER);
MNT_ILOCK(mp);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
continue;
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOOMED) != 0) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
__mnt_vnode_markerfree_all(mvp, mp);
/* MNT_IUNLOCK(mp); -- done in above function */
mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
return (NULL);
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
MNT_REF(mp);
(*mvp)->v_mount = mp;
(*mvp)->v_type = VMARKER;
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
continue;
VI_LOCK(vp);
if ((vp->v_iflag & VI_DOOMED) != 0) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL) {
MNT_IUNLOCK(mp);
return;
}
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
}
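/*
 * Illustrative sketch (not part of the original change): filesystems do
 * not call the __mnt_vnode_*() functions directly but go through the
 * MNT_VNODE_FOREACH_ALL() macro, which hands back each vnode with its
 * interlock held.  "mp" is assumed to be the mount point in scope.
 */
#if 0
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
		    curthread) != 0)
			continue;
		/* ... per-vnode work ... */
		vput(vp);
	}
#endif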
/*
* These are helper functions for filesystems to traverse their
* active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
*/
static void
mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
*mvp = NULL;
}
/*
* Relock the mp mount vnode list lock with the vp vnode interlock in the
* conventional lock order during mnt_vnode_next_active iteration.
*
* On entry, the mount vnode list lock is held and the vnode interlock is not.
* The list lock is dropped and reacquired. On success, both locks are held.
* On failure, the mount vnode list lock is held but the vnode interlock is
* not, and the procedure may have yielded.
*/
static bool
mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
struct vnode *vp)
{
const struct vnode *tmp;
bool held, ret;
VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
("%s: bad marker", __func__));
VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
("%s: inappropriate vnode", __func__));
ASSERT_VI_UNLOCKED(vp, __func__);
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
ret = false;
TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
/*
* Use a hold to prevent vp from disappearing while the mount vnode
* list lock is dropped and reacquired. Normally a hold would be
* acquired with vhold(), but that might try to acquire the vnode
* interlock, which would be a LOR with the mount vnode list lock.
*/
held = vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt);
mtx_unlock(&mp->mnt_listmtx);
if (!held)
goto abort;
VI_LOCK(vp);
if (!vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
vdropl(vp);
goto abort;
}
mtx_lock(&mp->mnt_listmtx);
/*
* Determine whether the vnode is still the next one after the marker,
* excepting any other markers. If the vnode has not been doomed by
* vgone() then the hold should have ensured that it remained on the
* active list. If it has been doomed but is still on the active list,
* don't abort, but rather skip over it (avoid spinning on doomed
* vnodes).
*/
tmp = mvp;
do {
tmp = TAILQ_NEXT(tmp, v_actfreelist);
} while (tmp != NULL && tmp->v_type == VMARKER);
if (tmp != vp) {
mtx_unlock(&mp->mnt_listmtx);
VI_UNLOCK(vp);
goto abort;
}
ret = true;
goto out;
abort:
maybe_yield();
mtx_lock(&mp->mnt_listmtx);
out:
if (ret)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
return (ret);
}
static struct vnode *
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp, *nvp;
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
vp = TAILQ_NEXT(*mvp, v_actfreelist);
while (vp != NULL) {
if (vp->v_type == VMARKER) {
vp = TAILQ_NEXT(vp, v_actfreelist);
continue;
}
/*
* Try-lock because this is the wrong lock order. If that does
* not succeed, drop the mount vnode list lock and try to
* reacquire it and the vnode interlock in the right order.
*/
if (!VI_TRYLOCK(vp) &&
!mnt_vnode_next_active_relock(*mvp, mp, vp))
goto restart;
KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
("alien vnode on the active list %p %p", vp, mp));
if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
break;
nvp = TAILQ_NEXT(vp, v_actfreelist);
VI_UNLOCK(vp);
vp = nvp;
}
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
/* Check if we are done */
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
mtx_unlock(&mp->mnt_listmtx);
ASSERT_VI_LOCKED(vp, "active iter");
KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
return (vp);
}
struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
if (should_yield())
kern_yield(PRI_USER);
mtx_lock(&mp->mnt_listmtx);
return (mnt_vnode_next_active(mvp, mp));
}
struct vnode *
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
(*mvp)->v_type = VMARKER;
(*mvp)->v_mount = mp;
mtx_lock(&mp->mnt_listmtx);
vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
return (NULL);
}
TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
return (mnt_vnode_next_active(mvp, mp));
}
void
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL)
return;
mtx_lock(&mp->mnt_listmtx);
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
}
Index: head/sys/net/if_ethersubr.c
===================================================================
--- head/sys/net/if_ethersubr.c (revision 327172)
+++ head/sys/net/if_ethersubr.c (revision 327173)
@@ -1,1255 +1,1253 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_netgraph.h"
#include "opt_mbuf_profiling.h"
#include "opt_rss.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/uuid.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/if_llc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_vlan_var.h>
#include <net/if_llatbl.h>
#include <net/pfil.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <netpfil/pf/pf_mtag.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip_carp.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#include <netinet6/nd6.h>
#endif
#include <security/mac/mac_framework.h>
#ifdef CTASSERT
CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
#endif
VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */
/* netgraph node hooks for ng_ether(4) */
void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_attach_p)(struct ifnet *ifp);
void (*ng_ether_detach_p)(struct ifnet *ifp);
void (*vlan_input_p)(struct ifnet *, struct mbuf *);
/* if_bridge(4) support */
struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *);
int (*bridge_output_p)(struct ifnet *, struct mbuf *,
struct sockaddr *, struct rtentry *);
void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
/* if_lagg(4) support */
struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *);
static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
static int ether_resolvemulti(struct ifnet *, struct sockaddr **,
struct sockaddr *);
#ifdef VIMAGE
static void ether_reassign(struct ifnet *, struct vnet *, char *);
#endif
static int ether_requestencap(struct ifnet *, struct if_encap_req *);
#define senderr(e) do { error = (e); goto bad;} while (0)
static void
update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
{
int csum_flags = 0;
if (src->m_pkthdr.csum_flags & CSUM_IP)
csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
if (src->m_pkthdr.csum_flags & CSUM_SCTP)
csum_flags |= CSUM_SCTP_VALID;
dst->m_pkthdr.csum_flags |= csum_flags;
if (csum_flags & CSUM_DATA_VALID)
dst->m_pkthdr.csum_data = 0xffff;
}
/*
* Handle link-layer encapsulation requests.
*/
static int
ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
{
struct ether_header *eh;
struct arphdr *ah;
uint16_t etype;
const u_char *lladdr;
if (req->rtype != IFENCAP_LL)
return (EOPNOTSUPP);
if (req->bufsize < ETHER_HDR_LEN)
return (ENOMEM);
eh = (struct ether_header *)req->buf;
lladdr = req->lladdr;
req->lladdr_off = 0;
switch (req->family) {
case AF_INET:
etype = htons(ETHERTYPE_IP);
break;
case AF_INET6:
etype = htons(ETHERTYPE_IPV6);
break;
case AF_ARP:
ah = (struct arphdr *)req->hdata;
ah->ar_hrd = htons(ARPHRD_ETHER);
switch(ntohs(ah->ar_op)) {
case ARPOP_REVREQUEST:
case ARPOP_REVREPLY:
etype = htons(ETHERTYPE_REVARP);
break;
case ARPOP_REQUEST:
case ARPOP_REPLY:
default:
etype = htons(ETHERTYPE_ARP);
break;
}
if (req->flags & IFENCAP_FLAG_BROADCAST)
lladdr = ifp->if_broadcastaddr;
break;
default:
return (EAFNOSUPPORT);
}
memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
req->bufsize = sizeof(struct ether_header);
return (0);
}
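/*
 * Illustrative sketch (not part of the original change): callers such as
 * the ARP/ND code obtain a prepend header by filling an if_encap_req and
 * invoking the interface method that resolves to ether_requestencap()
 * above.  "buf" and "lladdr" are assumed to be supplied by the caller.
 */
#if 0
	struct if_encap_req ereq;

	bzero(&ereq, sizeof(ereq));
	ereq.buf = buf;
	ereq.bufsize = ETHER_HDR_LEN;
	ereq.rtype = IFENCAP_LL;
	ereq.family = AF_INET;
	ereq.lladdr = lladdr;
	error = ifp->if_requestencap(ifp, &ereq);
#endif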
static int
ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro, u_char *phdr,
uint32_t *pflags, struct llentry **plle)
{
struct ether_header *eh;
uint32_t lleflags = 0;
int error = 0;
#if defined(INET) || defined(INET6)
uint16_t etype;
#endif
if (plle)
*plle = NULL;
eh = (struct ether_header *)phdr;
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
error = arpresolve(ifp, 0, m, dst, phdr, &lleflags,
plle);
else {
if (m->m_flags & M_BCAST)
memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
ETHER_ADDR_LEN);
else {
const struct in_addr *a;
a = &(((const struct sockaddr_in *)dst)->sin_addr);
ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
}
etype = htons(ETHERTYPE_IP);
memcpy(&eh->ether_type, &etype, sizeof(etype));
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
}
break;
#endif
#ifdef INET6
case AF_INET6:
if ((m->m_flags & M_MCAST) == 0)
error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags,
plle);
else {
const struct in6_addr *a6;
a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
etype = htons(ETHERTYPE_IPV6);
memcpy(&eh->ether_type, &etype, sizeof(etype));
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
}
break;
#endif
default:
if_printf(ifp, "can't handle af%d\n", dst->sa_family);
if (m != NULL)
m_freem(m);
return (EAFNOSUPPORT);
}
if (error == EHOSTDOWN) {
if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
error = EHOSTUNREACH;
}
if (error != 0)
return (error);
*pflags = RT_MAY_LOOP;
if (lleflags & LLE_IFADDR)
*pflags |= RT_L2_ME;
return (0);
}
/*
* Ethernet output routine.
* Encapsulate a packet of type family for the local net.
* Use trailer local net encapsulation if enough data in first
* packet leaves a multiple of 512 bytes of data in remainder.
*/
int
ether_output(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro)
{
int error = 0;
char linkhdr[ETHER_HDR_LEN], *phdr;
struct ether_header *eh;
struct pf_mtag *t;
int loop_copy = 1;
int hlen; /* link layer header length */
uint32_t pflags;
struct llentry *lle = NULL;
- struct rtentry *rt0 = NULL;
int addref = 0;
phdr = NULL;
pflags = 0;
if (ro != NULL) {
/* XXX BPF uses ro_prepend */
if (ro->ro_prepend != NULL) {
phdr = ro->ro_prepend;
hlen = ro->ro_plen;
} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
lle = ro->ro_lle;
if (lle != NULL &&
(lle->la_flags & LLE_VALID) == 0) {
LLE_FREE(lle);
lle = NULL; /* redundant */
ro->ro_lle = NULL;
}
if (lle == NULL) {
/* if we lookup, keep cache */
addref = 1;
}
}
if (lle != NULL) {
phdr = lle->r_linkdata;
hlen = lle->r_hdrlen;
pflags = lle->r_flags;
}
}
- rt0 = ro->ro_rt;
}
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error)
senderr(error);
#endif
M_PROFILE(m);
if (ifp->if_flags & IFF_MONITOR)
senderr(ENETDOWN);
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
senderr(ENETDOWN);
if (phdr == NULL) {
/* No prepend data supplied. Try to calculate ourselves. */
phdr = linkhdr;
hlen = ETHER_HDR_LEN;
error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
addref ? &lle : NULL);
if (addref && lle != NULL)
ro->ro_lle = lle;
if (error != 0)
return (error == EWOULDBLOCK ? 0 : error);
}
if ((pflags & RT_L2_ME) != 0) {
update_mbuf_csumflags(m, m);
return (if_simloop(ifp, m, dst->sa_family, 0));
}
loop_copy = pflags & RT_MAY_LOOP;
/*
* Add local net header. If no space in first mbuf,
* allocate another.
*
* Note that we do prepend regardless of RT_HAS_HEADER flag.
* This is done because BPF code shifts m_data pointer
* to the end of ethernet header prior to calling if_output().
*/
M_PREPEND(m, hlen, M_NOWAIT);
if (m == NULL)
senderr(ENOBUFS);
if ((pflags & RT_HAS_HEADER) == 0) {
eh = mtod(m, struct ether_header *);
memcpy(eh, phdr, hlen);
}
/*
* If a simplex interface, and the packet is being sent to our
* Ethernet address or a broadcast address, loopback a copy.
* XXX To make a simplex device behave exactly like a duplex
* device, we should copy in the case of sending to our own
* ethernet address (thus letting the original actually appear
* on the wire). However, we don't do that here for security
* reasons and compatibility with the original behavior.
*/
if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
((t = pf_find_mtag(m)) == NULL || !t->routed)) {
struct mbuf *n;
/*
* Because if_simloop() modifies the packet, we need a
 * writable copy through m_dup() instead of a read-only
 * one as m_copy[m] would give us.  The alternative would
 * be to modify if_simloop() to handle the read-only mbuf,
 * but performance-wise it is mostly equivalent (trading
 * extra data copying vs. extra locking).
*
* XXX This is a local workaround. A number of less
* often used kernel parts suffer from the same bug.
* See PR kern/105943 for a proposed general solution.
*/
if ((n = m_dup(m, M_NOWAIT)) != NULL) {
update_mbuf_csumflags(m, n);
(void)if_simloop(ifp, n, dst->sa_family, hlen);
} else
if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
}
/*
* Bridges require special output handling.
*/
if (ifp->if_bridge) {
BRIDGE_OUTPUT(ifp, m, error);
return (error);
}
#if defined(INET) || defined(INET6)
if (ifp->if_carp &&
(error = (*carp_output_p)(ifp, m, dst)))
goto bad;
#endif
/* Handle ng_ether(4) processing, if any */
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_output_p != NULL,
("ng_ether_output_p is NULL"));
if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
bad: if (m != NULL)
m_freem(m);
return (error);
}
if (m == NULL)
return (0);
}
/* Continue with link-layer output */
return ether_output_frame(ifp, m);
}
/*
* Ethernet link layer output routine to send a raw frame to the device.
*
* This assumes that the 14 byte Ethernet header is present and contiguous
* in the first mbuf (if BRIDGE'ing).
*/
int
ether_output_frame(struct ifnet *ifp, struct mbuf *m)
{
int i;
if (PFIL_HOOKED(&V_link_pfil_hook)) {
i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL);
if (i != 0)
return (EACCES);
if (m == NULL)
return (0);
}
/*
* Queue message on interface, update output statistics if
* successful, and start output if interface not yet active.
*/
return ((ifp->if_transmit)(ifp, m));
}
/*
* Process a received Ethernet packet; the packet is in the
* mbuf chain m with the ethernet header at the front.
*/
static void
ether_input_internal(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
u_short etype;
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
return;
}
#ifdef DIAGNOSTIC
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
m_freem(m);
return;
}
#endif
if (m->m_len < ETHER_HDR_LEN) {
/* XXX maybe should pullup? */
if_printf(ifp, "discard frame w/o leading ethernet "
"header (len %u pkt len %u)\n",
m->m_len, m->m_pkthdr.len);
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
m_freem(m);
return;
}
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER);
CURVNET_SET_QUIET(ifp->if_vnet);
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
}
#ifdef MAC
/*
* Tag the mbuf with an appropriate MAC label before any other
* consumers can get to it.
*/
mac_ifnet_create_mbuf(ifp, m);
#endif
/*
* Give bpf a chance at the packet.
*/
ETHER_BPF_MTAP(ifp, m);
/*
* If the CRC is still on the packet, trim it off. We do this once
* and once only in case we are re-entered. Nothing else on the
* Ethernet receive path expects to see the FCS.
*/
if (m->m_flags & M_HASFCS) {
m_adj(m, -ETHER_CRC_LEN);
m->m_flags &= ~M_HASFCS;
}
if (!(ifp->if_capenable & IFCAP_HWSTATS))
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
/* Allow monitor mode to claim this frame, after stats are updated. */
if (ifp->if_flags & IFF_MONITOR) {
m_freem(m);
CURVNET_RESTORE();
return;
}
/* Handle input from a lagg(4) port */
if (ifp->if_type == IFT_IEEE8023ADLAG) {
KASSERT(lagg_input_p != NULL,
("%s: if_lagg not loaded!", __func__));
m = (*lagg_input_p)(ifp, m);
if (m != NULL)
ifp = m->m_pkthdr.rcvif;
else {
CURVNET_RESTORE();
return;
}
}
/*
* If the hardware did not process an 802.1Q tag, do this now,
* to allow 802.1P priority frames to be passed to the main input
* path correctly.
* TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
*/
if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
struct ether_vlan_header *evl;
if (m->m_len < sizeof(*evl) &&
(m = m_pullup(m, sizeof(*evl))) == NULL) {
#ifdef DIAGNOSTIC
if_printf(ifp, "cannot pullup VLAN header\n");
#endif
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
CURVNET_RESTORE();
return;
}
evl = mtod(m, struct ether_vlan_header *);
m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
m->m_flags |= M_VLANTAG;
bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
ETHER_HDR_LEN - ETHER_TYPE_LEN);
m_adj(m, ETHER_VLAN_ENCAP_LEN);
eh = mtod(m, struct ether_header *);
}
M_SETFIB(m, ifp->if_fib);
/* Allow ng_ether(4) to claim this frame. */
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_input_p != NULL,
("%s: ng_ether_input_p is NULL", __func__));
m->m_flags &= ~M_PROMISC;
(*ng_ether_input_p)(ifp, &m);
if (m == NULL) {
CURVNET_RESTORE();
return;
}
eh = mtod(m, struct ether_header *);
}
/*
* Allow if_bridge(4) to claim this frame.
* The BRIDGE_INPUT() macro will update ifp if the bridge changed it
* and the frame should be delivered locally.
*/
if (ifp->if_bridge != NULL) {
m->m_flags &= ~M_PROMISC;
BRIDGE_INPUT(ifp, m);
if (m == NULL) {
CURVNET_RESTORE();
return;
}
eh = mtod(m, struct ether_header *);
}
#if defined(INET) || defined(INET6)
/*
* Clear M_PROMISC on frame so that carp(4) will see it when the
* mbuf flows up to Layer 3.
* FreeBSD's implementation of carp(4) uses the inprotosw
* to dispatch IPPROTO_CARP. carp(4) also allocates its own
* Ethernet addresses of the form 00:00:5e:00:01:xx, which
* is outside the scope of the M_PROMISC test below.
* TODO: Maintain a hash table of ethernet addresses other than
* ether_dhost which may be active on this ifp.
*/
if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
m->m_flags &= ~M_PROMISC;
} else
#endif
{
/*
* If the frame received was not for our MAC address, set the
* M_PROMISC flag on the mbuf chain. The frame may need to
* be seen by the rest of the Ethernet input path in case of
* re-entry (e.g. bridge, vlan, netgraph) but should not be
* seen by upper protocol layers.
*/
if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
m->m_flags |= M_PROMISC;
}
ether_demux(ifp, m);
CURVNET_RESTORE();
}
/*
* Ethernet input dispatch; by default, direct dispatch here regardless of
* global configuration. However, if RSS is enabled, hook up RSS affinity
* so that when deferred or hybrid dispatch is enabled, we can redistribute
* load based on RSS.
*
* XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
* not it had already done work distribution via multi-queue. Then we could
* direct dispatch in the event load balancing was already complete and
* handle the case of interfaces with different capabilities better.
*
* XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
* at multiple layers?
*
* XXXRW: For now, enable all this only if RSS is compiled in, although it
* works fine without RSS. Need to characterise the performance overhead
* of the detour through the netisr code in the event the result is always
* direct dispatch.
*/
static void
ether_nh_input(struct mbuf *m)
{
M_ASSERTPKTHDR(m);
KASSERT(m->m_pkthdr.rcvif != NULL,
("%s: NULL interface pointer", __func__));
ether_input_internal(m->m_pkthdr.rcvif, m);
}
static struct netisr_handler ether_nh = {
.nh_name = "ether",
.nh_handler = ether_nh_input,
.nh_proto = NETISR_ETHER,
#ifdef RSS
.nh_policy = NETISR_POLICY_CPU,
.nh_dispatch = NETISR_DISPATCH_DIRECT,
.nh_m2cpuid = rss_m2cpuid,
#else
.nh_policy = NETISR_POLICY_SOURCE,
.nh_dispatch = NETISR_DISPATCH_DIRECT,
#endif
};
static void
ether_init(__unused void *arg)
{
netisr_register(&ether_nh);
}
SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
static void
vnet_ether_init(__unused void *arg)
{
int i;
/* Initialize packet filter hooks. */
V_link_pfil_hook.ph_type = PFIL_TYPE_AF;
V_link_pfil_hook.ph_af = AF_LINK;
if ((i = pfil_head_register(&V_link_pfil_hook)) != 0)
printf("%s: WARNING: unable to register pfil link hook, "
"error %d\n", __func__, i);
#ifdef VIMAGE
netisr_register_vnet(&ether_nh);
#endif
}
VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_ether_init, NULL);
#ifdef VIMAGE
static void
vnet_ether_pfil_destroy(__unused void *arg)
{
int i;
if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0)
printf("%s: WARNING: unable to unregister pfil link hook, "
"error %d\n", __func__, i);
}
VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
vnet_ether_pfil_destroy, NULL);
static void
vnet_ether_destroy(__unused void *arg)
{
netisr_unregister_vnet(&ether_nh);
}
VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_ether_destroy, NULL);
#endif
static void
ether_input(struct ifnet *ifp, struct mbuf *m)
{
struct mbuf *mn;
/*
* The drivers are allowed to pass in a chain of packets linked with
* m_nextpkt. We split them up into separate packets here and pass
* them up. This allows the drivers to amortize the receive lock.
*/
while (m) {
mn = m->m_nextpkt;
m->m_nextpkt = NULL;
/*
* We will rely on rcvif being set properly in the deferred context,
* so assert it is correct here.
*/
KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
"rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
CURVNET_SET_QUIET(ifp->if_vnet);
netisr_dispatch(NETISR_ETHER, m);
CURVNET_RESTORE();
m = mn;
}
}
/*
* Upper layer processing for a received Ethernet packet.
*/
void
ether_demux(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
int i, isr;
u_short ether_type;
KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
/* Do not grab PROMISC frames in case we are re-entered. */
if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) {
i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL);
if (i != 0 || m == NULL)
return;
}
eh = mtod(m, struct ether_header *);
ether_type = ntohs(eh->ether_type);
/*
* If this frame has a VLAN tag other than 0, call vlan_input()
* if its module is loaded. Otherwise, drop.
*/
if ((m->m_flags & M_VLANTAG) &&
EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
if (ifp->if_vlantrunk == NULL) {
if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
m_freem(m);
return;
}
KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
__func__));
/* Clear before possibly re-entering ether_input(). */
m->m_flags &= ~M_PROMISC;
(*vlan_input_p)(ifp, m);
return;
}
/*
* Pass promiscuously received frames to the upper layer if the user
* requested this by setting IFF_PPROMISC. Otherwise, drop them.
*/
if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
m_freem(m);
return;
}
/*
* Reset layer specific mbuf flags to avoid confusing upper layers.
* Strip off Ethernet header.
*/
m->m_flags &= ~M_VLANTAG;
m_clrprotoflags(m);
m_adj(m, ETHER_HDR_LEN);
/*
* Dispatch frame to upper layer.
*/
switch (ether_type) {
#ifdef INET
case ETHERTYPE_IP:
isr = NETISR_IP;
break;
case ETHERTYPE_ARP:
if (ifp->if_flags & IFF_NOARP) {
/* Discard packet if ARP is disabled on interface */
m_freem(m);
return;
}
isr = NETISR_ARP;
break;
#endif
#ifdef INET6
case ETHERTYPE_IPV6:
isr = NETISR_IPV6;
break;
#endif
default:
goto discard;
}
netisr_dispatch(isr, m);
return;
discard:
/*
* Packet is to be discarded. If netgraph is present,
* hand the packet to it for last chance processing;
* otherwise dispose of it.
*/
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_input_orphan_p != NULL,
("ng_ether_input_orphan_p is NULL"));
/*
* Put back the ethernet header so netgraph has a
* consistent view of inbound packets.
*/
M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
(*ng_ether_input_orphan_p)(ifp, m);
return;
}
m_freem(m);
}
/*
* Convert Ethernet address to printable (loggable) representation.
* This routine is for compatibility; it's better to just use
*
* printf("%6D", <pointer to address>, ":");
*
* since there's no static buffer involved.
*/
char *
ether_sprintf(const u_char *ap)
{
static char etherbuf[18];
snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
return (etherbuf);
}
/*
* Perform common duties while attaching to interface list
*/
void
ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
{
int i;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = ETHER_ADDR_LEN;
ifp->if_hdrlen = ETHER_HDR_LEN;
if_attach(ifp);
ifp->if_mtu = ETHERMTU;
ifp->if_output = ether_output;
ifp->if_input = ether_input;
ifp->if_resolvemulti = ether_resolvemulti;
ifp->if_requestencap = ether_requestencap;
#ifdef VIMAGE
ifp->if_reassign = ether_reassign;
#endif
if (ifp->if_baudrate == 0)
ifp->if_baudrate = IF_Mbps(10); /* just a default */
ifp->if_broadcastaddr = etherbroadcastaddr;
ifa = ifp->if_addr;
KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_type = IFT_ETHER;
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
if (ifp->if_hw_addr != NULL)
bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen);
bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
if (ng_ether_attach_p != NULL)
(*ng_ether_attach_p)(ifp);
/* Announce Ethernet MAC address if non-zero. */
for (i = 0; i < ifp->if_addrlen; i++)
if (lla[i] != 0)
break;
if (i != ifp->if_addrlen)
if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
uuid_ether_add(LLADDR(sdl));
/* All necessary bits are set up; announce it now. */
EVENTHANDLER_INVOKE(ether_ifattach_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL);
}
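/*
 * Illustrative sketch (not part of the original change): a typical NIC
 * driver attach routine sets up its ifnet and then hands the link-level
 * address to ether_ifattach().  The "foo_*" callbacks and the "sc" softc
 * layout are hypothetical.
 */
#if 0
	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = foo_init;
	ifp->if_ioctl = foo_ioctl;
	ifp->if_transmit = foo_transmit;
	ifp->if_qflush = foo_qflush;
	ether_ifattach(ifp, sc->macaddr);
#endif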
/*
* Perform common duties while detaching an Ethernet interface
*/
void
ether_ifdetach(struct ifnet *ifp)
{
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
uuid_ether_del(LLADDR(sdl));
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_detach_p != NULL,
("ng_ether_detach_p is NULL"));
(*ng_ether_detach_p)(ifp);
}
bpfdetach(ifp);
if_detach(ifp);
}
#ifdef VIMAGE
void
ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
{
if (ifp->if_l2com != NULL) {
KASSERT(ng_ether_detach_p != NULL,
("ng_ether_detach_p is NULL"));
(*ng_ether_detach_p)(ifp);
}
if (ng_ether_attach_p != NULL) {
CURVNET_SET_QUIET(new_vnet);
(*ng_ether_attach_p)(ifp);
CURVNET_RESTORE();
}
}
#endif
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
#if 0
/*
* This is for reference. We have a table-driven version
* of the little-endian crc32 generator, which is faster
* than the double-loop.
*/
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = (crc ^ data) & 1;
crc >>= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_LE);
}
}
return (crc);
}
#else
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
static const uint32_t crctab[] = {
0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
};
size_t i;
uint32_t crc;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
crc ^= buf[i];
crc = (crc >> 4) ^ crctab[crc & 0xf];
crc = (crc >> 4) ^ crctab[crc & 0xf];
}
return (crc);
}
#endif
uint32_t
ether_crc32_be(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
crc <<= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
}
}
return (crc);
}
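/*
 * Illustrative sketch (not part of the original change): NIC drivers
 * commonly use the big-endian CRC to pick a bit in their multicast hash
 * filter.  The 6-bit index and the "hash" array layout shown here are
 * typical but hardware specific; "maddr" is a 6-byte MAC address.
 */
#if 0
	uint32_t hash[2] = { 0, 0 };	/* 64-bit multicast filter */
	uint32_t crc;
	int bit;

	crc = ether_crc32_be(maddr, ETHER_ADDR_LEN);
	bit = crc >> 26;		/* top 6 bits: 0..63 */
	hash[bit >> 5] |= 1 << (bit & 0x1f);
#endif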
int
ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct ifaddr *ifa = (struct ifaddr *) data;
struct ifreq *ifr = (struct ifreq *) data;
int error = 0;
switch (command) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
ifp->if_init(ifp->if_softc); /* before arpwhohas */
arp_ifinit(ifp, ifa);
break;
#endif
default:
ifp->if_init(ifp->if_softc);
break;
}
break;
case SIOCGIFADDR:
{
struct sockaddr *sa;
sa = (struct sockaddr *) & ifr->ifr_data;
bcopy(IF_LLADDR(ifp),
(caddr_t) sa->sa_data, ETHER_ADDR_LEN);
}
break;
case SIOCSIFMTU:
/*
* Set the interface MTU.
*/
if (ifr->ifr_mtu > ETHERMTU) {
error = EINVAL;
} else {
ifp->if_mtu = ifr->ifr_mtu;
}
break;
default:
error = EINVAL; /* XXX netbsd has ENOTTY??? */
break;
}
return (error);
}
static int
ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
struct sockaddr *sa)
{
struct sockaddr_dl *sdl;
#ifdef INET
struct sockaddr_in *sin;
#endif
#ifdef INET6
struct sockaddr_in6 *sin6;
#endif
u_char *e_addr;
switch(sa->sa_family) {
case AF_LINK:
/*
* No mapping needed. Just check that it's a valid MC address.
*/
sdl = (struct sockaddr_dl *)sa;
e_addr = LLADDR(sdl);
if (!ETHER_IS_MULTICAST(e_addr))
return EADDRNOTAVAIL;
*llsa = NULL;
return 0;
#ifdef INET
case AF_INET:
sin = (struct sockaddr_in *)sa;
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
return EADDRNOTAVAIL;
sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
#ifdef INET6
case AF_INET6:
sin6 = (struct sockaddr_in6 *)sa;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* An IP6 address of 0 means listen to all
 * of the Ethernet multicast addresses used for IP6.
* (This is used for multicast routers.)
*/
ifp->if_flags |= IFF_ALLMULTI;
*llsa = NULL;
return 0;
}
if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
return EADDRNOTAVAIL;
sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
default:
/*
* Well, the text isn't quite right, but it's the name
* that counts...
*/
return EAFNOSUPPORT;
}
}
static moduledata_t ether_mod = {
.name = "ether",
};
void
ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
{
struct ether_vlan_header vlan;
struct mbuf mv, mb;
KASSERT((m->m_flags & M_VLANTAG) != 0,
("%s: vlan information not present", __func__));
KASSERT(m->m_len >= sizeof(struct ether_header),
("%s: mbuf not large enough for header", __func__));
bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
vlan.evl_proto = vlan.evl_encap_proto;
vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
m->m_len -= sizeof(struct ether_header);
m->m_data += sizeof(struct ether_header);
/*
* If a data link has been supplied by the caller, then we will need to
* re-create a stack allocated mbuf chain with the following structure:
*
* (1) mbuf #1 will contain the supplied data link
* (2) mbuf #2 will contain the vlan header
* (3) mbuf #3 will contain the original mbuf's packet data
*
* Otherwise, submit the packet and vlan header via bpf_mtap2().
*/
if (data != NULL) {
mv.m_next = m;
mv.m_data = (caddr_t)&vlan;
mv.m_len = sizeof(vlan);
mb.m_next = &mv;
mb.m_data = data;
mb.m_len = dlen;
bpf_mtap(bp, &mb);
} else
bpf_mtap2(bp, &vlan, sizeof(vlan), m);
m->m_len += sizeof(struct ether_header);
m->m_data -= sizeof(struct ether_header);
}
struct mbuf *
ether_vlanencap(struct mbuf *m, uint16_t tag)
{
struct ether_vlan_header *evl;
M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
if (m == NULL)
return (NULL);
/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
if (m->m_len < sizeof(*evl)) {
m = m_pullup(m, sizeof(*evl));
if (m == NULL)
return (NULL);
}
/*
* Transform the Ethernet header into an Ethernet header
* with 802.1Q encapsulation.
*/
evl = mtod(m, struct ether_vlan_header *);
bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
(char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
evl->evl_tag = htons(tag);
return (m);
}
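/*
 * Illustrative sketch (not part of the original change): a driver whose
 * hardware cannot insert 802.1Q tags typically falls back to
 * ether_vlanencap() in its transmit path.
 */
#if 0
	if ((m->m_flags & M_VLANTAG) != 0 &&
	    (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
		if (m == NULL)
			return (ENOBUFS);
		m->m_flags &= ~M_VLANTAG;
	}
#endif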
DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(ether, 1);
Index: head/sys/net/if_gif.c
===================================================================
--- head/sys/net/if_gif.c (revision 327172)
+++ head/sys/net/if_gif.c (revision 327173)
@@ -1,1066 +1,1064 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/conf.h>
#include <machine/cpu.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#endif /* INET */
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_ecn.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <netinet/ip_encap.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_gif.h>
#include <security/mac/mac_framework.h>
static const char gifname[] = "gif";
/*
* gif_mtx protects a per-vnet gif_softc_list.
*/
static VNET_DEFINE(struct mtx, gif_mtx);
#define V_gif_mtx VNET(gif_mtx)
static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface");
static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list);
#define V_gif_softc_list VNET(gif_softc_list)
static struct sx gif_ioctl_sx;
SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl");
#define GIF_LIST_LOCK_INIT(x) mtx_init(&V_gif_mtx, "gif_mtx", \
NULL, MTX_DEF)
#define GIF_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gif_mtx)
#define GIF_LIST_LOCK(x) mtx_lock(&V_gif_mtx)
#define GIF_LIST_UNLOCK(x) mtx_unlock(&V_gif_mtx)
void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af);
void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af);
void (*ng_gif_attach_p)(struct ifnet *ifp);
void (*ng_gif_detach_p)(struct ifnet *ifp);
static int gif_check_nesting(struct ifnet *, struct mbuf *);
static int gif_set_tunnel(struct ifnet *, struct sockaddr *,
struct sockaddr *);
static void gif_delete_tunnel(struct ifnet *);
static int gif_ioctl(struct ifnet *, u_long, caddr_t);
static int gif_transmit(struct ifnet *, struct mbuf *);
static void gif_qflush(struct ifnet *);
static int gif_clone_create(struct if_clone *, int, caddr_t);
static void gif_clone_destroy(struct ifnet *);
static VNET_DEFINE(struct if_clone *, gif_cloner);
#define V_gif_cloner VNET(gif_cloner)
static int gifmodevent(module_t, int, void *);
SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0,
"Generic Tunnel Interface");
#ifndef MAX_GIF_NEST
/*
* This macro controls the default upper limit on nesting of gif tunnels.
* Since setting a large value here with a careless configuration may
* crash the system, we don't allow any nesting by default.
* If you need to configure nested gif tunnels, you can define this macro
* in your kernel configuration file. However, if you do so, please be
* careful to configure the tunnels so that they don't form a loop.
*/
#define MAX_GIF_NEST 1
#endif
static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST;
#define V_max_gif_nesting VNET(max_gif_nesting)
SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(max_gif_nesting), 0, "Max nested tunnels");
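/*
* For example, because the limit above is a CTLFLAG_RW sysctl (exposed as
* net.link.gif.max_nesting), nesting can be raised at run time:
*	# sysctl net.link.gif.max_nesting=2
* which permits one gif tunnel to be carried inside another.
*/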
/*
* By default, we disallow creation of multiple tunnels between the same
* pair of addresses. Some applications require this functionality, so
* we allow control over this check here.
*/
#ifdef XBONEHACK
static VNET_DEFINE(int, parallel_tunnels) = 1;
#else
static VNET_DEFINE(int, parallel_tunnels) = 0;
#endif
#define V_parallel_tunnels VNET(parallel_tunnels)
SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0,
"Allow parallel tunnels?");
static int
gif_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct gif_softc *sc;
sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
sc->gif_fibnum = curthread->td_proc->p_fibnum;
GIF2IFP(sc) = if_alloc(IFT_GIF);
GIF_LOCK_INIT(sc);
GIF2IFP(sc)->if_softc = sc;
if_initname(GIF2IFP(sc), gifname, unit);
GIF2IFP(sc)->if_addrlen = 0;
GIF2IFP(sc)->if_mtu = GIF_MTU;
GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
#if 0
/* turn off ingress filter */
GIF2IFP(sc)->if_flags |= IFF_LINK2;
#endif
GIF2IFP(sc)->if_ioctl = gif_ioctl;
GIF2IFP(sc)->if_transmit = gif_transmit;
GIF2IFP(sc)->if_qflush = gif_qflush;
GIF2IFP(sc)->if_output = gif_output;
GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE;
GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE;
if_attach(GIF2IFP(sc));
bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t));
if (ng_gif_attach_p != NULL)
(*ng_gif_attach_p)(GIF2IFP(sc));
GIF_LIST_LOCK();
LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
GIF_LIST_UNLOCK();
return (0);
}
static void
gif_clone_destroy(struct ifnet *ifp)
{
struct gif_softc *sc;
sx_xlock(&gif_ioctl_sx);
sc = ifp->if_softc;
gif_delete_tunnel(ifp);
GIF_LIST_LOCK();
LIST_REMOVE(sc, gif_list);
GIF_LIST_UNLOCK();
if (ng_gif_detach_p != NULL)
(*ng_gif_detach_p)(ifp);
bpfdetach(ifp);
if_detach(ifp);
ifp->if_softc = NULL;
sx_xunlock(&gif_ioctl_sx);
if_free(ifp);
GIF_LOCK_DESTROY(sc);
free(sc, M_GIF);
}
static void
vnet_gif_init(const void *unused __unused)
{
LIST_INIT(&V_gif_softc_list);
GIF_LIST_LOCK_INIT();
V_gif_cloner = if_clone_simple(gifname, gif_clone_create,
gif_clone_destroy, 0);
}
VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
vnet_gif_init, NULL);
static void
vnet_gif_uninit(const void *unused __unused)
{
if_clone_detach(V_gif_cloner);
GIF_LIST_LOCK_DESTROY();
}
VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
vnet_gif_uninit, NULL);
static int
gifmodevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
case MOD_UNLOAD:
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t gif_mod = {
"if_gif",
gifmodevent,
0
};
DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_gif, 1);
int
gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
GIF_RLOCK_TRACKER;
const struct ip *ip;
struct gif_softc *sc;
int ret;
sc = (struct gif_softc *)arg;
if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0)
return (0);
ret = 0;
GIF_RLOCK(sc);
/* no physical address */
if (sc->gif_family == 0)
goto done;
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
#endif
#ifdef INET6
case IPPROTO_IPV6:
#endif
case IPPROTO_ETHERIP:
break;
default:
goto done;
}
/* Bail on short packets */
M_ASSERTPKTHDR(m);
if (m->m_pkthdr.len < sizeof(struct ip))
goto done;
ip = mtod(m, const struct ip *);
switch (ip->ip_v) {
#ifdef INET
case 4:
if (sc->gif_family != AF_INET)
goto done;
ret = in_gif_encapcheck(m, off, proto, arg);
break;
#endif
#ifdef INET6
case 6:
if (m->m_pkthdr.len < sizeof(struct ip6_hdr))
goto done;
if (sc->gif_family != AF_INET6)
goto done;
ret = in6_gif_encapcheck(m, off, proto, arg);
break;
#endif
}
done:
GIF_RUNLOCK(sc);
return (ret);
}
static int
gif_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct gif_softc *sc;
struct etherip_header *eth;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
uint32_t t;
#endif
uint32_t af;
uint8_t proto, ecn;
int error;
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error) {
m_freem(m);
goto err;
}
#endif
error = ENETDOWN;
sc = ifp->if_softc;
if ((ifp->if_flags & IFF_MONITOR) != 0 ||
(ifp->if_flags & IFF_UP) == 0 ||
sc->gif_family == 0 ||
(error = gif_check_nesting(ifp, m)) != 0) {
m_freem(m);
goto err;
}
/* Now pull back the af that we stashed in the csum_data. */
if (ifp->if_bridge)
af = AF_LINK;
else
af = m->m_pkthdr.csum_data;
m->m_flags &= ~(M_BCAST|M_MCAST);
M_SETFIB(m, sc->gif_fibnum);
BPF_MTAP2(ifp, &af, sizeof(af), m);
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
/* inner AF-specific encapsulation */
ecn = 0;
switch (af) {
#ifdef INET
case AF_INET:
proto = IPPROTO_IPV4;
if (m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL) {
error = ENOBUFS;
goto err;
}
ip = mtod(m, struct ip *);
ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &ecn, &ip->ip_tos);
break;
#endif
#ifdef INET6
case AF_INET6:
proto = IPPROTO_IPV6;
if (m->m_len < sizeof(struct ip6_hdr))
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL) {
error = ENOBUFS;
goto err;
}
t = 0;
ip6 = mtod(m, struct ip6_hdr *);
ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &t, &ip6->ip6_flow);
ecn = (ntohl(t) >> 20) & 0xff;
break;
#endif
case AF_LINK:
proto = IPPROTO_ETHERIP;
M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto err;
}
eth = mtod(m, struct etherip_header *);
eth->eip_resvh = 0;
eth->eip_ver = ETHERIP_VERSION;
eth->eip_resvl = 0;
break;
default:
error = EAFNOSUPPORT;
m_freem(m);
goto err;
}
/* XXX should we check if our outer source is legal? */
/* dispatch to output logic based on outer AF */
switch (sc->gif_family) {
#ifdef INET
case AF_INET:
error = in_gif_output(ifp, m, proto, ecn);
break;
#endif
#ifdef INET6
case AF_INET6:
error = in6_gif_output(ifp, m, proto, ecn);
break;
#endif
default:
m_freem(m);
}
err:
if (error)
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (error);
}
static void
gif_qflush(struct ifnet *ifp __unused)
{
}
#define MTAG_GIF 1080679712
static int
gif_check_nesting(struct ifnet *ifp, struct mbuf *m)
{
struct m_tag *mtag;
int count;
/*
* gif may cause infinite recursion when misconfigured.
* We'll prevent this by detecting loops.
*
* A high nesting level may cause stack exhaustion.
* We'll prevent this by enforcing an upper limit.
*/
count = 1;
mtag = NULL;
while ((mtag = m_tag_locate(m, MTAG_GIF, 0, mtag)) != NULL) {
if (*(struct ifnet **)(mtag + 1) == ifp) {
log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
return (EIO);
}
count++;
}
if (count > V_max_gif_nesting) {
log(LOG_NOTICE,
"%s: if_output recursively called too many times(%d)\n",
if_name(ifp), count);
return (EIO);
}
mtag = m_tag_alloc(MTAG_GIF, 0, sizeof(struct ifnet *), M_NOWAIT);
if (mtag == NULL)
return (ENOMEM);
*(struct ifnet **)(mtag + 1) = ifp;
m_tag_prepend(m, mtag);
return (0);
}
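/*
* Note on the tag layout used above: the payload of each MTAG_GIF tag is a
* single struct ifnet pointer stored immediately after the m_tag header,
* hence the *(struct ifnet **)(mtag + 1) accesses. Every gif interface a
* packet traverses prepends one tag, so the number of tags found equals
* the nesting depth seen so far.
*/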
int
gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
struct route *ro)
{
uint32_t af;
if (dst->sa_family == AF_UNSPEC)
bcopy(dst->sa_data, &af, sizeof(af));
else
af = dst->sa_family;
/*
* Now save the af in the inbound pkt csum data, this is a cheat since
* we are using the inbound csum_data field to carry the af over to
* the gif_transmit() routine, avoiding yet another mtag.
*/
m->m_pkthdr.csum_data = af;
return (ifp->if_transmit(ifp, m));
}
void
gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
{
struct etherip_header *eip;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
uint32_t t;
#endif
- struct gif_softc *sc;
struct ether_header *eh;
struct ifnet *oldifp;
int isr, n, af;
if (ifp == NULL) {
/* just in case */
m_freem(m);
return;
}
- sc = ifp->if_softc;
m->m_pkthdr.rcvif = ifp;
m_clrprotoflags(m);
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
af = AF_INET;
if (m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL)
goto drop;
ip = mtod(m, struct ip *);
if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &ecn, &ip->ip_tos) == 0) {
m_freem(m);
goto drop;
}
break;
#endif
#ifdef INET6
case IPPROTO_IPV6:
af = AF_INET6;
if (m->m_len < sizeof(struct ip6_hdr))
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL)
goto drop;
t = htonl((uint32_t)ecn << 20);
ip6 = mtod(m, struct ip6_hdr *);
if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
ECN_NOCARE, &t, &ip6->ip6_flow) == 0) {
m_freem(m);
goto drop;
}
break;
#endif
case IPPROTO_ETHERIP:
af = AF_LINK;
break;
default:
m_freem(m);
goto drop;
}
#ifdef MAC
mac_ifnet_create_mbuf(ifp, m);
#endif
if (bpf_peers_present(ifp->if_bpf)) {
uint32_t af1 = af;
bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
}
if ((ifp->if_flags & IFF_MONITOR) != 0) {
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
m_freem(m);
return;
}
if (ng_gif_input_p != NULL) {
(*ng_gif_input_p)(ifp, &m, af);
if (m == NULL)
goto drop;
}
/*
* Put the packet to the network layer input queue according to the
* specified address family.
* Note: older versions of gif_input directly called network layer
* input functions, e.g. ip6_input, here. We changed the policy to
* prevent deeply recursive calls of such input functions, which
* might cause a kernel panic. The change may introduce another
* problem: if the input queue is full, packets are discarded.
* Kernel stack overflows did happen in practice, while we believe
* a full queue rarely occurs, so we changed the policy.
*/
switch (af) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
case AF_LINK:
n = sizeof(struct etherip_header) + sizeof(struct ether_header);
if (n > m->m_len)
m = m_pullup(m, n);
if (m == NULL)
goto drop;
eip = mtod(m, struct etherip_header *);
if (eip->eip_ver != ETHERIP_VERSION) {
/* discard unknown versions */
m_freem(m);
goto drop;
}
m_adj(m, sizeof(struct etherip_header));
m->m_flags &= ~(M_BCAST|M_MCAST);
m->m_pkthdr.rcvif = ifp;
if (ifp->if_bridge) {
oldifp = ifp;
eh = mtod(m, struct ether_header *);
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
}
BRIDGE_INPUT(ifp, m);
if (m != NULL && ifp != oldifp) {
/*
* The bridge gave us back itself or one of the
* members for which the frame is addressed.
*/
ether_demux(ifp, m);
return;
}
}
if (m != NULL)
m_freem(m);
return;
default:
if (ng_gif_input_orphan_p != NULL)
(*ng_gif_input_orphan_p)(ifp, m, af);
else
m_freem(m);
return;
}
if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
M_SETFIB(m, ifp->if_fib);
netisr_dispatch(isr, m);
return;
drop:
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
}
/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
int
gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
GIF_RLOCK_TRACKER;
struct ifreq *ifr = (struct ifreq *)data;
struct sockaddr *dst, *src;
struct gif_softc *sc;
#ifdef INET
struct sockaddr_in *sin = NULL;
#endif
#ifdef INET6
struct sockaddr_in6 *sin6 = NULL;
#endif
u_int options;
int error;
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCGIFMTU:
case SIOCSIFFLAGS:
return (0);
case SIOCSIFMTU:
if (ifr->ifr_mtu < GIF_MTU_MIN ||
ifr->ifr_mtu > GIF_MTU_MAX)
return (EINVAL);
else
ifp->if_mtu = ifr->ifr_mtu;
return (0);
}
sx_xlock(&gif_ioctl_sx);
sc = ifp->if_softc;
if (sc == NULL) {
error = ENXIO;
goto bad;
}
error = 0;
switch (cmd) {
case SIOCSIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
error = EINVAL;
switch (cmd) {
#ifdef INET
case SIOCSIFPHYADDR:
src = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_dstaddr);
break;
#endif
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
src = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_dstaddr);
break;
#endif
default:
goto bad;
}
/* sa_family must be equal */
if (src->sa_family != dst->sa_family ||
src->sa_len != dst->sa_len)
goto bad;
/* validate sa_len */
/* check sa_family looks sane for the cmd */
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
goto bad;
if (cmd != SIOCSIFPHYADDR) {
error = EAFNOSUPPORT;
goto bad;
}
if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
satosin(dst)->sin_addr.s_addr == INADDR_ANY) {
error = EADDRNOTAVAIL;
goto bad;
}
break;
#endif
#ifdef INET6
case AF_INET6:
if (src->sa_len != sizeof(struct sockaddr_in6))
goto bad;
if (cmd != SIOCSIFPHYADDR_IN6) {
error = EAFNOSUPPORT;
goto bad;
}
error = EADDRNOTAVAIL;
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)
||
IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr))
goto bad;
/*
* Check validity of the scope zone ID of the
* addresses, and convert it into the kernel
* internal form if necessary.
*/
error = sa6_embedscope(satosin6(src), 0);
if (error != 0)
goto bad;
error = sa6_embedscope(satosin6(dst), 0);
if (error != 0)
goto bad;
break;
#endif
default:
error = EAFNOSUPPORT;
goto bad;
}
error = gif_set_tunnel(ifp, src, dst);
break;
case SIOCDIFPHYADDR:
gif_delete_tunnel(ifp);
break;
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
#endif
if (sc->gif_family == 0) {
error = EADDRNOTAVAIL;
break;
}
GIF_RLOCK(sc);
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
if (sc->gif_family != AF_INET) {
error = EADDRNOTAVAIL;
break;
}
sin = (struct sockaddr_in *)&ifr->ifr_addr;
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
if (sc->gif_family != AF_INET6) {
error = EADDRNOTAVAIL;
break;
}
sin6 = (struct sockaddr_in6 *)
&(((struct in6_ifreq *)data)->ifr_addr);
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
break;
#endif
default:
error = EAFNOSUPPORT;
}
if (error == 0) {
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
sin->sin_addr = sc->gif_iphdr->ip_src;
break;
case SIOCGIFPDSTADDR:
sin->sin_addr = sc->gif_iphdr->ip_dst;
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
sin6->sin6_addr = sc->gif_ip6hdr->ip6_src;
break;
case SIOCGIFPDSTADDR_IN6:
sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst;
break;
#endif
}
}
GIF_RUNLOCK(sc);
if (error != 0)
break;
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
error = prison_if(curthread->td_ucred,
(struct sockaddr *)sin);
if (error != 0)
memset(sin, 0, sizeof(*sin));
break;
#endif
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
error = prison_if(curthread->td_ucred,
(struct sockaddr *)sin6);
if (error == 0)
error = sa6_recoverscope(sin6);
if (error != 0)
memset(sin6, 0, sizeof(*sin6));
#endif
}
break;
case SIOCGTUNFIB:
ifr->ifr_fib = sc->gif_fibnum;
break;
case SIOCSTUNFIB:
if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
break;
if (ifr->ifr_fib >= rt_numfibs)
error = EINVAL;
else
sc->gif_fibnum = ifr->ifr_fib;
break;
case GIFGOPTS:
options = sc->gif_options;
error = copyout(&options, ifr->ifr_data, sizeof(options));
break;
case GIFSOPTS:
if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
break;
error = copyin(ifr->ifr_data, &options, sizeof(options));
if (error)
break;
if (options & ~GIF_OPTMASK)
error = EINVAL;
else
sc->gif_options = options;
break;
default:
error = EINVAL;
break;
}
bad:
sx_xunlock(&gif_ioctl_sx);
return (error);
}
static void
gif_detach(struct gif_softc *sc)
{
sx_assert(&gif_ioctl_sx, SA_XLOCKED);
if (sc->gif_ecookie != NULL)
encap_detach(sc->gif_ecookie);
sc->gif_ecookie = NULL;
}
static int
gif_attach(struct gif_softc *sc, int af)
{
sx_assert(&gif_ioctl_sx, SA_XLOCKED);
switch (af) {
#ifdef INET
case AF_INET:
return (in_gif_attach(sc));
#endif
#ifdef INET6
case AF_INET6:
return (in6_gif_attach(sc));
#endif
}
return (EAFNOSUPPORT);
}
static int
gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
{
struct gif_softc *sc = ifp->if_softc;
struct gif_softc *tsc;
#ifdef INET
struct ip *ip;
#endif
#ifdef INET6
struct ip6_hdr *ip6;
#endif
void *hdr;
int error = 0;
if (sc == NULL)
return (ENXIO);
/* Disallow parallel tunnels unless instructed otherwise. */
if (V_parallel_tunnels == 0) {
GIF_LIST_LOCK();
LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) {
if (tsc == sc || tsc->gif_family != src->sa_family)
continue;
#ifdef INET
if (tsc->gif_family == AF_INET &&
tsc->gif_iphdr->ip_src.s_addr ==
satosin(src)->sin_addr.s_addr &&
tsc->gif_iphdr->ip_dst.s_addr ==
satosin(dst)->sin_addr.s_addr) {
error = EADDRNOTAVAIL;
GIF_LIST_UNLOCK();
goto bad;
}
#endif
#ifdef INET6
if (tsc->gif_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src,
&satosin6(src)->sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst,
&satosin6(dst)->sin6_addr)) {
error = EADDRNOTAVAIL;
GIF_LIST_UNLOCK();
goto bad;
}
#endif
}
GIF_LIST_UNLOCK();
}
switch (src->sa_family) {
#ifdef INET
case AF_INET:
hdr = ip = malloc(sizeof(struct ip), M_GIF,
M_WAITOK | M_ZERO);
ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr;
ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr;
break;
#endif
#ifdef INET6
case AF_INET6:
hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF,
M_WAITOK | M_ZERO);
ip6->ip6_src = satosin6(src)->sin6_addr;
ip6->ip6_dst = satosin6(dst)->sin6_addr;
ip6->ip6_vfc = IPV6_VERSION;
break;
#endif
default:
return (EAFNOSUPPORT);
}
if (sc->gif_family != src->sa_family)
gif_detach(sc);
if (sc->gif_family == 0 ||
sc->gif_family != src->sa_family)
error = gif_attach(sc, src->sa_family);
GIF_WLOCK(sc);
if (sc->gif_family != 0)
free(sc->gif_hdr, M_GIF);
sc->gif_family = src->sa_family;
sc->gif_hdr = hdr;
GIF_WUNLOCK(sc);
#if defined(INET) || defined(INET6)
bad:
#endif
if (error == 0 && sc->gif_family != 0) {
ifp->if_drv_flags |= IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_UP);
} else {
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_DOWN);
}
return (error);
}
static void
gif_delete_tunnel(struct ifnet *ifp)
{
struct gif_softc *sc = ifp->if_softc;
int family;
if (sc == NULL)
return;
GIF_WLOCK(sc);
family = sc->gif_family;
sc->gif_family = 0;
GIF_WUNLOCK(sc);
if (family != 0) {
gif_detach(sc);
free(sc->gif_hdr, M_GIF);
}
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
if_link_state_change(ifp, LINK_STATE_DOWN);
}
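/*
* Example configuration (illustrative; the interface and addresses are
* hypothetical). The ifconfig "tunnel" keyword issues the SIOCSIFPHYADDR
* ioctl handled above:
*	# ifconfig gif0 create
*	# ifconfig gif0 tunnel 192.0.2.1 198.51.100.1
*	# ifconfig gif0 inet 10.1.1.1 10.1.1.2
*/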
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c (revision 327172)
+++ head/sys/netinet/tcp_output.c (revision 327173)
@@ -1,1868 +1,1872 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#ifdef TCP_RFC7413
#include <netinet/tcp_fastopen.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#ifdef TCPPCAP
#include <netinet/tcp_pcap.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netipsec/ipsec_support.h>
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
VNET_DEFINE(int, path_mtu_discovery) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(path_mtu_discovery), 1,
"Enable Path MTU Discovery");
VNET_DEFINE(int, tcp_do_tso) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_tso), 0,
"Enable TCP Segmentation Offload");
VNET_DEFINE(int, tcp_sendspace) = 1024*32;
#define V_tcp_sendspace VNET(tcp_sendspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_autosndbuf), 0,
"Enable automatic send buffer sizing");
VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_inc), 0,
"Incrementor step size of automatic send buffer");
VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_max), 0,
"Max size of automatic send buffer");
VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0;
#define V_tcp_sendbuf_auto_lowat VNET(tcp_sendbuf_auto_lowat)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sendbuf_auto_lowat), 0,
"Modify threshold for auto send buffer growth to account for SO_SNDLOWAT");
/*
* Make sure that either retransmit or persist timer is set for SYN, FIN and
* non-ACK.
*/
#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \
KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\
tcp_timer_active((tp), TT_REXMT) || \
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))
#ifdef TCP_HHOOK
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
static void inline cc_after_idle(struct tcpcb *tp);
#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
static void inline
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
struct tcp_hhook_data hhook_data;
if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
hhook_data.tp = tp;
hhook_data.th = th;
hhook_data.to = to;
hhook_data.len = len;
hhook_data.tso = tso;
hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
tp->osd);
}
}
#endif
/*
* CC wrapper hook functions
*/
static void inline
cc_after_idle(struct tcpcb *tp)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp->ccv);
}
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
int32_t len;
uint32_t recwin, sendwin;
int off, flags, error = 0; /* Keep compiler happy */
struct mbuf *m;
struct ip *ip = NULL;
+#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
+#endif
struct tcphdr *th;
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
unsigned ipsec_optlen = 0;
#endif
int idle, sendalot;
int sack_rxmit, sack_bytes_rxmt;
struct sackhole *p;
int tso, mtu;
struct tcpopt to;
#if 0
int maxburst = TCP_MAXBURST;
#endif
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
#endif
#ifdef TCP_RFC7413
/*
* For TFO connections in SYN_RECEIVED, only allow the initial
* SYN|ACK and those sent by the retransmit timer.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
(tp->snd_nxt != tp->snd_una)) /* not a retransmit */
return (0);
#endif
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
cc_after_idle(tp);
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
if (tp->t_flags & TF_MORETOCOME) {
tp->t_flags |= TF_LASTIDLE;
idle = 0;
}
}
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
tso = 0;
mtu = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in SACK recovery, reset the rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
uint32_t cwin;
cwin =
imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
goto after_sack_rexmit;
} else
/* Can rexmit part of the current hole */
len = ((int32_t)ulmin(cwin,
tp->snd_recover - p->rxmit));
} else
len = ((int32_t)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
KASSERT(off >= 0,("%s: sack block to the left of una : %d",
__func__, off));
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
TCPSTAT_INC(tcps_sack_rexmits);
TCPSTAT_ADD(tcps_sack_rexmit_bytes,
min(len, tp->t_maxseg));
}
}
after_sack_rexmit:
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
*/
if (tp->t_flags & TF_NEEDFIN)
flags |= TH_FIN;
if (tp->t_flags & TF_NEEDSYN)
flags |= TH_SYN;
SOCKBUF_LOCK(&so->so_snd);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_flags & TF_FORCEDATA) {
if (sendwin == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < sbused(&so->so_snd))
flags &= ~TH_FIN;
sendwin = 1;
} else {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
}
/*
* If snd_nxt == snd_max and we have transmitted a FIN, the
* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
* a negative length. This can also occur when TCP opens up
* its congestion window while receiving additional duplicate
* acks after fast-retransmit because TCP will reset snd_nxt
* to snd_max after the fast-retransmit.
*
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
*
* If sack_rxmit is true we are retransmitting from the scoreboard
* in which case len is already set.
*/
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
off);
else {
int32_t cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
off);
/*
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = imin(len, cwin);
}
}
}
/*
* Lop off SYN bit if it has already been sent. However, if this
* is SYN-SENT state and if segment contains data and if we don't
* know that foreign host supports TAO, suppress sending segment.
*/
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
if (tp->t_state != TCPS_SYN_RECEIVED)
flags &= ~TH_SYN;
#ifdef TCP_RFC7413
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED))
flags &= ~TH_SYN;
#endif
off--, len++;
}
/*
* Be careful not to send data and/or FIN on SYN segments.
* This measure is needed to prevent interoperability problems
* with not fully conformant TCP implementations.
*/
if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
len = 0;
flags &= ~TH_FIN;
}
#ifdef TCP_RFC7413
/*
* When retransmitting SYN|ACK on a passively-created TFO socket,
* don't include data, as the presence of data may have caused the
* original SYN|ACK to have been dropped by a middlebox.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
(flags & TH_RST)))
len = 0;
#endif
if (len <= 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be < 0. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*
* We also do a general check here to ensure that
* we will set the persist timer when we have data
* to send, but a 0-byte window. This makes sure
* the persist timer is set even if the packet
* hits one of the "goto send" lines below.
*/
len = 0;
if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
(off < (int) sbavail(&so->so_snd))) {
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (!tcp_timer_active(tp, TT_PERSIST))
tcp_setpersist(tp);
}
}
/* len will be >= 0 after this point. */
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
tcp_sndbuf_autoscale(tp, so, sendwin);
/*
* Decide if we can use TCP Segmentation Offloading (if supported by
* hardware).
*
* TSO may only be used if we are in a pure bulk sending state. The
* presence of TCP-MD5, SACK retransmits, SACK advertisements and
* IP options prevent using TSO. With TSO the TCP header is the same
* (except for the sequence number) for all generated packets. This
* makes it impossible to transmit any options which vary per generated
* segment or packet.
*
* IPv4 handling has a clear separation of ip options and ip header
* flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
* the right thing below to provide length of just ip options and thus
* checking for ipoptlen is enough to decide if ip options are present.
*/
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Pre-calculate here as we save another lookup into the darknesses
* of IPsec that way and can actually decide if TSO is ok.
*/
#ifdef INET6
if (isipv6 && IPSEC_ENABLED(ipv6))
ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
#ifdef INET
else
#endif
#endif /* INET6 */
#ifdef INET
if (IPSEC_ENABLED(ipv4))
ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
#endif /* INET */
#endif /* IPSEC */
#ifdef INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
else
#endif
if (tp->t_inpcb->inp_options)
ipoptlen = tp->t_inpcb->inp_options->m_len -
offsetof(struct ipoption, ipopt_list);
else
ipoptlen = 0;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
ipoptlen += ipsec_optlen;
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0)
tso = 1;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
flags &= ~TH_FIN;
} else {
if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
sbused(&so->so_snd)))
flags &= ~TH_FIN;
}
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
(long)TCP_MAXWIN << tp->rcv_scale);
/*
* Sender silly window avoidance. We transmit under the following
* conditions when len is non-zero:
*
* - We have a full segment (or more with TSO)
* - This is the last buffer in a write()/send() and we are
* either idle or running NODELAY
* - we've timed out (e.g. persist timer)
* - we have more than 1/2 the maximum send window's worth of
* data (the receiver may be limited by the window size)
* - we need to retransmit
*/
if (len) {
if (len >= tp->t_maxseg)
goto send;
/*
* NOTE! on localhost connections an 'ack' from the remote
* end may occur synchronously with the output and cause
* us to flush a buffer queued with moretocome. XXX
*
* note: the len + off check is almost certainly unnecessary.
*/
if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
(idle || (tp->t_flags & TF_NODELAY)) &&
(uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) &&
(tp->t_flags & TF_NOPUSH) == 0) {
goto send;
}
if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
goto send;
if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
if (sack_rxmit)
goto send;
}
/*
* Sending of standalone window updates.
*
* Window updates are important when we close our window due to a
* full socket buffer and are opening it again after the application
* reads data from it. Once the window has opened again and the
* remote end starts to send again the ACK clock takes over and
* provides the most current window information.
*
* We must avoid the silly window syndrome, whereby every read
* from the receive buffer, no matter how small, causes a window
* update to be sent. We should also avoid sending a flurry of
* window updates when the socket buffer has queued a lot of data
* and the application is doing small reads.
*
* Prevent a flurry of pointless window updates by only sending
* an update when we can increase the advertised window by more
* than 1/4th of the socket buffer capacity. When the buffer is
* getting full or is very small be more aggressive and send an
* update whenever we can increase by two mss sized segments.
* In all other situations the ACK's to new incoming data will
* carry further window increases.
*
* Don't send an independent window update if a delayed
* ACK is pending (it will get piggy-backed on it) or the
* remote side already has done a half-close and won't send
* more data. Skip this if the connection is in T/TCP
* half-open state.
*/
if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
!(tp->t_flags & TF_DELACK) &&
!TCPS_HAVERCVDFIN(tp->t_state)) {
/*
* "adv" is the amount we could increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
int32_t adv;
int oldwin;
adv = recwin;
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
oldwin = (tp->rcv_adv - tp->rcv_nxt);
adv -= oldwin;
} else
oldwin = 0;
/*
* If the new window size ends up being the same as or less
* than the old size when it is scaled, then don't force
* a window update.
*/
if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
goto dontupdate;
if (adv >= (int32_t)(2 * tp->t_maxseg) &&
(adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
recwin <= (so->so_rcv.sb_hiwat / 8) ||
so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
goto send;
if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat)
goto send;
}
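/*
* Worked example (illustrative numbers): with so_rcv.sb_hiwat = 65536 and
* t_maxseg = 1460, the test above sends an update once adv >= 2 * 1460 and
* either adv >= 65536 / 4 = 16384 or recwin <= 65536 / 8 = 8192;
* independently, 2 * adv >= 65536 also forces an update. Smaller increases
* ride on the ACKs for new incoming data.
*/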
dontupdate:
/*
* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
* is also a catch-all for the retransmit timer timeout case.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if ((flags & TH_RST) ||
((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, then we need to send.
*/
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_GT(tp->snd_max, tp->snd_una) &&
!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable; rather, a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tcp_timer_active(tp, TT_PERSIST)
* is true when we are in persist state.
* (tp->t_flags & TF_FORCEDATA)
* is set when we are called to send a persist packet.
* tcp_timer_active(tp, TT_REXMT)
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
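/*
* Concrete scenario (illustrative): the peer advertises a zero window while
* sbavail() is non-zero and no retransmit is pending, so tcp_setpersist()
* above arms TT_PERSIST. When the persist timer fires, tcp_output() is
* re-entered with TF_FORCEDATA set and the code near the top forces
* sendwin = 1, producing the 1-byte window probe.
*/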
/*
* No reason to send a segment, just return.
*/
just_return:
SOCKBUF_UNLOCK(&so->so_snd);
return (0);
send:
SOCKBUF_LOCK_ASSERT(&so->so_snd);
if (len > 0) {
if (len >= tp->t_maxseg)
tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
else
tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
}
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set not to do any options.
* NOTE: we assume that the IP/TCP header plus TCP options
* always fit in a single mbuf, leaving room for a maximum
* link header, i.e.
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
*/
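/*
* Size sketch (illustrative, assuming a 16-byte max_linkhdr): even a fully
* optioned IPv4 segment header fits comfortably in one mbuf cluster, since
* 16 + sizeof(struct tcpiphdr) (40) + TCP_MAXOLEN (40) = 96 <= MCLBYTES.
*/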
optlen = 0;
#ifdef INET6
if (isipv6)
hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
else
#endif
hdrlen = sizeof (struct tcpiphdr);
/*
* Compute options for segment.
* We only have to care about SYN and established connection
* segments. Options for SYN-ACK segments are handled in TCP
* syncache.
*/
to.to_flags = 0;
if ((tp->t_flags & TF_NOOPT) == 0) {
/* Maximum segment size. */
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
to.to_flags |= TOF_MSS;
#ifdef TCP_RFC7413
/*
* Only include the TFO option on the first
* transmission of the SYN|ACK on a
* passively-created TFO socket, as the presence of
* the TFO option may have caused the original
* SYN|ACK to have been dropped by a middlebox.
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_rxtshift == 0)) {
to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
to.to_flags |= TOF_FASTOPEN;
}
#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
to.to_wscale = tp->request_r_scale;
to.to_flags |= TOF_SCALE;
}
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = tcp_ts_getticks();
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
to.to_flags |= TOF_SACKPERM;
else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) &&
tp->rcv_numsacks > 0) {
to.to_flags |= TOF_SACK;
to.to_nsacks = tp->rcv_numsacks;
to.to_sacks = (u_char *)tp->sackblks;
}
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
/* TCP-MD5 (RFC2385). */
/*
* Check that TCP_MD5SIG is enabled in tcpcb to
* account the size needed to set this TCP option.
*/
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
}
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxseg length.
* Clear the FIN bit because we cut off the tail of
* the segment.
*/
if (len + optlen + ipoptlen > tp->t_maxseg) {
flags &= ~TH_FIN;
if (tso) {
u_int if_hw_tsomax;
u_int if_hw_tsomaxsegcount;
u_int if_hw_tsomaxsegsize;
struct mbuf *mb;
u_int moff;
int max_len;
/* extract TSO information */
if_hw_tsomax = tp->t_tsomax;
if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
/*
* Limit a TSO burst to prevent it from
* overflowing or exceeding the maximum length
* allowed by the network interface:
*/
KASSERT(ipoptlen == 0,
("%s: TSO can't do IP options", __func__));
/*
* Check if we should limit by maximum payload
* length:
*/
if (if_hw_tsomax != 0) {
/* compute maximum TSO length */
max_len = (if_hw_tsomax - hdrlen -
max_linkhdr);
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
sendalot = 1;
len = max_len;
}
}
/*
* Check if we should limit by maximum segment
* size and count:
*/
if (if_hw_tsomaxsegcount != 0 &&
if_hw_tsomaxsegsize != 0) {
/*
* Subtract one segment for the LINK
* and TCP/IP headers mbuf that will
* be prepended to this mbuf chain
* after the code in this section
* limits the number of mbufs in the
* chain to if_hw_tsomaxsegcount.
*/
if_hw_tsomaxsegcount -= 1;
max_len = 0;
mb = sbsndmbuf(&so->so_snd, off, &moff);
while (mb != NULL && max_len < len) {
u_int mlen;
u_int frags;
/*
* Get length of mbuf fragment
* and how many hardware frags,
* rounded up, it would use:
*/
mlen = (mb->m_len - moff);
frags = howmany(mlen,
if_hw_tsomaxsegsize);
/* Handle special case: Zero Length Mbuf */
if (frags == 0)
frags = 1;
/*
* Check if the fragment limit
* will be reached or exceeded:
*/
if (frags >= if_hw_tsomaxsegcount) {
max_len += min(mlen,
if_hw_tsomaxsegcount *
if_hw_tsomaxsegsize);
break;
}
max_len += mlen;
if_hw_tsomaxsegcount -= frags;
moff = 0;
mb = mb->m_next;
}
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
sendalot = 1;
len = max_len;
}
}
/*
* Prevent the last segment from being
* fractional unless the send sockbuf can be
* emptied:
*/
max_len = (tp->t_maxseg - optlen);
if (((uint32_t)off + (uint32_t)len) <
sbavail(&so->so_snd)) {
moff = len % max_len;
if (moff != 0) {
len -= moff;
sendalot = 1;
}
}
/*
* In case there are too many small fragments
* don't use TSO:
*/
if (len <= max_len) {
len = max_len;
sendalot = 1;
tso = 0;
}
/*
* Send the FIN in a separate segment
* after the bulk sending is done.
* We don't trust the TSO implementations
* to clear the FIN flag on all but the
* last segment.
*/
if (tp->t_flags & TF_NEEDFIN)
sendalot = 1;
} else {
len = tp->t_maxseg - optlen - ipoptlen;
sendalot = 1;
}
} else
tso = 0;
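/*
* Worked example (illustrative): with t_maxseg = 1460 and optlen = 12
* (timestamps), max_len = 1448 above; a 100000-byte TSO burst that does not
* drain the socket buffer is trimmed by 100000 % 1448 = 88 bytes so that
* only the final pass may send a fractional segment.
*/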
KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
("%s: len > IP_MAXPACKET", __func__));
/*#ifdef DIAGNOSTIC*/
#ifdef INET6
if (max_linkhdr + hdrlen > MCLBYTES)
#else
if (max_linkhdr + hdrlen > MHLEN)
#endif
panic("tcphdr too big");
/*#endif*/
/*
* This KASSERT is here to catch edge cases at a well defined place.
* Before, those had triggered (random) panic conditions further down.
*/
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
struct mbuf *mb;
u_int moff;
if ((tp->t_flags & TF_FORCEDATA) && len == 1)
TCPSTAT_INC(tcps_sndprobe);
else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
tp->t_sndrexmitpack++;
TCPSTAT_INC(tcps_sndrexmitpack);
TCPSTAT_ADD(tcps_sndrexmitbyte, len);
} else {
TCPSTAT_INC(tcps_sndpack);
TCPSTAT_ADD(tcps_sndbyte, len);
}
#ifdef INET6
if (MHLEN < hdrlen + max_linkhdr)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
#endif
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* Start the m_copy functions from the closest mbuf
* to the offset in the socket buffer chain.
*/
mb = sbsndptr(&so->so_snd, off, len, &moff);
if (len <= MHLEN - hdrlen - max_linkhdr) {
m_copydata(mb, moff, len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
} else {
m->m_next = m_copym(mb, moff, len, M_NOWAIT);
if (m->m_next == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
(void) m_free(m);
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
}
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) &&
!(flags & TH_SYN))
flags |= TH_PUSH;
SOCKBUF_UNLOCK(&so->so_snd);
} else {
SOCKBUF_UNLOCK(&so->so_snd);
if (tp->t_flags & TF_ACKNOW)
TCPSTAT_INC(tcps_sndacks);
else if (flags & (TH_SYN|TH_FIN|TH_RST))
TCPSTAT_INC(tcps_sndctrl);
else if (SEQ_GT(tp->snd_up, tp->snd_una))
TCPSTAT_INC(tcps_sndurg);
else
TCPSTAT_INC(tcps_sndwinup);
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
sack_rxmit = 0;
goto out;
}
#ifdef INET6
if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
MHLEN >= hdrlen) {
M_ALIGN(m, hdrlen);
} else
#endif
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_inpcb_create_mbuf(tp->t_inpcb, m);
#endif
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(tp->t_inpcb, ip6, th);
} else
#endif /* INET6 */
{
ip = mtod(m, struct ip *);
+#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
+#endif
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(tp->t_inpcb, ip, th);
}
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
* If resending a FIN, be sure not to use a new sequence number.
*/
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
flags |= TH_ECE|TH_CWR;
} else
flags |= TH_ECE|TH_CWR;
}
if (tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & TF_ECN_PERMIT)) {
/*
* If the peer has ECN, mark data packets with
* ECN capable transmission (ECT).
* Ignore pure ack packets, retransmissions and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!((tp->t_flags & TF_FORCEDATA) && len == 1)) {
#ifdef INET6
if (isipv6)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
else
#endif
ip->ip_tos |= IPTOS_ECN_ECT0;
TCPSTAT_INC(tcps_ecn_ect0);
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE)
flags |= TH_ECE;
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN)) ||
tcp_timer_active(tp, TT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tp->sackhint.sack_bytes_rexmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (recwin < (so->so_rcv.sb_hiwat / 4) &&
recwin < tp->t_maxseg)
recwin = 0;
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
recwin < (tp->rcv_adv - tp->rcv_nxt))
recwin = (tp->rcv_adv - tp->rcv_nxt);
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
* case is handled in syncache.
*/
if (flags & TH_SYN)
th->th_win = htons((u_short)
(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
else
th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
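/*
* Worked example (illustrative): with rcv_scale = 6, a receive window of
* recwin = 262144 bytes is advertised as 262144 >> 6 = 4096; a SYN, by
* contrast, always carries the unscaled value capped at TCP_MAXWIN (65535),
* as required by RFC 1323.
*/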
/*
* Adjust the RXWIN0SENT flag - indicate that we have advertised
* a 0 window. This may cause the remote transmitter to stall. This
* flag tells soreceive() to disable delayed acknowledgements when
* draining the buffer. This can occur if the receiver is attempting
* to read more data than can be buffered prior to transmitting on
* the connection.
*/
if (th->th_win == 0) {
tp->t_sndzerowin++;
tp->t_flags |= TF_RXWIN0SENT;
} else
tp->t_flags &= ~TF_RXWIN0SENT;
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
/*
* Put TCP length in extended header, and then
* checksum extended header and data.
*/
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (to.to_flags & TOF_SIGNATURE) {
/*
* Calculate MD5 signature and put it into the place
* determined before.
* NOTE: since TCP options buffer doesn't point into
* mbuf's data, calculate offset and use it.
*/
if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th,
(u_char *)(th + 1) + (to.to_signature - opt))) != 0) {
/*
* Do not send segment if the calculation of MD5
* digest has failed.
*/
m_freem(m);
goto out;
}
}
#endif
#ifdef INET6
if (isipv6) {
/*
* There is no need to fill in ip6_plen right now.
* It will be filled later by ip6_output.
*/
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
optlen + len, IPPROTO_TCP, 0);
}
#endif
#if defined(INET6) && defined(INET)
else
#endif
#ifdef INET
{
m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
}
#endif
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
*/
if (tso) {
KASSERT(len > tp->t_maxseg - optlen,
("%s: len <= tso_segsz", __func__));
m->m_pkthdr.csum_flags |= CSUM_TSO;
m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
}
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
__func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
#else
KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
__func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
#endif
#ifdef TCP_HHOOK
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
#ifdef TCPDEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG) {
u_short save = 0;
#ifdef INET6
if (!isipv6)
#endif
{
save = ipov->ih_len;
ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
}
tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
#ifdef INET6
if (!isipv6)
#endif
ipov->ih_len = save;
}
#endif /* TCPDEBUG */
TCP_PROBE3(debug__output, tp, th, m);
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
/*
* m->m_pkthdr.len should have been set before checksum calculation,
* because in6_cksum() needs it.
*/
#ifdef INET6
if (isipv6) {
/*
* we separately set hoplimit for every segment, since the
* user might want to change the value via setsockopt.
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/*
* Set the packet size here for the benefit of DTrace probes.
* ip6_output() will set it properly; it's supposed to include
* the option header lengths as well.
*/
ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
tp->t_flags2 |= TF2_PLPMTU_PMTUD;
else
tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
if (tp->t_state == TCPS_SYN_SENT)
TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
TCP_PROBE5(send, NULL, tp, ip6, tp, th);
#ifdef TCPPCAP
/* Save packet, if requested. */
tcp_pcap_add(th, m, &(tp->t_outpkts));
#endif
/* TODO: IPv6 IP6TOS_ECT bit on */
error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
&tp->t_inpcb->inp_route6,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, NULL, tp->t_inpcb);
if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL)
mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu;
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
{
ip->ip_len = htons(m->m_pkthdr.len);
#ifdef INET6
if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
* If we do path MTU discovery, then we set DF on every packet.
* This might not be the best thing to do according to RFC3390
* Section 2. However, the tcp hostcache mitigates the problem
* so it affects only the first tcp connection with a host.
*
* NB: Don't set DF on small MTU/MSS to have a safe fallback.
*/
if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
ip->ip_off |= htons(IP_DF);
tp->t_flags2 |= TF2_PLPMTU_PMTUD;
} else {
tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
}
if (tp->t_state == TCPS_SYN_SENT)
TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
TCP_PROBE5(send, NULL, tp, ip, tp, th);
#ifdef TCPPCAP
/* Save packet, if requested. */
tcp_pcap_add(th, m, &(tp->t_outpkts));
#endif
error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL)
mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu;
}
#endif /* INET */
out:
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
*/
if (flags & (TH_SYN|TH_FIN)) {
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
}
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = ticks;
tp->t_rtseq = startseq;
TCPSTAT_INC(tcps_segstimed);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing a pure ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (!tcp_timer_active(tp, TT_REXMT) &&
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
(tp->snd_nxt != tp->snd_una))) {
if (tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
} else if (len == 0 && sbavail(&so->so_snd) &&
!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
/*
* Avoid a situation where we do not set persist timer
* after a zero window condition. For example:
* 1) A -> B: packet with enough data to fill the window
* 2) B -> A: ACK for #1 + new data (0 window
* advertisement)
* 3) A -> B: ACK for #2, 0 len packet
*
* In this case, A will not activate the persist timer,
* because it chose to send a packet. Unless tcp_output
* is called for some other reason (delayed ack timer,
* another input packet from B, socket syscall), A will
* not send zero window probes.
*
* So, if you send a 0-length packet, but there is data
* in the socket buffer, and neither the rexmt nor the
* persist timer is already set, then activate the
* persist timer.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
} else {
/*
* Persist case, update snd_max but since we are in
* persist mode (no window) we do not update snd_nxt.
*/
int xlen = len;
if (flags & TH_SYN)
++xlen;
if (flags & TH_FIN) {
++xlen;
tp->t_flags |= TF_SENTFIN;
}
if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
tp->snd_max = tp->snd_nxt + xlen;
}
if (error) {
/*
* We know that the packet was lost, so back out the
* sequence number advance, if any.
*
* If the error is EPERM the packet got blocked by the
* local firewall. Normally we should terminate the
* connection but the blocking may have been spurious
* due to a firewall reconfiguration cycle. So we treat
* it like a packet loss and let the retransmit timer and
* timeouts do their work over time.
* XXX: It is a POLA question whether calling tcp_drop right
* away would be the really correct behavior instead.
*/
if (((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) &&
((flags & TH_SYN) == 0) &&
(error != EPERM)) {
if (sack_rxmit) {
p->rxmit -= len;
tp->sackhint.sack_bytes_rexmit -= len;
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
} else
tp->snd_nxt -= len;
}
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
switch (error) {
case EACCES:
tp->t_softerror = error;
return (0);
case EPERM:
tp->t_softerror = error;
return (error);
case ENOBUFS:
TCP_XMIT_TIMER_ASSERT(tp, len, flags);
tp->snd_cwnd = tp->t_maxseg;
return (0);
case EMSGSIZE:
/*
* For some reason the interface we used initially
* to send segments changed to another or lowered
* its MTU.
* If TSO was active we either got an interface
* without TSO capabilities or TSO was turned off.
* If we obtained mtu from ip_output() then update
* it and try again.
*/
if (tso)
tp->t_flags &= ~TF_TSO;
if (mtu != 0) {
tcp_mss_update(tp, -1, mtu, NULL, NULL);
goto again;
}
return (error);
case EHOSTDOWN:
case EHOSTUNREACH:
case ENETDOWN:
case ENETUNREACH:
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
return (0);
}
/* FALLTHROUGH */
default:
return (error);
}
}
TCPSTAT_INC(tcps_sndtotal);
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
if (tcp_timer_active(tp, TT_DELACK))
tcp_timer_activate(tp, TT_DELACK, 0);
#if 0
/*
* This completely breaks TCP if newreno is turned on. What happens
* is that if delayed-acks are enabled on the receiver, this code
* on the transmitter effectively destroys the TCP window, forcing
* it to four packets (1.5Kx4 = 6K window).
*/
if (sendalot && --maxburst)
goto again;
#endif
if (sendalot)
goto again;
return (0);
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
int tt;
tp->t_flags &= ~TF_PREVVALID;
if (tcp_timer_active(tp, TT_REXMT))
panic("tcp_setpersist: retransmit pending");
/*
* Start/restart persistence timer.
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
tcp_persmin, tcp_persmax);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
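A quick, non-authoritative illustration of the persist-timer arithmetic in tcp_setpersist() above: the base interval is derived from the smoothed RTT state, scaled by an exponential backoff table indexed by the shift counter, and clamped in the spirit of TCPT_RANGESET(). The clamp values and backoff table below are assumed example values, not the kernel's tunables.
#include <stdio.h>

#define PERSMIN	5			/* assumed lower clamp, in ticks */
#define PERSMAX	60			/* assumed upper clamp, in ticks */

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64 };

/* Mirror of the interval computation: base * backoff, clamped to a range. */
static int
persist_interval(int srtt, int rttvar, int rxtshift)
{
	int t = ((srtt >> 2) + rttvar) >> 1;	/* base interval from RTT state */
	int tt = t * backoff[rxtshift];		/* exponential backoff */

	if (tt < PERSMIN)			/* clamp, like TCPT_RANGESET() */
		tt = PERSMIN;
	else if (tt > PERSMAX)
		tt = PERSMAX;
	return (tt);
}

int
main(void)
{
	/* Show how the interval grows and then saturates as rxtshift rises. */
	for (int shift = 0; shift < 7; shift++)
		printf("rxtshift %d -> %d ticks\n", shift,
		    persist_interval(8, 2, shift));
	return (0);
}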
/*
* Insert TCP options according to the supplied parameters at the location
* optp in a consistent way. Can handle unaligned destinations.
*
* The order of the option processing is crucial for optimal packing and
* alignment for the scarce option space.
*
* The optimal order for a SYN/SYN-ACK segment is:
* MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
* Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
*
* The SACK options should be last. SACK blocks consume 8*n+2 bytes.
* So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
* At minimum we need 10 bytes (to generate 1 SACK block). If both
* TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
* we only have 10 bytes for SACK options (40 - (12 + 18)).
*/
int
tcp_addoptions(struct tcpopt *to, u_char *optp)
{
u_int32_t mask, optlen = 0;
for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
if ((to->to_flags & mask) != mask)
continue;
if (optlen == TCP_MAXOLEN)
break;
switch (to->to_flags & mask) {
case TOF_MSS:
while (optlen % 4) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
continue;
optlen += TCPOLEN_MAXSEG;
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
to->to_mss = htons(to->to_mss);
bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
optp += sizeof(to->to_mss);
break;
case TOF_SCALE:
while (!optlen || optlen % 2 != 1) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
continue;
optlen += TCPOLEN_WINDOW;
*optp++ = TCPOPT_WINDOW;
*optp++ = TCPOLEN_WINDOW;
*optp++ = to->to_wscale;
break;
case TOF_SACKPERM:
while (optlen % 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
continue;
optlen += TCPOLEN_SACK_PERMITTED;
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
break;
case TOF_TS:
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
continue;
optlen += TCPOLEN_TIMESTAMP;
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
to->to_tsval = htonl(to->to_tsval);
to->to_tsecr = htonl(to->to_tsecr);
bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
optp += sizeof(to->to_tsval);
bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
optp += sizeof(to->to_tsecr);
break;
case TOF_SIGNATURE:
{
int siglen = TCPOLEN_SIGNATURE - 2;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) {
to->to_flags &= ~TOF_SIGNATURE;
continue;
}
optlen += TCPOLEN_SIGNATURE;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
to->to_signature = optp;
while (siglen--)
*optp++ = 0;
break;
}
case TOF_SACK:
{
int sackblks = 0;
struct sackblk *sack = (struct sackblk *)to->to_sacks;
tcp_seq sack_seq;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
continue;
optlen += TCPOLEN_SACKHDR;
*optp++ = TCPOPT_SACK;
sackblks = min(to->to_nsacks,
(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
while (sackblks--) {
sack_seq = htonl(sack->start);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
sack_seq = htonl(sack->end);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
optlen += TCPOLEN_SACK;
sack++;
}
TCPSTAT_INC(tcps_sack_send_blocks);
break;
}
#ifdef TCP_RFC7413
case TOF_FASTOPEN:
{
int total_len;
/* XXX is there any point to aligning this option? */
total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
if (TCP_MAXOLEN - optlen < total_len)
continue;
*optp++ = TCPOPT_FAST_OPEN;
*optp++ = total_len;
if (to->to_tfo_len > 0) {
bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
optp += to->to_tfo_len;
}
optlen += total_len;
break;
}
#endif
default:
panic("%s: unknown TCP option type", __func__);
break;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
return (optlen);
}
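The option-space accounting in the comment above tcp_addoptions() can be checked with a few lines of arithmetic: with timestamps (padded to 12 bytes) and an 18-byte signature present, 10 of the 40 option bytes remain, which is enough for exactly one SACK block. A minimal sketch; the constants below are restated from that comment, not pulled from the headers.
#include <stdio.h>

#define OPT_MAX		40	/* maximum TCP option space */
#define OPT_TS_PADDED	12	/* timestamp option padded to a 4-byte boundary */
#define OPT_SIG		18	/* signature option, per the comment above */
#define SACK_HDR	2	/* SACK option kind + length */
#define SACK_BLOCK	8	/* one SACK block: two 32-bit sequence edges */

int
main(void)
{
	int used = OPT_TS_PADDED + OPT_SIG;
	int left = OPT_MAX - used;

	printf("%d bytes used, %d left, room for %d SACK block(s)\n",
	    used, left, (left - SACK_HDR) / SACK_BLOCK);
	return (0);
}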
void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{
/*
* Automatic sizing of send socket buffer. Often the send buffer
* size is not optimally adjusted to the actual network conditions
* at hand (delay bandwidth product). Setting the buffer size too
* small limits throughput on links with high bandwidth and high
* delay (e.g., trans-continental/oceanic links). Setting the
* buffer size too big consumes too much real kernel memory,
* especially with many connections on busy servers.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However, testing has shown this not
* to be much of a problem. At worst we are trading unused
* available bandwidth for some wasted socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer. Has to
* wait for upcoming tcp timer rewrite.
*
* XXXGL: should there be used sbused() or sbavail()?
*/
if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
int lowat;
lowat = V_tcp_sendbuf_auto_lowat ? so->so_snd.sb_lowat : 0;
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat - lowat &&
sbused(&so->so_snd) >=
(so->so_snd.sb_hiwat / 8 * 7) - lowat &&
sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
sendwin >= (sbused(&so->so_snd) -
(tp->snd_nxt - tp->snd_una))) {
if (!sbreserve_locked(&so->so_snd,
min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
V_tcp_autosndbuf_max), so, curthread))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
}
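As a rough restatement of the four step-up criteria listed in the comment above, the decision reduces to a single predicate. The function and values below are illustrative stand-ins for the socket-buffer fields (the low-water adjustment is omitted for brevity), not the kernel structures.
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical restatement of the step-up test; parameter names are made up. */
static bool
should_grow_sndbuf(unsigned peer_rwnd, unsigned hiwat, unsigned used,
    unsigned automax, unsigned sendwin, unsigned unacked)
{
	return (peer_rwnd / 4 * 5 >= hiwat &&	/* 1: peer window beats buffer (5/4 fudge) */
	    used >= hiwat / 8 * 7 &&		/* 2: buffer is at least 7/8 full */
	    used < automax &&			/* 3: below the automatic maximum */
	    sendwin >= used - unacked);		/* 4: send window covers queued data */
}

int
main(void)
{
	printf("grow: %d\n", should_grow_sndbuf(128 * 1024, 64 * 1024,
	    60 * 1024, 2 * 1024 * 1024, 80 * 1024, 10 * 1024));
	return (0);
}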
Index: head/sys/netinet6/in6_mcast.c
===================================================================
--- head/sys/netinet6/in6_mcast.c (revision 327172)
+++ head/sys/netinet6/in6_mcast.c (revision 327173)
@@ -1,2836 +1,2833 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009 Bruce Simpson.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* IPv6 multicast socket, group, and socket option processing module.
* Normative references: RFC 2292, RFC 3493, RFC 3542, RFC 3678, RFC 3810.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/priv.h>
#include <sys/ktr.h>
#include <sys/tree.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/scope6_var.h>
#ifndef KTR_MLD
#define KTR_MLD KTR_INET6
#endif
#ifndef __SOCKUNION_DECLARED
union sockunion {
struct sockaddr_storage ss;
struct sockaddr sa;
struct sockaddr_dl sdl;
struct sockaddr_in6 sin6;
};
typedef union sockunion sockunion_t;
#define __SOCKUNION_DECLARED
#endif /* __SOCKUNION_DECLARED */
static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
"IPv6 multicast PCB-layer source filter");
static MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
"IPv6 multicast MLD-layer source filter");
RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
/*
* Locking:
* - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
* - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
* it can be taken by code in net/if.c also.
* - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
*
* struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
* any need for in6_multi itself to be virtualized -- it is bound to an ifp
* anyway no matter what happens.
*/
struct mtx in6_multi_mtx;
MTX_SYSINIT(in6_multi_mtx, &in6_multi_mtx, "in6_multi_mtx", MTX_DEF);
static void im6f_commit(struct in6_mfilter *);
static int im6f_get_source(struct in6_mfilter *imf,
const struct sockaddr_in6 *psin,
struct in6_msource **);
static struct in6_msource *
im6f_graft(struct in6_mfilter *, const uint8_t,
const struct sockaddr_in6 *);
static void im6f_leave(struct in6_mfilter *);
static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
static void im6f_purge(struct in6_mfilter *);
static void im6f_rollback(struct in6_mfilter *);
static void im6f_reap(struct in6_mfilter *);
static int im6o_grow(struct ip6_moptions *);
static size_t im6o_match_group(const struct ip6_moptions *,
const struct ifnet *, const struct sockaddr *);
static struct in6_msource *
im6o_match_source(const struct ip6_moptions *, const size_t,
const struct sockaddr *);
static void im6s_merge(struct ip6_msource *ims,
const struct in6_msource *lims, const int rollback);
static int in6_mc_get(struct ifnet *, const struct in6_addr *,
struct in6_multi **);
static int in6m_get_source(struct in6_multi *inm,
const struct in6_addr *addr, const int noalloc,
struct ip6_msource **pims);
#ifdef KTR
static int in6m_is_ifp_detached(const struct in6_multi *);
#endif
static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
static void in6m_purge(struct in6_multi *);
static void in6m_reap(struct in6_multi *);
static struct ip6_moptions *
in6p_findmoptions(struct inpcb *);
static int in6p_get_source_filters(struct inpcb *, struct sockopt *);
static int in6p_join_group(struct inpcb *, struct sockopt *);
static int in6p_leave_group(struct inpcb *, struct sockopt *);
static struct ifnet *
in6p_lookup_mcast_ifp(const struct inpcb *,
const struct sockaddr_in6 *);
static int in6p_block_unblock_source(struct inpcb *, struct sockopt *);
static int in6p_set_multicast_if(struct inpcb *, struct sockopt *);
static int in6p_set_source_filters(struct inpcb *, struct sockopt *);
static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */
static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0,
"IPv6 multicast");
static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
"Max source filters per group");
static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
"Max source filters per socket");
/* TODO Virtualize this switch. */
int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
&in6_mcast_loop, 0, "Loopback multicast datagrams by default");
static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
"Per-interface stack-wide source filters");
#ifdef KTR
/*
* Inline function which wraps assertions for a valid ifp.
* The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
* is detached.
*/
static int __inline
in6m_is_ifp_detached(const struct in6_multi *inm)
{
struct ifnet *ifp;
KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
ifp = inm->in6m_ifma->ifma_ifp;
if (ifp != NULL) {
/*
* Sanity check that network-layer notion of ifp is the
* same as that of link-layer.
*/
KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
}
return (ifp == NULL);
}
#endif
/*
* Initialize an in6_mfilter structure to a known state at t0, t1
* with an empty source filter list.
*/
static __inline void
im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
{
memset(imf, 0, sizeof(struct in6_mfilter));
RB_INIT(&imf->im6f_sources);
imf->im6f_st[0] = st0;
imf->im6f_st[1] = st1;
}
/*
* Resize the ip6_moptions vector to the next power-of-two minus 1.
* May be called with locks held; do not sleep.
*/
static int
im6o_grow(struct ip6_moptions *imo)
{
struct in6_multi **nmships;
struct in6_multi **omships;
struct in6_mfilter *nmfilters;
struct in6_mfilter *omfilters;
size_t idx;
size_t newmax;
size_t oldmax;
nmships = NULL;
nmfilters = NULL;
omships = imo->im6o_membership;
omfilters = imo->im6o_mfilters;
oldmax = imo->im6o_max_memberships;
newmax = ((oldmax + 1) * 2) - 1;
if (newmax <= IPV6_MAX_MEMBERSHIPS) {
nmships = (struct in6_multi **)realloc(omships,
sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT);
nmfilters = (struct in6_mfilter *)realloc(omfilters,
sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER,
M_NOWAIT);
if (nmships != NULL && nmfilters != NULL) {
/* Initialize newly allocated source filter heads. */
for (idx = oldmax; idx < newmax; idx++) {
im6f_init(&nmfilters[idx], MCAST_UNDEFINED,
MCAST_EXCLUDE);
}
imo->im6o_max_memberships = newmax;
imo->im6o_membership = nmships;
imo->im6o_mfilters = nmfilters;
}
}
if (nmships == NULL || nmfilters == NULL) {
if (nmships != NULL)
free(nmships, M_IP6MOPTS);
if (nmfilters != NULL)
free(nmfilters, M_IN6MFILTER);
return (ETOOMANYREFS);
}
return (0);
}
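A small sketch of the growth rule used by im6o_grow() above, newmax = ((oldmax + 1) * 2) - 1, which walks through sizes of the form 2^n - 1. The starting size and upper bound below are assumed example values rather than the kernel constants.
#include <stdio.h>

#define START_MEMBERSHIPS	31	/* assumed initial vector size */
#define LIMIT_MEMBERSHIPS	4095	/* assumed upper bound */

int
main(void)
{
	size_t max = START_MEMBERSHIPS;

	/* Same growth rule as im6o_grow(). */
	while (((max + 1) * 2) - 1 <= LIMIT_MEMBERSHIPS) {
		max = ((max + 1) * 2) - 1;
		printf("grew to %zu memberships\n", max);
	}
	return (0);
}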
/*
* Find an IPv6 multicast group entry for this ip6_moptions instance
* which matches the specified group, and optionally an interface.
* Return its index into the array, or -1 if not found.
*/
static size_t
im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
const struct sockaddr *group)
{
const struct sockaddr_in6 *gsin6;
struct in6_multi **pinm;
int idx;
int nmships;
gsin6 = (const struct sockaddr_in6 *)group;
/* The im6o_membership array may be lazy allocated. */
if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0)
return (-1);
nmships = imo->im6o_num_memberships;
pinm = &imo->im6o_membership[0];
for (idx = 0; idx < nmships; idx++, pinm++) {
if (*pinm == NULL)
continue;
if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) &&
IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr,
&gsin6->sin6_addr)) {
break;
}
}
if (idx >= nmships)
idx = -1;
return (idx);
}
/*
* Find an IPv6 multicast source entry for this imo which matches
* the given group index for this socket, and source address.
*
* XXX TODO: The scope ID, if present in src, is stripped before
* any comparison. We SHOULD enforce scope/zone checks where the source
* filter entry has a link scope.
*
* NOTE: This does not check if the entry is in-mode, merely if
* it exists, which may not be the desired behaviour.
*/
static struct in6_msource *
im6o_match_source(const struct ip6_moptions *imo, const size_t gidx,
const struct sockaddr *src)
{
struct ip6_msource find;
struct in6_mfilter *imf;
struct ip6_msource *ims;
const sockunion_t *psa;
KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships,
("%s: invalid index %d\n", __func__, (int)gidx));
/* The im6o_mfilters array may be lazy allocated. */
if (imo->im6o_mfilters == NULL)
return (NULL);
imf = &imo->im6o_mfilters[gidx];
psa = (const sockunion_t *)src;
find.im6s_addr = psa->sin6.sin6_addr;
in6_clearscope(&find.im6s_addr); /* XXX */
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
return ((struct in6_msource *)ims);
}
/*
* Perform filtering for multicast datagrams on a socket by group and source.
*
* Returns 0 if a datagram should be allowed through, or various error codes
* if the socket was not a member of the group, or the source was muted, etc.
*/
int
im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
const struct sockaddr *group, const struct sockaddr *src)
{
size_t gidx;
struct in6_msource *ims;
int mode;
KASSERT(ifp != NULL, ("%s: null ifp", __func__));
gidx = im6o_match_group(imo, ifp, group);
if (gidx == -1)
return (MCAST_NOTGMEMBER);
/*
* Check if the source was included in an (S,G) join.
* Allow reception on exclusive memberships by default,
* reject reception on inclusive memberships by default.
* Exclude source only if an in-mode exclude filter exists.
* Include source only if an in-mode include filter exists.
* NOTE: We are comparing group state here at MLD t1 (now)
* with socket-layer t0 (since last downcall).
*/
mode = imo->im6o_mfilters[gidx].im6f_st[1];
ims = im6o_match_source(imo, gidx, src);
if ((ims == NULL && mode == MCAST_INCLUDE) ||
(ims != NULL && ims->im6sl_st[0] != mode))
return (MCAST_NOTSMEMBER);
return (MCAST_PASS);
}
/*
* Find and return a reference to an in6_multi record for (ifp, group),
* and bump its reference count.
* If one does not exist, try to allocate it, and update link-layer multicast
* filters on ifp to listen for group.
* Assumes the IN6_MULTI lock is held across the call.
* Return 0 if successful, otherwise return an appropriate error code.
*/
static int
in6_mc_get(struct ifnet *ifp, const struct in6_addr *group,
struct in6_multi **pinm)
{
struct sockaddr_in6 gsin6;
struct ifmultiaddr *ifma;
struct in6_multi *inm;
int error;
error = 0;
/*
* XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
* if_addmulti() takes this mutex itself, so we must drop and
* re-acquire around the call.
*/
IN6_MULTI_LOCK_ASSERT();
IF_ADDR_WLOCK(ifp);
inm = in6m_lookup_locked(ifp, group);
if (inm != NULL) {
/*
* If we already joined this group, just bump the
* refcount and return it.
*/
KASSERT(inm->in6m_refcount >= 1,
("%s: bad refcount %d", __func__, inm->in6m_refcount));
++inm->in6m_refcount;
*pinm = inm;
goto out_locked;
}
memset(&gsin6, 0, sizeof(gsin6));
gsin6.sin6_family = AF_INET6;
gsin6.sin6_len = sizeof(struct sockaddr_in6);
gsin6.sin6_addr = *group;
/*
* Check if a link-layer group is already associated
* with this network-layer group on the given ifnet.
*/
IF_ADDR_WUNLOCK(ifp);
error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
if (error != 0)
return (error);
IF_ADDR_WLOCK(ifp);
/*
* If something other than netinet6 is occupying the link-layer
* group, print a meaningful error message and back out of
* the allocation.
* Otherwise, bump the refcount on the existing network-layer
* group association and return it.
*/
if (ifma->ifma_protospec != NULL) {
inm = (struct in6_multi *)ifma->ifma_protospec;
#ifdef INVARIANTS
KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
__func__));
KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
("%s: ifma not AF_INET6", __func__));
KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
panic("%s: ifma %p is inconsistent with %p (%p)",
__func__, ifma, inm, group);
#endif
++inm->in6m_refcount;
*pinm = inm;
goto out_locked;
}
IF_ADDR_WLOCK_ASSERT(ifp);
/*
* A new in6_multi record is needed; allocate and initialize it.
* We DO NOT perform an MLD join as the in6_ layer may need to
* push an initial source list down to MLD to support SSM.
*
* The initial source filter state is INCLUDE, {} as per the RFC.
* Pending state-changes per group are subject to a bounds check.
*/
inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
if (inm == NULL) {
IF_ADDR_WUNLOCK(ifp);
if_delmulti_ifma(ifma);
return (ENOMEM);
}
inm->in6m_addr = *group;
inm->in6m_ifp = ifp;
inm->in6m_mli = MLD_IFINFO(ifp);
inm->in6m_ifma = ifma;
inm->in6m_refcount = 1;
inm->in6m_state = MLD_NOT_MEMBER;
mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
RB_INIT(&inm->in6m_srcs);
ifma->ifma_protospec = inm;
*pinm = inm;
out_locked:
IF_ADDR_WUNLOCK(ifp);
return (error);
}
/*
* Drop a reference to an in6_multi record.
*
* If the refcount drops to 0, free the in6_multi record and
* delete the underlying link-layer membership.
*/
void
in6m_release_locked(struct in6_multi *inm)
{
struct ifmultiaddr *ifma;
IN6_MULTI_LOCK_ASSERT();
CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
if (--inm->in6m_refcount > 0) {
CTR2(KTR_MLD, "%s: refcount is now %d", __func__,
inm->in6m_refcount);
return;
}
CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
ifma = inm->in6m_ifma;
/* XXX this access is not covered by IF_ADDR_LOCK */
CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
KASSERT(ifma->ifma_protospec == inm,
("%s: ifma_protospec != inm", __func__));
ifma->ifma_protospec = NULL;
in6m_purge(inm);
free(inm, M_IP6MADDR);
if_delmulti_ifma(ifma);
}
/*
* Clear recorded source entries for a group.
* Used by the MLD code. Caller must hold the IN6_MULTI lock.
* FIXME: Should reap.
*/
void
in6m_clear_recorded(struct in6_multi *inm)
{
struct ip6_msource *ims;
IN6_MULTI_LOCK_ASSERT();
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
if (ims->im6s_stp) {
ims->im6s_stp = 0;
--inm->in6m_st[1].iss_rec;
}
}
KASSERT(inm->in6m_st[1].iss_rec == 0,
("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
}
/*
* Record a source as pending for a Source-Group MLDv2 query.
* This lives here as it modifies the shared tree.
*
* inm is the group descriptor.
* naddr is the address of the source to record in network-byte order.
*
* If the net.inet6.mld.sgalloc sysctl is non-zero, we will
* lazy-allocate a source node in response to an SG query.
* Otherwise, no allocation is performed. This saves some memory
* with the trade-off that the source will not be reported to the
* router if joined in the window between the query response and
* the group actually being joined on the local host.
*
* VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
* This turns off the allocation of a recorded source entry if
* the group has not been joined.
*
* Return 0 if the source didn't exist or was already marked as recorded.
* Return 1 if the source was marked as recorded by this function.
* Return <0 if any error occurred (negated errno code).
*/
int
in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
IN6_MULTI_LOCK_ASSERT();
find.im6s_addr = *addr;
ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
if (ims && ims->im6s_stp)
return (0);
if (ims == NULL) {
if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
return (-ENOSPC);
nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (-ENOMEM);
nims->im6s_addr = find.im6s_addr;
RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
++inm->in6m_nsrc;
ims = nims;
}
/*
* Mark the source as recorded and update the recorded
* source count.
*/
++ims->im6s_stp;
++inm->in6m_st[1].iss_rec;
return (1);
}
/*
* Return a pointer to an in6_msource owned by an in6_mfilter,
* given its source address.
* Lazy-allocate if needed. If this is a new entry its filter state is
* undefined at t0.
*
* imf is the filter set being modified.
* addr is the source address.
*
* SMPng: May be called with locks held; malloc must not block.
*/
static int
im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
struct in6_msource **plims)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
struct in6_msource *lims;
int error;
error = 0;
ims = NULL;
lims = NULL;
find.im6s_addr = psin->sin6_addr;
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
lims = (struct in6_msource *)ims;
if (lims == NULL) {
if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
return (ENOSPC);
nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (ENOMEM);
lims = (struct in6_msource *)nims;
lims->im6s_addr = find.im6s_addr;
lims->im6sl_st[0] = MCAST_UNDEFINED;
RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
++imf->im6f_nsrc;
}
*plims = lims;
return (error);
}
/*
* Graft a source entry into an existing socket-layer filter set,
* maintaining any required invariants and checking allocations.
*
* The source is marked as being in the new filter mode at t1.
*
* Return the pointer to the new node, otherwise return NULL.
*/
static struct in6_msource *
im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
const struct sockaddr_in6 *psin)
{
struct ip6_msource *nims;
struct in6_msource *lims;
nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (NULL);
lims = (struct in6_msource *)nims;
lims->im6s_addr = psin->sin6_addr;
lims->im6sl_st[0] = MCAST_UNDEFINED;
lims->im6sl_st[1] = st1;
RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
++imf->im6f_nsrc;
return (lims);
}
/*
* Prune a source entry from an existing socket-layer filter set,
* maintaining any required invariants and checking allocations.
*
* The source is marked as being left at t1, it is not freed.
*
* Return 0 if no error occurred, otherwise return an errno value.
*/
static int
im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
{
struct ip6_msource find;
struct ip6_msource *ims;
struct in6_msource *lims;
find.im6s_addr = psin->sin6_addr;
ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
if (ims == NULL)
return (ENOENT);
lims = (struct in6_msource *)ims;
lims->im6sl_st[1] = MCAST_UNDEFINED;
return (0);
}
/*
* Revert socket-layer filter set deltas at t1 to t0 state.
*/
static void
im6f_rollback(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
struct in6_msource *lims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
/* no change at t1 */
continue;
} else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
/* revert change to existing source at t1 */
lims->im6sl_st[1] = lims->im6sl_st[0];
} else {
/* revert source added t1 */
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
}
imf->im6f_st[1] = imf->im6f_st[0];
}
/*
* Mark socket-layer filter set as INCLUDE {} at t1.
*/
static void
im6f_leave(struct in6_mfilter *imf)
{
struct ip6_msource *ims;
struct in6_msource *lims;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
lims->im6sl_st[1] = MCAST_UNDEFINED;
}
imf->im6f_st[1] = MCAST_INCLUDE;
}
/*
* Mark socket-layer filter set deltas as committed.
*/
static void
im6f_commit(struct in6_mfilter *imf)
{
struct ip6_msource *ims;
struct in6_msource *lims;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
lims->im6sl_st[0] = lims->im6sl_st[1];
}
imf->im6f_st[0] = imf->im6f_st[1];
}
/*
* Reap unreferenced sources from socket-layer filter set.
*/
static void
im6f_reap(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
struct in6_msource *lims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
lims = (struct in6_msource *)ims;
if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
(lims->im6sl_st[1] == MCAST_UNDEFINED)) {
CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
}
}
/*
* Purge socket-layer filter set.
*/
static void
im6f_purge(struct in6_mfilter *imf)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
free(ims, M_IN6MFILTER);
imf->im6f_nsrc--;
}
imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
KASSERT(RB_EMPTY(&imf->im6f_sources),
("%s: im6f_sources not empty", __func__));
}
/*
* Look up a source filter entry for a multicast group.
*
* inm is the group descriptor to work with.
* addr is the IPv6 address to look up.
* noalloc may be non-zero to suppress allocation of sources.
* *pims will be set to the address of the retrieved or allocated source.
*
* SMPng: NOTE: may be called with locks held.
* Return 0 if successful, otherwise return a non-zero error code.
*/
static int
in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
const int noalloc, struct ip6_msource **pims)
{
struct ip6_msource find;
struct ip6_msource *ims, *nims;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
find.im6s_addr = *addr;
ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
if (ims == NULL && !noalloc) {
if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
return (ENOSPC);
nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
M_NOWAIT | M_ZERO);
if (nims == NULL)
return (ENOMEM);
nims->im6s_addr = *addr;
RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
++inm->in6m_nsrc;
ims = nims;
CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
ip6_sprintf(ip6tbuf, addr), ims);
}
*pims = ims;
return (0);
}
/*
* Merge socket-layer source into MLD-layer source.
* If rollback is non-zero, perform the inverse of the merge.
*/
static void
im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
const int rollback)
{
int n = rollback ? -1 : 1;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
ip6_sprintf(ip6tbuf, &lims->im6s_addr);
#endif
if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].ex -= n;
} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].in -= n;
}
if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].ex += n;
} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
ims->im6s_st[1].in += n;
}
}
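The "n = rollback ? -1 : 1" trick in im6s_merge() above means a rollback is literally the inverse of the original merge. The standalone sketch below demonstrates that applying the delta and then its inverse restores the per-source counters; the struct and enum are illustrations, not the kernel's ip6_msource or filter-mode constants.
#include <stdio.h>

enum mode { UNDEFINED, INCLUDE, EXCLUDE };

struct counters { int in, ex; };

static void
merge(struct counters *c, enum mode st0, enum mode st1, int rollback)
{
	int n = rollback ? -1 : 1;

	if (st0 == EXCLUDE)		/* drop the old (t0) contribution */
		c->ex -= n;
	else if (st0 == INCLUDE)
		c->in -= n;
	if (st1 == EXCLUDE)		/* add the new (t1) contribution */
		c->ex += n;
	else if (st1 == INCLUDE)
		c->in += n;
}

int
main(void)
{
	struct counters c = { .in = 3, .ex = 1 };

	merge(&c, INCLUDE, EXCLUDE, 0);	/* a source moves include -> exclude */
	printf("after merge:    in=%d ex=%d\n", c.in, c.ex);
	merge(&c, INCLUDE, EXCLUDE, 1);	/* inverse merge undoes it */
	printf("after rollback: in=%d ex=%d\n", c.in, c.ex);
	return (0);
}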
/*
* Atomically update the global in6_multi state, when a membership's
* filter list is being updated in any way.
*
* imf is the per-inpcb-membership group filter pointer.
* A fake imf may be passed for in-kernel consumers.
*
* XXX This is a candidate for a set-symmetric-difference style loop
* which would eliminate the repeated lookup from root of ims nodes,
* as they share the same key space.
*
* If any error occurred this function will back out of refcounts
* and return a non-zero value.
*/
static int
in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
struct ip6_msource *ims, *nims;
struct in6_msource *lims;
int schanged, error;
int nsrc0, nsrc1;
schanged = 0;
error = 0;
nsrc1 = nsrc0 = 0;
/*
* Update the source filters first, as this may fail.
* Maintain count of in-mode filters at t0, t1. These are
* used to work out if we transition into ASM mode or not.
* Maintain a count of source filters whose state was
* actually modified by this operation.
*/
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
++schanged;
if (error)
break;
im6s_merge(nims, lims, 0);
}
if (error) {
struct ip6_msource *bims;
RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == lims->im6sl_st[1])
continue;
(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
if (bims == NULL)
continue;
im6s_merge(bims, lims, 1);
}
goto out_reap;
}
CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
__func__, nsrc0, nsrc1);
/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
if (imf->im6f_st[0] == imf->im6f_st[1] &&
imf->im6f_st[1] == MCAST_INCLUDE) {
if (nsrc1 == 0) {
CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
--inm->in6m_st[1].iss_in;
}
}
/* Handle filter mode transition on socket. */
if (imf->im6f_st[0] != imf->im6f_st[1]) {
CTR3(KTR_MLD, "%s: imf transition %d to %d",
__func__, imf->im6f_st[0], imf->im6f_st[1]);
if (imf->im6f_st[0] == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
--inm->in6m_st[1].iss_ex;
} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
--inm->in6m_st[1].iss_in;
}
if (imf->im6f_st[1] == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
inm->in6m_st[1].iss_ex++;
} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
inm->in6m_st[1].iss_in++;
}
}
/*
* Track inm filter state in terms of listener counts.
* If there are any exclusive listeners, stack-wide
* membership is exclusive.
* Otherwise, if only inclusive listeners, stack-wide is inclusive.
* If no listeners remain, state is undefined at t1,
* and the MLD lifecycle for this group should finish.
*/
if (inm->in6m_st[1].iss_ex > 0) {
CTR1(KTR_MLD, "%s: transition to EX", __func__);
inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
} else if (inm->in6m_st[1].iss_in > 0) {
CTR1(KTR_MLD, "%s: transition to IN", __func__);
inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
} else {
CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
}
/* Decrement ASM listener count on transition out of ASM mode. */
if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
(imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
--inm->in6m_st[1].iss_asm;
}
}
/* Increment ASM listener count on transition to ASM mode. */
if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
inm->in6m_st[1].iss_asm++;
}
CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
in6m_print(inm);
out_reap:
if (schanged > 0) {
CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
in6m_reap(inm);
}
return (error);
}
/*
* Mark an in6_multi's filter set deltas as committed.
* Called by MLD after a state change has been enqueued.
*/
void
in6m_commit(struct in6_multi *inm)
{
struct ip6_msource *ims;
CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
CTR1(KTR_MLD, "%s: pre commit:", __func__);
in6m_print(inm);
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
ims->im6s_st[0] = ims->im6s_st[1];
}
inm->in6m_st[0] = inm->in6m_st[1];
}
/*
* Reap unreferenced nodes from an in6_multi's filter set.
*/
static void
in6m_reap(struct in6_multi *inm)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
ims->im6s_stp != 0)
continue;
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
free(ims, M_IP6MSOURCE);
inm->in6m_nsrc--;
}
}
/*
* Purge all source nodes from an in6_multi's filter set.
*/
static void
in6m_purge(struct in6_multi *inm)
{
struct ip6_msource *ims, *tims;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
free(ims, M_IP6MSOURCE);
inm->in6m_nsrc--;
}
/* Free state-change requests that might be queued. */
mbufq_drain(&inm->in6m_scq);
}
/*
* Join a multicast address w/o sources.
* KAME compatibility entry point.
*
* SMPng: Assume no mc locks held by caller.
*/
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr,
int *errorp, int delay)
{
struct in6_multi_mship *imm;
int error;
imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT);
if (imm == NULL) {
*errorp = ENOBUFS;
return (NULL);
}
delay = (delay * PR_FASTHZ) / hz;
error = in6_mc_join(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay);
if (error) {
*errorp = error;
free(imm, M_IP6MADDR);
return (NULL);
}
return (imm);
}
/*
* Leave a multicast address w/o sources.
* KAME compatibility entry point.
*
* SMPng: Assume no mc locks held by caller.
*/
int
in6_leavegroup(struct in6_multi_mship *imm)
{
if (imm->i6mm_maddr != NULL)
in6_mc_leave(imm->i6mm_maddr, NULL);
free(imm, M_IP6MADDR);
return 0;
}
/*
* Join a multicast group; unlocked entry point.
*
* SMPng: XXX: in6_mc_join() is called from in6_control() when upper
* locks are not held. Fortunately, ifp is unlikely to have been detached
* at this point, so we assume it's OK to recurse.
*/
int
in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr,
/*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
const int delay)
{
int error;
IN6_MULTI_LOCK();
error = in6_mc_join_locked(ifp, mcaddr, imf, pinm, delay);
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Join a multicast group; real entry point.
*
* Only preserves atomicity at inm level.
* NOTE: imf argument cannot be const due to sys/tree.h limitations.
*
* If the MLD downcall fails, the group is not joined, and an error
* code is returned.
*/
int
in6_mc_join_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
/*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
const int delay)
{
struct in6_mfilter timf;
struct in6_multi *inm;
int error;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
#ifdef INVARIANTS
/*
* Sanity: Check scope zone ID was set for ifp, if and
* only if group is scoped to an interface.
*/
KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
("%s: not a multicast address", __func__));
if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
KASSERT(mcaddr->s6_addr16[1] != 0,
("%s: scope zone ID not set", __func__));
}
#endif
IN6_MULTI_LOCK_ASSERT();
CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
error = 0;
inm = NULL;
/*
* If no imf was specified (i.e. kernel consumer),
* fake one up and assume it is an ASM join.
*/
if (imf == NULL) {
im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
imf = &timf;
}
error = in6_mc_get(ifp, mcaddr, &inm);
if (error) {
CTR1(KTR_MLD, "%s: in6_mc_get() failure", __func__);
return (error);
}
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error) {
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
goto out_in6m_release;
}
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, delay);
if (error) {
CTR1(KTR_MLD, "%s: failed to update source", __func__);
goto out_in6m_release;
}
out_in6m_release:
if (error) {
CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
in6m_release_locked(inm);
} else {
*pinm = inm;
}
return (error);
}
/*
* Leave a multicast group; unlocked entry point.
*/
int
in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
- struct ifnet *ifp;
int error;
-
- ifp = inm->in6m_ifp;
IN6_MULTI_LOCK();
error = in6_mc_leave_locked(inm, imf);
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Leave a multicast group; real entry point.
* All source filters will be expunged.
*
* Only preserves atomicity at inm level.
*
* Holding the write lock for the INP which contains imf
* is highly advisable. We can't assert for it as imf does not
* contain a back-pointer to the owning inp.
*
* Note: This is not the same as in6m_release(*) as this function also
* makes a state change downcall into MLD.
*/
int
in6_mc_leave_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
{
struct in6_mfilter timf;
int error;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
error = 0;
IN6_MULTI_LOCK_ASSERT();
CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
(in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
imf);
/*
* If no imf was specified (i.e. kernel consumer),
* fake one up and assume it is an ASM join.
*/
if (imf == NULL) {
im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
imf = &timf;
}
/*
* Begin state merge transaction at MLD layer.
*
* As this particular invocation should not cause any memory
* to be allocated, and there is no opportunity to roll back
* the transaction, it MUST NOT fail.
*/
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
in6m_release_locked(inm);
return (error);
}
/*
* Block or unblock an ASM multicast source on an inpcb.
* This implements the delta-based API described in RFC 3678.
*
* The delta-based API applies only to exclusive-mode memberships.
* An MLD downcall will be performed.
*
* SMPng: NOTE: Must take Giant as a join may create a new ifma.
*
* Return 0 if successful, otherwise return an appropriate error code.
*/
static int
in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
{
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_msource *ims;
struct in6_multi *inm;
size_t idx;
uint16_t fmode;
int error, doblock;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
ifp = NULL;
error = 0;
doblock = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
ssa = (sockunion_t *)&gsr.gsr_source;
switch (sopt->sopt_name) {
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
doblock = 1;
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
/*
* Check if we are actually a member of this group.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
KASSERT(imo->im6o_mfilters != NULL,
("%s: im6o_mfilters not allocated", __func__));
imf = &imo->im6o_mfilters[idx];
inm = imo->im6o_membership[idx];
/*
* Attempting to use the delta-based API on an
* non exclusive-mode membership is an error.
*/
fmode = imf->im6f_st[0];
if (fmode != MCAST_EXCLUDE) {
error = EINVAL;
goto out_in6p_locked;
}
/*
* Deal with error cases up-front:
* Asked to block, but already blocked; or
* Asked to unblock, but nothing to unblock.
* If adding a new block entry, allocate it.
*/
ims = im6o_match_source(imo, idx, &ssa->sa);
if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
doblock ? "" : "not ");
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
INP_WLOCK_ASSERT(inp);
/*
* Begin state merge transaction at socket layer.
*/
if (doblock) {
CTR2(KTR_MLD, "%s: %s source", __func__, "block");
ims = im6f_graft(imf, fmode, &ssa->sin6);
if (ims == NULL)
error = ENOMEM;
} else {
CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
error = im6f_prune(imf, &ssa->sin6);
}
if (error) {
CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
goto out_im6f_rollback;
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
}
IN6_MULTI_UNLOCK();
out_im6f_rollback:
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
/*
* Given an inpcb, return its multicast options structure pointer. Accepts
* an unlocked inpcb pointer, but will return it locked. May sleep.
*
* SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
* SMPng: NOTE: Returns with the INP write lock held.
*/
static struct ip6_moptions *
in6p_findmoptions(struct inpcb *inp)
{
struct ip6_moptions *imo;
struct in6_multi **immp;
struct in6_mfilter *imfp;
size_t idx;
INP_WLOCK(inp);
if (inp->in6p_moptions != NULL)
return (inp->in6p_moptions);
INP_WUNLOCK(inp);
imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS,
M_WAITOK | M_ZERO);
imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS,
M_IN6MFILTER, M_WAITOK);
imo->im6o_multicast_ifp = NULL;
imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
imo->im6o_multicast_loop = in6_mcast_loop;
imo->im6o_num_memberships = 0;
imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
imo->im6o_membership = immp;
/* Initialize per-group source filters. */
for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++)
im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
imo->im6o_mfilters = imfp;
INP_WLOCK(inp);
if (inp->in6p_moptions != NULL) {
free(imfp, M_IN6MFILTER);
free(immp, M_IP6MOPTS);
free(imo, M_IP6MOPTS);
return (inp->in6p_moptions);
}
inp->in6p_moptions = imo;
return (imo);
}
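in6p_findmoptions() above follows a common "unlock, allocate, relock, recheck" pattern because the allocation may sleep while the inpcb lock cannot be held across it. The pthread-based sketch below is a generic userland illustration of that pattern under assumed types; it is not the kernel's locking or allocation primitives.
#include <pthread.h>
#include <stdlib.h>

struct pcb {
	pthread_mutex_t	lock;
	void		*moptions;
};

/* Returns with the lock held, mirroring the SMPng note above. */
static void *
find_moptions(struct pcb *p)
{
	void *fresh;

	pthread_mutex_lock(&p->lock);
	if (p->moptions != NULL)
		return (p->moptions);
	pthread_mutex_unlock(&p->lock);

	/* Stand-in for a sleeping M_WAITOK allocation; NULL check omitted. */
	fresh = calloc(1, 64);

	pthread_mutex_lock(&p->lock);
	if (p->moptions != NULL) {	/* lost the race: discard ours */
		free(fresh);
		return (p->moptions);
	}
	p->moptions = fresh;
	return (p->moptions);
}

int
main(void)
{
	struct pcb p = { .lock = PTHREAD_MUTEX_INITIALIZER, .moptions = NULL };

	find_moptions(&p);
	pthread_mutex_unlock(&p.lock);	/* caller eventually drops the lock */
	return (0);
}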
/*
* Discard the IPv6 multicast options (and source filters).
*
* SMPng: NOTE: assumes INP write lock is held.
*/
void
ip6_freemoptions(struct ip6_moptions *imo)
{
struct in6_mfilter *imf;
size_t idx, nmships;
KASSERT(imo != NULL, ("%s: ip6_moptions is NULL", __func__));
nmships = imo->im6o_num_memberships;
for (idx = 0; idx < nmships; ++idx) {
imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL;
if (imf)
im6f_leave(imf);
/* XXX this will thrash the lock(s) */
(void)in6_mc_leave(imo->im6o_membership[idx], imf);
if (imf)
im6f_purge(imf);
}
if (imo->im6o_mfilters)
free(imo->im6o_mfilters, M_IN6MFILTER);
free(imo->im6o_membership, M_IP6MOPTS);
free(imo, M_IP6MOPTS);
}
/*
* Atomically get source filters on a socket for an IPv6 multicast group.
* Called with INP lock held; returns with lock released.
*/
static int
in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct ip6_moptions *imo;
struct in6_mfilter *imf;
struct ip6_msource *ims;
struct in6_msource *lims;
struct sockaddr_in6 *psin;
struct sockaddr_storage *ptss;
struct sockaddr_storage *tss;
int error;
size_t idx, nsrcs, ncsrcs;
INP_WLOCK_ASSERT(inp);
imo = inp->in6p_moptions;
KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_group.ss_family != AF_INET6 ||
msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
return (EINVAL);
gsa = (sockunion_t *)&msfr.msfr_group;
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
INP_WLOCK(inp);
/*
* Lookup group on the socket.
*/
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
INP_WUNLOCK(inp);
return (EADDRNOTAVAIL);
}
imf = &imo->im6o_mfilters[idx];
/*
* Ignore memberships which are in limbo.
*/
if (imf->im6f_st[1] == MCAST_UNDEFINED) {
INP_WUNLOCK(inp);
return (EAGAIN);
}
msfr.msfr_fmode = imf->im6f_st[1];
/*
* If the user specified a buffer, copy out the source filter
* entries to userland gracefully.
* We only copy out the number of entries which userland
* has asked for, but we always tell userland how big the
* buffer really needs to be.
*/
if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
tss = NULL;
if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_NOWAIT | M_ZERO);
if (tss == NULL) {
INP_WUNLOCK(inp);
return (ENOBUFS);
}
}
/*
* Count number of sources in-mode at t0.
* If buffer space exists and remains, copy out source entries.
*/
nsrcs = msfr.msfr_nsrcs;
ncsrcs = 0;
ptss = tss;
RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
lims = (struct in6_msource *)ims;
if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
lims->im6sl_st[0] != imf->im6f_st[0])
continue;
++ncsrcs;
if (tss != NULL && nsrcs > 0) {
psin = (struct sockaddr_in6 *)ptss;
psin->sin6_family = AF_INET6;
psin->sin6_len = sizeof(struct sockaddr_in6);
psin->sin6_addr = lims->im6s_addr;
psin->sin6_port = 0;
--nsrcs;
++ptss;
}
}
INP_WUNLOCK(inp);
if (tss != NULL) {
error = copyout(tss, msfr.msfr_srcs,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
free(tss, M_TEMP);
if (error)
return (error);
}
msfr.msfr_nsrcs = ncsrcs;
error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
return (error);
}
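For reference, a minimal userland sketch of exercising this get path. It assumes the RFC 3678 getsourcefilter(3) wrapper (the counterpart of the setsourcefilter() API mentioned further below), which is expected to issue the __msfilterreq request this handler services; the helper name, the 16-entry buffer, and the omitted error checks are illustrative only, not part of this change.

#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
dump_filter(int s, const char *ifname, const char *group)
{
	struct sockaddr_in6 grp;
	struct sockaddr_storage slist[16];
	uint32_t fmode, nsrcs;
	u_int i;
	char buf[INET6_ADDRSTRLEN];

	memset(&grp, 0, sizeof(grp));
	grp.sin6_family = AF_INET6;
	grp.sin6_len = sizeof(grp);
	(void)inet_pton(AF_INET6, group, &grp.sin6_addr);

	nsrcs = 16;	/* in: slist capacity; out: total number of sources */
	if (getsourcefilter(s, if_nametoindex(ifname),
	    (struct sockaddr *)&grp, sizeof(grp), &fmode, &nsrcs,
	    slist) != 0) {
		perror("getsourcefilter");
		return;
	}
	printf("mode: %s, %u source(s)\n",
	    fmode == MCAST_INCLUDE ? "include" : "exclude", nsrcs);
	for (i = 0; i < nsrcs && i < 16; i++) {
		struct sockaddr_in6 *src = (struct sockaddr_in6 *)&slist[i];

		printf("  %s\n",
		    inet_ntop(AF_INET6, &src->sin6_addr, buf, sizeof(buf)));
	}
}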
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
{
struct ip6_moptions *im6o;
int error;
u_int optval;
INP_WLOCK(inp);
im6o = inp->in6p_moptions;
/*
* If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
INP_WUNLOCK(inp);
return (EOPNOTSUPP);
}
error = 0;
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
optval = 0;
} else {
optval = im6o->im6o_multicast_ifp->if_index;
}
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MULTICAST_HOPS:
if (im6o == NULL)
optval = V_ip6_defmcasthlim;
else
optval = im6o->im6o_multicast_hlim;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MULTICAST_LOOP:
if (im6o == NULL)
optval = in6_mcast_loop; /* XXX VIMAGE */
else
optval = im6o->im6o_multicast_loop;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(u_int));
break;
case IPV6_MSFILTER:
if (im6o == NULL) {
error = EADDRNOTAVAIL;
INP_WUNLOCK(inp);
} else {
error = in6p_get_source_filters(inp, sopt);
}
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
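A small illustrative sketch of reading these options back from userland; the u_int sizes mirror what the switch above copies out, and the helper name is hypothetical.

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

static void
show_mcast_defaults(int s)
{
	u_int ifindex, hops, loop;
	socklen_t len;

	len = sizeof(ifindex);	/* 0 means "no interface selected" */
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, &len) == 0)
		printf("IPV6_MULTICAST_IF:   %u\n", ifindex);
	len = sizeof(hops);
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, &len) == 0)
		printf("IPV6_MULTICAST_HOPS: %u\n", hops);
	len = sizeof(loop);
	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, &len) == 0)
		printf("IPV6_MULTICAST_LOOP: %u\n", loop);
}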
/*
* Look up the ifnet to use for a multicast group membership,
* given the address of an IPv6 group.
*
* This routine exists to support legacy IPv6 multicast applications.
*
* If inp is non-NULL, use this socket's current FIB number for any
* required FIB lookup. Look up the group address in the unicast FIB,
* and use its ifp; usually, this points to the default next-hop.
* If the FIB lookup fails, return NULL.
*
* FUTURE: Support multiple forwarding tables for IPv6.
*
* Returns NULL if no ifp could be found.
*/
static struct ifnet *
in6p_lookup_mcast_ifp(const struct inpcb *in6p,
const struct sockaddr_in6 *gsin6)
{
struct nhop6_basic nh6;
struct in6_addr dst;
uint32_t scopeid;
uint32_t fibnum;
KASSERT(in6p->inp_vflag & INP_IPV6,
("%s: not INP_IPV6 inpcb", __func__));
KASSERT(gsin6->sin6_family == AF_INET6,
("%s: not AF_INET6 group", __func__));
in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
return (NULL);
return (nh6.nh_ifp);
}
/*
* Join an IPv6 multicast group, possibly with a source.
*
* FIXME: The KAME use of the unspecified address (::)
* to join *all* multicast groups is currently unsupported.
*/
static int
in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
{
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_multi *inm;
struct in6_msource *lims;
size_t idx;
int error, is_new;
ifp = NULL;
imf = NULL;
lims = NULL;
error = 0;
is_new = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
/*
* Chew everything into struct group_source_req.
* Overwrite the port field if present, as the sockaddr
* being copied in may be matched with a binary comparison.
* Ignore passed-in scope ID.
*/
switch (sopt->sopt_name) {
case IPV6_JOIN_GROUP: {
struct ipv6_mreq mreq;
error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
sizeof(struct ipv6_mreq));
if (error)
return (error);
gsa->sin6.sin6_family = AF_INET6;
gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
if (mreq.ipv6mr_interface == 0) {
ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
} else {
if (V_if_index < mreq.ipv6mr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(mreq.ipv6mr_interface);
}
CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
__func__, mreq.ipv6mr_interface, ifp);
} break;
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_JOIN_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
return (EINVAL);
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&ssa->sin6.sin6_addr);
ssa->sin6.sin6_port = 0;
ssa->sin6.sin6_scope_id = 0;
}
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
return (EADDRNOTAVAIL);
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
/*
* Always set the scope zone ID on memberships created from userland.
* Use the passed-in ifp to do this.
* XXX The in6_setscope() return value is meaningless.
* XXX SCOPE6_LOCK() is taken by in6_setscope().
*/
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1) {
is_new = 1;
} else {
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
if (ssa->ss.ss_family != AF_UNSPEC) {
/*
* MCAST_JOIN_SOURCE_GROUP on an exclusive membership
* is an error. On an existing inclusive membership,
* it just adds the source to the filter list.
*/
if (imf->im6f_st[1] != MCAST_INCLUDE) {
error = EINVAL;
goto out_in6p_locked;
}
/*
* Throw out duplicates.
*
* XXX FIXME: This makes a naive assumption that
* even if entries exist for *ssa in this imf,
* they will be rejected as dupes, even if they
* are not valid in the current mode (in-mode).
*
* in6_msource is transactioned just as for anything
* else in SSM -- but note naive use of in6m_graft()
* below for allocating new filter entries.
*
* This is only an issue if someone mixes the
* full-state SSM API with the delta-based API,
* which is discouraged in the relevant RFCs.
*/
lims = im6o_match_source(imo, idx, &ssa->sa);
if (lims != NULL /*&&
lims->im6sl_st[1] == MCAST_INCLUDE*/) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
} else {
/*
* MCAST_JOIN_GROUP alone, on any existing membership,
* is rejected, to stop the same inpcb tying up
* multiple refs to the in_multi.
* On an existing inclusive membership, this is also
* an error; if you want to change filter mode,
* you must use the userland API setsourcefilter().
* XXX We don't reject this for imf in UNDEFINED
* state at t1, because allocation of a filter
* is atomic with allocation of a membership.
*/
error = EINVAL;
goto out_in6p_locked;
}
}
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
if (is_new) {
if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
error = im6o_grow(imo);
if (error)
goto out_in6p_locked;
}
/*
* Allocate the new slot upfront so we can deal with
* grafting the new source filter in same code path
* as for join-source on existing membership.
*/
idx = imo->im6o_num_memberships;
imo->im6o_membership[idx] = NULL;
imo->im6o_num_memberships++;
KASSERT(imo->im6o_mfilters != NULL,
("%s: im6f_mfilters vector was not allocated", __func__));
imf = &imo->im6o_mfilters[idx];
KASSERT(RB_EMPTY(&imf->im6f_sources),
("%s: im6f_sources not empty", __func__));
}
/*
* Graft new source into filter list for this inpcb's
* membership of the group. The in6_multi may not have
* been allocated yet if this is a new membership, however,
* the in_mfilter slot will be allocated and must be initialized.
*
* Note: Grafting of exclusive mode filters doesn't happen
* in this path.
* XXX: Should check for non-NULL lims (node exists but may
* not be in-mode) for interop with full-state API.
*/
if (ssa->ss.ss_family != AF_UNSPEC) {
/* Membership starts in IN mode */
if (is_new) {
CTR1(KTR_MLD, "%s: new join w/source", __func__);
im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
} else {
CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
}
lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
if (lims == NULL) {
CTR1(KTR_MLD, "%s: merge imf state failed",
__func__);
error = ENOMEM;
goto out_im6o_free;
}
} else {
/* No address specified; Membership starts in EX mode */
if (is_new) {
CTR1(KTR_MLD, "%s: new join w/o source", __func__);
im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
}
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
if (is_new) {
error = in6_mc_join_locked(ifp, &gsa->sin6.sin6_addr, imf,
&inm, 0);
if (error) {
IN6_MULTI_UNLOCK();
goto out_im6o_free;
}
imo->im6o_membership[idx] = inm;
} else {
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state",
__func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall",
__func__);
}
}
IN6_MULTI_UNLOCK();
INP_WLOCK_ASSERT(inp);
if (error) {
im6f_rollback(imf);
if (is_new)
im6f_purge(imf);
else
im6f_reap(imf);
} else {
im6f_commit(imf);
}
out_im6o_free:
if (error && is_new) {
imo->im6o_membership[idx] = NULL;
--imo->im6o_num_memberships;
}
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
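The two join flavours handled above can be driven from userland roughly as follows. This is a sketch only: the helper names and use of if_nametoindex() are illustrative, and error checking of inet_pton() is omitted.

#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
join_any_source(int s, const char *group, const char *ifname)
{
	struct ipv6_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	(void)inet_pton(AF_INET6, group, &mreq.ipv6mr_multiaddr);
	/* ipv6mr_interface == 0 lets the kernel pick the ifp itself. */
	mreq.ipv6mr_interface = if_nametoindex(ifname);
	return (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
	    &mreq, sizeof(mreq)));
}

static int
join_source(int s, const char *group, const char *source, const char *ifname)
{
	struct group_source_req gsr;
	struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gsr.gsr_group;
	struct sockaddr_in6 *src = (struct sockaddr_in6 *)&gsr.gsr_source;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = if_nametoindex(ifname);
	grp->sin6_family = AF_INET6;
	grp->sin6_len = sizeof(*grp);
	(void)inet_pton(AF_INET6, group, &grp->sin6_addr);
	src->sin6_family = AF_INET6;
	src->sin6_len = sizeof(*src);
	(void)inet_pton(AF_INET6, source, &src->sin6_addr);
	return (setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP,
	    &gsr, sizeof(gsr)));
}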
/*
* Leave an IPv6 multicast group on an inpcb, possibly with a source.
*/
static int
in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
{
struct ipv6_mreq mreq;
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_msource *ims;
struct in6_multi *inm;
uint32_t ifindex;
size_t idx;
int error, is_final;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
ifp = NULL;
ifindex = 0;
error = 0;
is_final = 1;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
/*
* Chew everything passed in up into a struct group_source_req
* as that is easier to process.
* Note: Any embedded scope ID in the multicast group passed
* in by userland is ignored; the interface index is the recommended
* mechanism to specify an interface (see below).
*/
switch (sopt->sopt_name) {
case IPV6_LEAVE_GROUP:
error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
sizeof(struct ipv6_mreq));
if (error)
return (error);
gsa->sin6.sin6_family = AF_INET6;
gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
ifindex = mreq.ipv6mr_interface;
break;
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin6.sin6_family != AF_INET6 ||
gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
if (ssa->sin6.sin6_family != AF_INET6 ||
ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
return (EINVAL);
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&ssa->sin6.sin6_addr);
}
gsa->sin6.sin6_port = 0;
gsa->sin6.sin6_scope_id = 0;
ifindex = gsr.gsr_interface;
break;
default:
CTR2(KTR_MLD, "%s: unknown sopt_name %d",
__func__, sopt->sopt_name);
return (EOPNOTSUPP);
break;
}
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
/*
* Validate interface index if provided. If no interface index
* was provided separately, attempt to look the membership up
* from the default scope as a last resort to disambiguate
* the membership we are being asked to leave.
* XXX SCOPE6 lock potentially taken here.
*/
if (ifindex != 0) {
if (V_if_index < ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
} else {
error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
if (error)
return (EADDRNOTAVAIL);
/*
* Some badly behaved applications don't pass an ifindex
* or a scope ID, which is an API violation. In this case,
* perform a lookup as per a v6 join.
*
* XXX For now, stomp on zone ID for the corner case.
* This is not the 'KAME way', but we need to see the ifp
* directly until such time as this implementation is
* refactored, assuming the scope IDs are the way to go.
*/
ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
if (ifindex == 0) {
CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
"ifp for group %s.", __func__,
ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
} else {
ifp = ifnet_byindex(ifindex);
}
if (ifp == NULL)
return (EADDRNOTAVAIL);
}
CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
/*
* Find the membership in the membership array.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
if (ssa->ss.ss_family != AF_UNSPEC)
is_final = 0;
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
/*
* If we were instructed only to leave a given source, do so.
* MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
*/
if (is_final) {
im6f_leave(imf);
} else {
if (imf->im6f_st[0] == MCAST_EXCLUDE) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
ims = im6o_match_source(imo, idx, &ssa->sa);
if (ims == NULL) {
CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
"not ");
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
CTR2(KTR_MLD, "%s: %s source", __func__, "block");
error = im6f_prune(imf, &ssa->sin6);
if (error) {
CTR1(KTR_MLD, "%s: merge imf state failed",
__func__);
goto out_in6p_locked;
}
}
/*
* Begin state merge transaction at MLD layer.
*/
IN6_MULTI_LOCK();
if (is_final) {
/*
* Give up the multicast address record to which
* the membership points.
*/
(void)in6_mc_leave_locked(inm, imf);
} else {
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state",
__func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall",
__func__);
}
}
IN6_MULTI_UNLOCK();
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
if (is_final) {
/* Remove the gap in the membership array. */
for (++idx; idx < imo->im6o_num_memberships; ++idx) {
imo->im6o_membership[idx-1] = imo->im6o_membership[idx];
imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx];
}
imo->im6o_num_memberships--;
}
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
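A corresponding sketch for the delta-based leave of a single source (MCAST_LEAVE_SOURCE_GROUP), which the handler above only accepts for inclusive memberships; the helper name and parameters are illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static int
leave_source(int s, const struct sockaddr_in6 *group,
    const struct sockaddr_in6 *source, uint32_t ifindex)
{
	struct group_source_req gsr;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = ifindex;
	memcpy(&gsr.gsr_group, group, sizeof(*group));
	memcpy(&gsr.gsr_source, source, sizeof(*source));
	return (setsockopt(s, IPPROTO_IPV6, MCAST_LEAVE_SOURCE_GROUP,
	    &gsr, sizeof(gsr)));
}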
/*
* Select the interface for transmitting IPv6 multicast datagrams.
*
* An interface index (u_int) is passed to this socket option.
* An interface index of 0 is used to remove a previous selection.
* When no interface is selected, one is chosen for every send.
*/
static int
in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
{
struct ifnet *ifp;
struct ip6_moptions *imo;
u_int ifindex;
int error;
if (sopt->sopt_valsize != sizeof(u_int))
return (EINVAL);
error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
if (error)
return (error);
if (V_if_index < ifindex)
return (EINVAL);
if (ifindex == 0)
ifp = NULL;
else {
ifp = ifnet_byindex(ifindex);
if (ifp == NULL)
return (EINVAL);
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EADDRNOTAVAIL);
}
imo = in6p_findmoptions(inp);
imo->im6o_multicast_ifp = ifp;
INP_WUNLOCK(inp);
return (0);
}
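A minimal sketch of driving this option from userland; as the handler above shows, passing index 0 clears the selection. The helper name is hypothetical.

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>

static int
set_mcast_if(int s, const char *ifname)
{
	u_int ifindex;

	/* NULL/unknown name resolves to 0, which clears the selection. */
	ifindex = (ifname != NULL) ? if_nametoindex(ifname) : 0;
	return (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
	    &ifindex, sizeof(ifindex)));
}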
/*
* Atomically set source filters on a socket for an IPv6 multicast group.
*
* SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
*/
static int
in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct in6_mfilter *imf;
struct ip6_moptions *imo;
struct in6_multi *inm;
size_t idx;
int error;
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
return (ENOBUFS);
if (msfr.msfr_fmode != MCAST_EXCLUDE &&
msfr.msfr_fmode != MCAST_INCLUDE)
return (EINVAL);
if (msfr.msfr_group.ss_family != AF_INET6 ||
msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
return (EINVAL);
gsa = (sockunion_t *)&msfr.msfr_group;
if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
return (EINVAL);
gsa->sin6.sin6_port = 0; /* ignore port */
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
/*
* Take the INP write lock.
* Check if this socket is a member of this group.
*/
imo = in6p_findmoptions(inp);
idx = im6o_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->im6o_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_in6p_locked;
}
inm = imo->im6o_membership[idx];
imf = &imo->im6o_mfilters[idx];
/*
* Begin state merge transaction at socket layer.
*/
INP_WLOCK_ASSERT(inp);
imf->im6f_st[1] = msfr.msfr_fmode;
/*
* Apply any new source filters, if present.
* Make a copy of the user-space source vector so
* that we may copy them with a single copyin. This
* allows us to deal with page faults up-front.
*/
if (msfr.msfr_nsrcs > 0) {
struct in6_msource *lims;
struct sockaddr_in6 *psin;
struct sockaddr_storage *kss, *pkss;
int i;
INP_WUNLOCK(inp);
CTR2(KTR_MLD, "%s: loading %lu source list entries",
__func__, (unsigned long)msfr.msfr_nsrcs);
kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_WAITOK);
error = copyin(msfr.msfr_srcs, kss,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
if (error) {
free(kss, M_TEMP);
return (error);
}
INP_WLOCK(inp);
/*
* Mark all source filters as UNDEFINED at t1.
* Restore new group filter mode, as im6f_leave()
* will set it to INCLUDE.
*/
im6f_leave(imf);
imf->im6f_st[1] = msfr.msfr_fmode;
/*
* Update socket layer filters at t1, lazy-allocating
* new entries. This saves a bunch of memory at the
* cost of one RB_FIND() per source entry; duplicate
* entries in the msfr_nsrcs vector are ignored.
* If we encounter an error, rollback transaction.
*
* XXX This too could be replaced with a set-symmetric
* difference like loop to avoid walking from root
* every time, as the key space is common.
*/
for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
psin = (struct sockaddr_in6 *)pkss;
if (psin->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
break;
}
if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
break;
}
if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
error = EINVAL;
break;
}
/*
* TODO: Validate embedded scope ID in source
* list entry against passed-in ifp, if and only
* if source list filter entry is iface or node local.
*/
in6_clearscope(&psin->sin6_addr);
error = im6f_get_source(imf, psin, &lims);
if (error)
break;
lims->im6sl_st[1] = imf->im6f_st[1];
}
free(kss, M_TEMP);
}
if (error)
goto out_im6f_rollback;
INP_WLOCK_ASSERT(inp);
IN6_MULTI_LOCK();
/*
* Begin state merge transaction at MLD layer.
*/
CTR1(KTR_MLD, "%s: merge inm state", __func__);
error = in6m_merge(inm, imf);
if (error)
CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
else {
CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
error = mld_change_state(inm, 0);
if (error)
CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
}
IN6_MULTI_UNLOCK();
out_im6f_rollback:
if (error)
im6f_rollback(imf);
else
im6f_commit(imf);
im6f_reap(imf);
out_in6p_locked:
INP_WUNLOCK(inp);
return (error);
}
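A hedged userland sketch of the full-state API referenced in the join path above (setsourcefilter()), which is expected to feed this IPV6_MSFILTER handler. It assumes the RFC 3678 signature; the helper name, the fixed two-source count, and the omitted error checks are illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
include_two_sources(int s, uint32_t ifindex, const char *group,
    const char *src1, const char *src2)
{
	struct sockaddr_in6 grp;
	struct sockaddr_storage srcs[2];
	const char *names[2] = { src1, src2 };
	int i;

	memset(&grp, 0, sizeof(grp));
	grp.sin6_family = AF_INET6;
	grp.sin6_len = sizeof(grp);
	(void)inet_pton(AF_INET6, group, &grp.sin6_addr);

	memset(srcs, 0, sizeof(srcs));
	for (i = 0; i < 2; i++) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&srcs[i];

		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		(void)inet_pton(AF_INET6, names[i], &sin6->sin6_addr);
	}
	/* Replace the whole filter atomically: INCLUDE {src1, src2}. */
	return (setsourcefilter(s, ifindex, (struct sockaddr *)&grp,
	    sizeof(grp), MCAST_INCLUDE, 2, srcs));
}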
/*
* Set the IP multicast options in response to user setsockopt().
*
* Many of the socket options handled in this function duplicate the
* functionality of socket options in the regular unicast API. However,
* it is not possible to merge the duplicate code, because the idempotence
* of the IPv6 multicast part of the BSD Sockets API must be preserved;
* the effects of these options must be treated as separate and distinct.
*
* SMPng: XXX: Unlocked read of inp_socket believed OK.
*/
int
ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
{
struct ip6_moptions *im6o;
int error;
error = 0;
/*
* If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
return (EOPNOTSUPP);
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
error = in6p_set_multicast_if(inp, sopt);
break;
case IPV6_MULTICAST_HOPS: {
int hlim;
if (sopt->sopt_valsize != sizeof(int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
if (error)
break;
if (hlim < -1 || hlim > 255) {
error = EINVAL;
break;
} else if (hlim == -1) {
hlim = V_ip6_defmcasthlim;
}
im6o = in6p_findmoptions(inp);
im6o->im6o_multicast_hlim = hlim;
INP_WUNLOCK(inp);
break;
}
case IPV6_MULTICAST_LOOP: {
u_int loop;
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
if (sopt->sopt_valsize != sizeof(u_int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
if (error)
break;
if (loop > 1) {
error = EINVAL;
break;
}
im6o = in6p_findmoptions(inp);
im6o->im6o_multicast_loop = loop;
INP_WUNLOCK(inp);
break;
}
case IPV6_JOIN_GROUP:
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
error = in6p_join_group(inp, sopt);
break;
case IPV6_LEAVE_GROUP:
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
error = in6p_leave_group(inp, sopt);
break;
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = in6p_block_unblock_source(inp, sopt);
break;
case IPV6_MSFILTER:
error = in6p_set_source_filters(inp, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
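A short illustrative sketch of setting the two scalar options handled above; the accepted value ranges follow the checks in the switch, and the helper name is hypothetical.

#include <sys/socket.h>
#include <netinet/in.h>

static int
tune_mcast(int s, int hlim, u_int loop)
{
	/* hlim: -1 restores the system default, otherwise 0..255. */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
	    &hlim, sizeof(hlim)) != 0)
		return (-1);
	/* loop: 0 or 1; anything else is rejected with EINVAL. */
	return (setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
	    &loop, sizeof(loop)));
}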
/*
* Expose MLD's multicast filter mode and source list(s) to userland,
* keyed by (ifindex, group).
* The filter mode is written out as a uint32_t, followed by
* 0..n of struct in6_addr.
* For use by ifmcstat(8).
* SMPng: NOTE: unlocked read of ifindex space.
*/
static int
sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
{
struct in6_addr mcaddr;
struct in6_addr src;
struct ifnet *ifp;
struct ifmultiaddr *ifma;
struct in6_multi *inm;
struct ip6_msource *ims;
int *name;
int retval;
u_int namelen;
uint32_t fmode, ifindex;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
name = (int *)arg1;
namelen = arg2;
if (req->newptr != NULL)
return (EPERM);
/* int: ifindex + 4 * 32 bits of IPv6 address */
if (namelen != 5)
return (EINVAL);
ifindex = name[0];
if (ifindex <= 0 || ifindex > V_if_index) {
CTR2(KTR_MLD, "%s: ifindex %u out of range",
__func__, ifindex);
return (ENOENT);
}
memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
CTR2(KTR_MLD, "%s: group %s is not multicast",
__func__, ip6_sprintf(ip6tbuf, &mcaddr));
return (EINVAL);
}
ifp = ifnet_byindex(ifindex);
if (ifp == NULL) {
CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
__func__, ifindex);
return (ENOENT);
}
/*
* Internal MLD lookups require that scope/zone ID is set.
*/
(void)in6_setscope(&mcaddr, ifp, NULL);
retval = sysctl_wire_old_buffer(req,
sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
if (retval)
return (retval);
IN6_MULTI_LOCK();
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
continue;
fmode = inm->in6m_st[1].iss_fmode;
retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
if (retval != 0)
break;
RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
/*
* Only copy-out sources which are in-mode.
*/
if (fmode != im6s_get_mode(inm, ims, 1)) {
CTR1(KTR_MLD, "%s: skip non-in-mode",
__func__);
continue;
}
src = ims->im6s_addr;
retval = SYSCTL_OUT(req, &src,
sizeof(struct in6_addr));
if (retval != 0)
break;
}
}
IF_ADDR_RUNLOCK(ifp);
IN6_MULTI_UNLOCK();
return (retval);
}
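A sketch of decoding the buffer layout this handler emits (a uint32_t filter mode followed by raw struct in6_addr entries). The sysctl OID itself is registered elsewhere and is not assumed here; the helper name is illustrative.

#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
decode_filter_buf(const void *buf, size_t len)
{
	const char *p = buf;
	uint32_t fmode;
	struct in6_addr src;
	char tbuf[INET6_ADDRSTRLEN];

	if (len < sizeof(fmode))
		return;
	memcpy(&fmode, p, sizeof(fmode));
	p += sizeof(fmode);
	len -= sizeof(fmode);
	printf("filter mode %u\n", fmode);
	/* The remainder is 0..n in-mode sources, each a bare in6_addr. */
	while (len >= sizeof(src)) {
		memcpy(&src, p, sizeof(src));
		printf("  %s\n",
		    inet_ntop(AF_INET6, &src, tbuf, sizeof(tbuf)));
		p += sizeof(src);
		len -= sizeof(src);
	}
}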
#ifdef KTR
static const char *in6m_modestrs[] = { "un", "in", "ex" };
static const char *
in6m_mode_str(const int mode)
{
if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
return (in6m_modestrs[mode]);
return ("??");
}
static const char *in6m_statestrs[] = {
"not-member",
"silent",
"idle",
"lazy",
"sleeping",
"awakening",
"query-pending",
"sg-query-pending",
"leaving"
};
static const char *
in6m_state_str(const int state)
{
if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
return (in6m_statestrs[state]);
return ("??");
}
/*
* Dump an in6_multi structure to the console.
*/
void
in6m_print(const struct in6_multi *inm)
{
int t;
char ip6tbuf[INET6_ADDRSTRLEN];
if ((ktr_mask & KTR_MLD) == 0)
return;
printf("%s: --- begin in6m %p ---\n", __func__, inm);
printf("addr %s ifp %p(%s) ifma %p\n",
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp,
if_name(inm->in6m_ifp),
inm->in6m_ifma);
printf("timer %u state %s refcount %u scq.len %u\n",
inm->in6m_timer,
in6m_state_str(inm->in6m_state),
inm->in6m_refcount,
mbufq_len(&inm->in6m_scq));
printf("mli %p nsrc %lu sctimer %u scrv %u\n",
inm->in6m_mli,
inm->in6m_nsrc,
inm->in6m_sctimer,
inm->in6m_scrv);
for (t = 0; t < 2; t++) {
printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
in6m_mode_str(inm->in6m_st[t].iss_fmode),
inm->in6m_st[t].iss_asm,
inm->in6m_st[t].iss_ex,
inm->in6m_st[t].iss_in,
inm->in6m_st[t].iss_rec);
}
printf("%s: --- end in6m %p ---\n", __func__, inm);
}
#else /* !KTR */
void
in6m_print(const struct in6_multi *inm)
{
}
#endif /* KTR */
Index: head/sys/netinet6/in6_src.c
===================================================================
--- head/sys/netinet6/in6_src.c (revision 327172)
+++ head/sys/netinet6/in6_src.c (revision 327173)
@@ -1,1246 +1,1243 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/rmlock.h>
#include <sys/sx.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/if_llatbl.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
static struct mtx addrsel_lock;
#define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
#define ADDRSEL_LOCK() mtx_lock(&addrsel_lock)
#define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock)
#define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED)
static struct sx addrsel_sxlock;
#define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock")
#define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock)
#define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock)
#define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock)
#define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock)
#define ADDR_LABEL_NOTAPP (-1)
static VNET_DEFINE(struct in6_addrpolicy, defaultaddrpolicy);
#define V_defaultaddrpolicy VNET(defaultaddrpolicy)
VNET_DEFINE(int, ip6_prefer_tempaddr) = 0;
static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
struct rtentry **, int, u_int);
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct ifnet **,
struct ifnet *, u_int);
static int in6_selectsrc(uint32_t, struct sockaddr_in6 *,
struct ip6_pktopts *, struct inpcb *, struct ucred *,
struct ifnet **, struct in6_addr *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
void *);
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#define REPLACE(r) do {\
IP6STAT_INC(ip6s_sources_rule[(r)]); \
- rule = (r); \
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto replace; \
} while(0)
#define NEXT(r) do {\
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto next; /* XXX: we can't use 'continue' here */ \
} while(0)
#define BREAK(r) do { \
IP6STAT_INC(ip6s_sources_rule[(r)]); \
- rule = (r); \
goto out; /* XXX: we can't use 'break' here */ \
} while(0)
static int
in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred,
struct ifnet **ifpp, struct in6_addr *srcp)
{
struct rm_priotracker in6_ifa_tracker;
struct in6_addr dst, tmp;
struct ifnet *ifp = NULL, *oifp = NULL;
struct in6_ifaddr *ia = NULL, *ia_best = NULL;
struct in6_pktinfo *pi = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
u_int32_t odstzone;
int prefer_tempaddr;
- int error, rule;
+ int error;
struct ip6_moptions *mopts;
KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp) {
/*
* Save a possibly passed in ifp for in6_selectsrc. Only
* neighbor discovery code should use this feature, where
* we may know the interface but not the FIB number holding
* the connected subnet in case someone deleted it from the
* default FIB and we need to check the interface.
*/
if (*ifpp != NULL)
oifp = *ifpp;
*ifpp = NULL;
}
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
mopts = inp->in6p_moptions;
} else {
mopts = NULL;
}
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
fibnum))
!= 0)
return (error);
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
tmp = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&tmp, ifp, &odstzone);
if (error)
return (error);
}
if (cred != NULL && (error = prison_local_ip6(cred,
&tmp, (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0)
return (error);
/*
* If IPV6_BINDANY socket option is set, we allow to specify
* non local addresses as source address in IPV6_PKTINFO
* ancillary data.
*/
if ((inp->inp_flags & INP_BINDANY) == 0) {
ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */);
if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST |
IN6_IFF_NOTREADY))) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (EADDRNOTAVAIL);
}
bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp));
ifa_free(&ia->ia_ifa);
} else
bcopy(&tmp, srcp, sizeof(*srcp));
pi->ipi6_addr = tmp; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
return (0);
}
/*
* Otherwise, if the socket has already bound the source, just use it.
*/
if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (cred != NULL &&
(error = prison_local_ip6(cred, &inp->in6p_laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
return (error);
bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp));
return (0);
}
/*
* Bypass source address selection and use the primary jail IP
* if requested.
*/
if (cred != NULL && !prison_saddrsel_ip6(cred, srcp))
return (0);
/*
* If the address is not specified, choose the best one based on
* the outgoing interface and the destination address.
*/
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
(inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0)
return (error);
#ifdef DIAGNOSTIC
if (ifp == NULL) /* this should not happen */
panic("in6_selectsrc: NULL ifp");
#endif
error = in6_setscope(&dst, ifp, &odstzone);
if (error)
return (error);
- rule = 0;
IN6_IFADDR_RLOCK(&in6_ifa_tracker);
TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(&dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags &
(IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
continue;
}
if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
/* If jailed only take addresses of the jail into account. */
if (cred != NULL &&
prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0)
continue;
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(0);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(&dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
*/
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
/*
* XXX: This is a TODO. We should probably merge the MIP6
* case above.
*/
/* Rule 5: Prefer outgoing interface */
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) {
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
}
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = V_ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer address with better virtual status.
*/
if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa))
REPLACE(9);
if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa))
NEXT(9);
/*
* Rule 10: prefer address with `prefer_source' flag.
*/
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0)
REPLACE(10);
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0)
NEXT(10);
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
&dst));
next:
continue;
out:
break;
}
if ((ia = ia_best) == NULL) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
/*
* At this point at least one of the addresses belonged to the jail
* but it could still be that we want to further restrict it, e.g.
* theoretically IN6_IS_ADDR_LOOPBACK.
* It must not be IN6_IS_ADDR_UNSPECIFIED anymore.
* prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should
* let all others previously selected pass.
* Use tmp to not change ::1 on lo0 to the primary jail address.
*/
tmp = ia->ia_addr.sin6_addr;
if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
if (ifpp)
*ifpp = ifp;
bcopy(&tmp, srcp, sizeof(*srcp));
if (ia->ia_ifp == ifp)
IP6STAT_INC(ip6s_sources_sameif[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherif[best_scope]);
if (dst_scope == best_scope)
IP6STAT_INC(ip6s_sources_samescope[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherscope[best_scope]);
if (IFA6_IS_DEPRECATED(ia))
IP6STAT_INC(ip6s_sources_deprecated[best_scope]);
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
return (0);
}
/*
* Select source address based on @inp, @dstsock and @opts.
* Stores selected address to @srcp. If @scope_ambiguous is set,
* embed scope from selected outgoing interface. If @hlim pointer
* is provided, stores calculated hop limit there.
* Returns 0 on success.
*/
int
in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct inpcb *inp, struct ucred *cred, int scope_ambiguous,
struct in6_addr *srcp, int *hlim)
{
struct ifnet *retifp;
uint32_t fibnum;
int error;
fibnum = inp->inp_inc.inc_fibnum;
retifp = NULL;
error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp);
if (error != 0)
return (error);
if (hlim != NULL)
*hlim = in6_selecthlim(inp, retifp);
if (retifp == NULL || scope_ambiguous == 0)
return (0);
/*
* Applications should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it's required), if we can determine the outgoing
* interface, determine the zone ID based on that interface.
*/
error = in6_setscope(&dstsock->sin6_addr, retifp, NULL);
return (error);
}
/*
* Select source address based on @fibnum, @dst and @scopeid.
* Stores selected address to @srcp.
* Returns 0 on success.
*
* Used by non-socket based consumers (ND code mostly)
*/
int
in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp,
int *hlim)
{
struct ifnet *retifp;
struct sockaddr_in6 dst_sa;
int error;
retifp = ifp;
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = *dst;
dst_sa.sin6_scope_id = scopeid;
sa6_embedscope(&dst_sa, 0);
error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp);
if (hlim != NULL)
*hlim = in6_selecthlim(NULL, retifp);
return (error);
}
/*
* clone - meaningful only for bsdi and freebsd
*/
static int
selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum)
{
int error = 0;
struct ifnet *ifp = NULL;
struct rtentry *rt = NULL;
struct sockaddr_in6 *sin6_next;
struct in6_pktinfo *pi = NULL;
struct in6_addr *dst = &dstsock->sin6_addr;
uint32_t zoneid;
#if 0
char ip6buf[INET6_ADDRSTRLEN];
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
printf("in6_selectroute: strange destination %s\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr));
} else {
printf("in6_selectroute: destination = %s%%%d\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
ifp = ifnet_byindex(pi->ipi6_ifindex);
if (ifp != NULL &&
(norouteok || retrt == NULL ||
IN6_IS_ADDR_MULTICAST(dst))) {
/*
* we do not have to check or get the route for
* multicast.
*/
goto done;
} else
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) &&
mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
goto done; /* we do not need a route for multicast. */
}
/*
* If the destination address is an LLA or a link- or node-local multicast
* address, use its embedded scope zone ID to determine the outgoing interface.
*/
if (IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
IN6_IS_ADDR_MC_NODELOCAL(dst)) {
zoneid = ntohs(in6_getscope(dst));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
}
}
getroute:
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route_in6 *ron;
sin6_next = satosin6(opts->ip6po_nexthop);
if (IN6_IS_ADDR_LINKLOCAL(&sin6_next->sin6_addr)) {
/*
* Next hop is LLA, thus it should be neighbor.
* Determine outgoing interface by zone index.
*/
zoneid = ntohs(in6_getscope(&sin6_next->sin6_addr));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
}
}
ron = &opts->ip6po_nextroute;
/* Use a cached route if it exists and is valid. */
if (ron->ro_rt != NULL && (
(ron->ro_rt->rt_flags & RTF_UP) == 0 ||
ron->ro_dst.sin6_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr,
&sin6_next->sin6_addr)))
RO_RTFREE(ron);
if (ron->ro_rt == NULL) {
ron->ro_dst = *sin6_next;
in6_rtalloc(ron, fibnum); /* multi path case? */
}
/*
* The node identified by that address must be a
* neighbor of the sending host.
*/
if (ron->ro_rt == NULL ||
(ron->ro_rt->rt_flags & RTF_GATEWAY) != 0)
error = EHOSTUNREACH;
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*/
if (ro) {
if (ro->ro_rt &&
(!(ro->ro_rt->rt_flags & RTF_UP) ||
((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
dst))) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)NULL;
}
if (ro->ro_rt == (struct rtentry *)NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
#ifdef RADIX_MPATH
rtalloc_mpath_fib((struct route *)ro,
ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum);
#else
ro->ro_rt = in6_rtalloc1((struct sockaddr *)
&ro->ro_dst, 0, 0UL, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
#endif
}
/*
* do not care about the result if we have the nexthop
* explicitly specified.
*/
if (opts && opts->ip6po_nexthop)
goto done;
if (ro->ro_rt) {
ifp = ro->ro_rt->rt_ifp;
if (ifp == NULL) { /* can this really happen? */
RTFREE(ro->ro_rt);
ro->ro_rt = NULL;
}
}
if (ro->ro_rt == NULL)
error = EHOSTUNREACH;
rt = ro->ro_rt;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (ifp && opts && opts->ip6po_pktinfo &&
opts->ip6po_pktinfo->ipi6_ifindex) {
if (!(ifp->if_flags & IFF_LOOPBACK) &&
ifp->if_index !=
opts->ip6po_pktinfo->ipi6_ifindex) {
error = EHOSTUNREACH;
goto done;
}
}
}
done:
if (ifp == NULL && rt == NULL) {
/*
* This can happen if the caller did not pass a cached route
* or any other hints. We treat this case as an error.
*/
error = EHOSTUNREACH;
}
if (error == EHOSTUNREACH)
IP6STAT_INC(ip6s_noroute);
if (retifp != NULL) {
*retifp = ifp;
/*
* Adjust the "outgoing" interface. If we're going to loop
* the packet back to ourselves, the ifp would be the loopback
* interface. However, we'd rather know the interface associated
* to the destination address (which should probably be one of
* our own addresses.)
*/
if (rt) {
if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
(rt->rt_gateway->sa_family == AF_LINK))
*retifp =
ifnet_byindex(((struct sockaddr_dl *)
rt->rt_gateway)->sdl_index);
}
}
if (retrt != NULL)
*retrt = rt; /* rt may be NULL */
return (error);
}
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct ifnet **retifp,
struct ifnet *oifp, u_int fibnum)
{
int error;
struct route_in6 sro;
struct rtentry *rt = NULL;
int rt_flags;
KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
bzero(&sro, sizeof(sro));
rt_flags = 0;
error = selectroute(dstsock, opts, mopts, &sro, retifp, &rt, 1, fibnum);
if (rt)
rt_flags = rt->rt_flags;
if (rt && rt == sro.ro_rt)
RTFREE(rt);
if (error != 0) {
/* Help ND. See oifp comment in in6_selectsrc(). */
if (oifp != NULL && fibnum == RT_DEFAULT_FIB) {
*retifp = oifp;
error = 0;
}
return (error);
}
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it would still be confusing.
* We thus reject the case here.
*/
if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) {
error = (rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
return (error);
}
return (0);
}
/*
* Public wrapper function to selectroute().
*
* XXX-BZ in6_selectroute() should and will grow the FIB argument. The
* in6_selectroute_fib() function is only there for backward compat on stable.
*/
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retrt, 0, RT_DEFAULT_FIB));
}
#ifndef BURN_BRIDGES
int
in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, u_int fibnum)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retrt, 0, fibnum));
}
#endif
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp)
{
if (in6p && in6p->in6p_hops >= 0)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
struct nhop6_basic nh6;
struct in6_addr dst;
uint32_t fibnum, scopeid;
int hlim;
fibnum = in6p->inp_inc.inc_fibnum;
in6_splitscope(&in6p->in6p_faddr, &dst, &scopeid);
if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){
hlim = ND_IFINFO(nh6.nh_ifp)->chlim;
return (hlim);
}
}
return (V_ip6_defhlim);
}
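For step 1 of the precedence above, a minimal userland sketch, assuming the per-socket hop limit is the one set with IPV6_UNICAST_HOPS (with -1 reverting to the defaults of steps 2 and 3); the helper name is illustrative.

#include <sys/socket.h>
#include <netinet/in.h>

static int
set_unicast_hops(int s, int hops)
{
	/* -1 falls back to the interface or system default hop limit. */
	return (setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
	    &hops, sizeof(hops)));
}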
/*
* XXX: this is borrowed from in6_pcbbind(). If possible, we should
* share this function by all *bsd*...
*/
int
in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
{
struct socket *so = inp->inp_socket;
u_int16_t lport = 0;
int error, lookupflags = 0;
#ifdef INVARIANTS
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#endif
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
error = prison_local_ip6(cred, laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0));
if (error)
return(error);
/* XXX: this is redundant when called from in6_pcbbind */
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
inp->inp_flags |= INP_ANONPORT;
error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags);
if (error != 0)
return (error);
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
return (0);
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
if (!IS_DEFAULT_VNET(curvnet))
return;
ADDRSEL_LOCK_INIT();
ADDRSEL_SXLOCK_INIT();
}
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
ADDRSEL_LOCK();
match = match_addrsel_policy(key);
if (match == NULL)
match = &V_defaultaddrpolicy;
else
match->use++;
ADDRSEL_UNLOCK();
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct walkarg {
struct sysctl_req *w_req;
};
static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6);
static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
CTLFLAG_RD, in6_src_sysctl, "");
static int
in6_src_sysctl(SYSCTL_HANDLER_ARGS)
{
struct walkarg w;
if (req->newptr)
return (EPERM);
bzero(&w, sizeof(w));
w.w_req = req;
return (walk_addrsel_policy(dump_addrsel_policyent, &w));
}
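/*
 * A minimal, illustrative userland sketch of consuming this node: the
 * handler dumps the table as an array of struct in6_addrpolicy, so a
 * reader (ip6addrctl(8) does roughly this) can fetch it by name,
 * assuming the node resolves to "net.inet6.ip6.addrctlpolicy":
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sysctl.h>
 *	#include <netinet/in.h>
 *	#include <netinet6/in6_var.h>
 *	#include <stdlib.h>
 *
 *	size_t len;
 *	struct in6_addrpolicy *tab;
 *
 *	sysctlbyname("net.inet6.ip6.addrctlpolicy", NULL, &len, NULL, 0);
 *	tab = malloc(len);
 *	sysctlbyname("net.inet6.ip6.addrctlpolicy", tab, &len, NULL, 0);
 *	// len / sizeof(*tab) entries, each with addr, addrmask, preced,
 *	// label and the kernel's use counter.
 */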
int
in6_src_ioctl(u_long cmd, caddr_t data)
{
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check that the prefix mask is contiguous. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear any trailing garbage in the prefix address. */
IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr);
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: keep compilers happy */
}
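/*
 * A minimal, illustrative userland sketch of driving this ioctl, in the
 * spirit of ip6addrctl(8).  The prefix, precedence and label values are
 * arbitrary examples; the handler above only insists that label is not
 * ADDR_LABEL_NOTAPP and that the mask is contiguous.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet6/in6_var.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	struct in6_addrpolicy ent;
 *	int s = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	memset(&ent, 0, sizeof(ent));
 *	ent.addr.sin6_family = ent.addrmask.sin6_family = AF_INET6;
 *	ent.addr.sin6_len = ent.addrmask.sin6_len = sizeof(ent.addr);
 *	inet_pton(AF_INET6, "2001:db8::", &ent.addr.sin6_addr);
 *	inet_pton(AF_INET6, "ffff:ffff::", &ent.addrmask.sin6_addr); // /32
 *	ent.preced = 45;
 *	ent.label = 14;
 *	if (ioctl(s, SIOCAADDRCTL_POLICY, &ent) == -1)
 *		err(1, "SIOCAADDRCTL_POLICY");
 *	// SIOCDADDRCTL_POLICY with the same addr/addrmask deletes it.
 */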
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX an implementation using a binary tree would be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
static VNET_DEFINE(struct addrsel_policyhead, addrsel_policytab);
#define V_addrsel_policytab VNET(addrsel_policytab)
static void
init_policy_queue(void)
{
TAILQ_INIT(&V_addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *new, *pol;
new = malloc(sizeof(*new), M_IFADDR,
M_WAITOK);
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* duplication check */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(new, M_IFADDR);
return (EEXIST); /* or override it? */
}
}
bzero(new, sizeof(*new));
/* XXX: should validate entry */
new->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* search for the entry in the table */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (ESRCH);
}
TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(pol, M_IFADDR);
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
ADDRSEL_SLOCK();
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0) {
ADDRSEL_SUNLOCK();
return (error);
}
}
ADDRSEL_SUNLOCK();
return (error);
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct walkarg *w = arg;
error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
return (error);
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* no match */
if (m == 0xff) /* shortcut for the typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
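/*
 * A worked example of the walk above, under the assumption that two
 * entries are installed: ::/0 (mask ::) and 2001:db8::/32 (mask
 * ffff:ffff::).  For a key of 2001:db8::1 the ::/0 entry stops at the
 * first zero mask byte with matchlen 0, while the /32 entry matches
 * four 0xff mask bytes for matchlen 32 and becomes bestpol.  The inner
 * shift loop only runs for a partial trailing mask byte; e.g. a /10
 * mask ends in 0xc0, which contributes two bits:
 *
 *	u_char m = 0xc0;
 *	int bits = 0;
 *
 *	while (m >= 0x80) {	// count leading one bits, as above
 *		bits++;
 *		m <<= 1;	// u_char arithmetic drops the shifted-out bit
 *	}
 *	// bits == 2
 */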
Index: head/sys/netinet6/mld6.c
===================================================================
--- head/sys/netinet6/mld6.c (revision 327172)
+++ head/sys/netinet6/mld6.c (revision 327173)
@@ -1,3318 +1,3317 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009 Bruce Simpson.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
*/
/*-
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/ktr.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6.h>
#include <netinet6/mld6_var.h>
#include <security/mac/mac_framework.h>
#ifndef KTR_MLD
#define KTR_MLD KTR_INET6
#endif
static struct mld_ifsoftc *
mli_alloc_locked(struct ifnet *);
static void mli_delete_locked(const struct ifnet *);
static void mld_dispatch_packet(struct mbuf *);
static void mld_dispatch_queue(struct mbufq *, int);
static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *);
static void mld_fasttimo_vnet(void);
static int mld_handle_state_change(struct in6_multi *,
struct mld_ifsoftc *);
static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *,
const int);
#ifdef KTR
static char * mld_rec_type_to_str(const int);
#endif
static void mld_set_version(struct mld_ifsoftc *, const int);
static void mld_slowtimo_vnet(void);
static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
/*const*/ struct mld_hdr *);
static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
/*const*/ struct mld_hdr *);
static void mld_v1_process_group_timer(struct mld_ifsoftc *,
struct in6_multi *);
static void mld_v1_process_querier_timers(struct mld_ifsoftc *);
static int mld_v1_transmit_report(struct in6_multi *, const int);
static void mld_v1_update_group(struct in6_multi *, const int);
static void mld_v2_cancel_link_timers(struct mld_ifsoftc *);
static void mld_v2_dispatch_general_query(struct mld_ifsoftc *);
static struct mbuf *
mld_v2_encap_report(struct ifnet *, struct mbuf *);
static int mld_v2_enqueue_filter_change(struct mbufq *,
struct in6_multi *);
static int mld_v2_enqueue_group_record(struct mbufq *,
struct in6_multi *, const int, const int, const int,
const int);
static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
struct mbuf *, const int, const int);
static int mld_v2_merge_state_changes(struct in6_multi *,
struct mbufq *);
static void mld_v2_process_group_timers(struct mld_ifsoftc *,
struct mbufq *, struct mbufq *,
struct in6_multi *, const int);
static int mld_v2_process_group_query(struct in6_multi *,
struct mld_ifsoftc *mli, int, struct mbuf *, const int);
static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS);
static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS);
/*
* Normative references: RFC 2710, RFC 3590, RFC 3810.
*
* Locking:
* * The MLD subsystem lock ends up being system-wide for the moment,
* but could be per-VIMAGE later on.
* * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
* Any may be taken independently; if any are held at the same
* time, the above lock order must be followed.
* * IN6_MULTI_LOCK covers in_multi.
* * MLD_LOCK covers per-link state and any global variables in this file.
* * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
* per-link state iterators.
*
* XXX LOR PREVENTION
* A special case for IPv6 is the in6_setscope() routine. ip6_output()
* will not accept an ifp; it wants an embedded scope ID, unlike
* ip_output(), which happily takes the ifp given to it. The embedded
* scope ID is only used by MLD to select the outgoing interface.
*
* During interface attach and detach, MLD will take MLD_LOCK *after*
* the IF_AFDATA_LOCK.
* As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call
* it with MLD_LOCK held without triggering an LOR. A netisr with indirect
* dispatch could work around this, but we'd rather not do that, as it
* can introduce other races.
*
* As such, we exploit the fact that the scope ID is just the interface
* index, and embed it in the IPv6 destination address accordingly.
* This is potentially NOT VALID for MLDv1 reports, as they
* are always sent to the multicast group itself; as MLDv2
* reports are always sent to ff02::16, this is not an issue
* when MLDv2 is in use.
*
* This does not however eliminate the LOR when ip6_output() itself
* calls in6_setscope() internally whilst MLD_LOCK is held. This will
* trigger a LOR warning in WITNESS when the ifnet is detached.
*
* The right answer is probably to make IF_AFDATA_LOCK an rwlock, given
* how it's used across the network stack. Here we're simply exploiting
* the fact that MLD runs at a similar layer in the stack to scope6.c.
*
* VIMAGE:
* * Each in6_multi corresponds to an ifp, and each ifp corresponds
* to a vnet in ifp->if_vnet.
*/
static struct mtx mld_mtx;
static MALLOC_DEFINE(M_MLD, "mld", "mld state");
#define MLD_EMBEDSCOPE(pin6, zoneid) \
if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \
IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \
(pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \
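/*
 * A worked example of the embedding above: for the link-scope group
 * ff02::16 on an interface whose ifindex (and thus zone ID) is 3,
 * MLD_EMBEDSCOPE() stores htons(3) in s6_addr16[1], so the address is
 * carried internally as ff02:3::16 until in6_clearscope() strips the
 * zone again before the address goes on the wire or to userland.
 *
 *	struct in6_addr grp;	// assume it holds ff02::16
 *
 *	MLD_EMBEDSCOPE(&grp, 3);
 *	// grp.s6_addr16[1] == htons(3), i.e. ff02:3::16
 */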
/*
* VIMAGE-wide globals.
*/
static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0};
static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head);
static VNET_DEFINE(int, interface_timers_running6);
static VNET_DEFINE(int, state_change_timers_running6);
static VNET_DEFINE(int, current_state_timers_running6);
#define V_mld_gsrdelay VNET(mld_gsrdelay)
#define V_mli_head VNET(mli_head)
#define V_interface_timers_running6 VNET(interface_timers_running6)
#define V_state_change_timers_running6 VNET(state_change_timers_running6)
#define V_current_state_timers_running6 VNET(current_state_timers_running6)
SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */
SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0,
"IPv6 Multicast Listener Discovery");
/*
* Virtualized sysctls.
*/
SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I",
"Rate limit for MLDv2 Group-and-Source queries in seconds");
/*
* Non-virtualized sysctls.
*/
static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
"Per-interface MLDv2 state");
static int mld_v1enable = 1;
SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
&mld_v1enable, 0, "Enable fallback to MLDv1");
static int mld_use_allow = 1;
SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
&mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
/*
* Packed Router Alert option structure declaration.
*/
struct mld_raopt {
struct ip6_hbh hbh;
struct ip6_opt pad;
struct ip6_opt_router ra;
} __packed;
/*
* Router Alert hop-by-hop option header.
*/
static struct mld_raopt mld_ra = {
.hbh = { 0, 0 },
.pad = { .ip6o_type = IP6OPT_PADN, 0 },
.ra = {
.ip6or_type = IP6OPT_ROUTER_ALERT,
.ip6or_len = IP6OPT_RTALERT_LEN - 2,
.ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF),
.ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF)
}
};
static struct ip6_pktopts mld_po;
static __inline void
mld_save_context(struct mbuf *m, struct ifnet *ifp)
{
#ifdef VIMAGE
m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
#endif /* VIMAGE */
m->m_pkthdr.flowid = ifp->if_index;
}
static __inline void
mld_scrub_context(struct mbuf *m)
{
m->m_pkthdr.PH_loc.ptr = NULL;
m->m_pkthdr.flowid = 0;
}
/*
* Restore context from a queued output chain.
* Return saved ifindex.
*
* VIMAGE: The assertion is there to make sure that we
* actually called CURVNET_SET() with what's in the mbuf chain.
*/
static __inline uint32_t
mld_restore_context(struct mbuf *m)
{
#if defined(VIMAGE) && defined(INVARIANTS)
KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr,
("%s: called when curvnet was not restored: cuvnet %p m ptr %p",
__func__, curvnet, m->m_pkthdr.PH_loc.ptr));
#endif
return (m->m_pkthdr.flowid);
}
/*
* Retrieve or set threshold between group-source queries in seconds.
*
* VIMAGE: Assume curvnet set by caller.
* SMPng: NOTE: Serialized by MLD lock.
*/
static int
sysctl_mld_gsr(SYSCTL_HANDLER_ARGS)
{
int error;
int i;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error)
return (error);
MLD_LOCK();
i = V_mld_gsrdelay.tv_sec;
error = sysctl_handle_int(oidp, &i, 0, req);
if (error || !req->newptr)
goto out_locked;
if (i < -1 || i >= 60) {
error = EINVAL;
goto out_locked;
}
CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d",
V_mld_gsrdelay.tv_sec, i);
V_mld_gsrdelay.tv_sec = i;
out_locked:
MLD_UNLOCK();
return (error);
}
/*
* Expose struct mld_ifsoftc to userland, keyed by ifindex.
* For use by ifmcstat(8).
*
* SMPng: NOTE: Does an unlocked ifindex space read.
* VIMAGE: Assume curvnet set by caller. The node handler itself
* is not directly virtualized.
*/
static int
sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS)
{
int *name;
int error;
u_int namelen;
struct ifnet *ifp;
struct mld_ifsoftc *mli;
name = (int *)arg1;
namelen = arg2;
if (req->newptr != NULL)
return (EPERM);
if (namelen != 1)
return (EINVAL);
error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo));
if (error)
return (error);
IN6_MULTI_LOCK();
MLD_LOCK();
if (name[0] <= 0 || name[0] > V_if_index) {
error = ENOENT;
goto out_locked;
}
error = ENOENT;
ifp = ifnet_byindex(name[0]);
if (ifp == NULL)
goto out_locked;
LIST_FOREACH(mli, &V_mli_head, mli_link) {
if (ifp == mli->mli_ifp) {
struct mld_ifinfo info;
info.mli_version = mli->mli_version;
info.mli_v1_timer = mli->mli_v1_timer;
info.mli_v2_timer = mli->mli_v2_timer;
info.mli_flags = mli->mli_flags;
info.mli_rv = mli->mli_rv;
info.mli_qi = mli->mli_qi;
info.mli_qri = mli->mli_qri;
info.mli_uri = mli->mli_uri;
error = SYSCTL_OUT(req, &info, sizeof(info));
break;
}
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (error);
}
/*
* Dispatch an entire queue of pending packet chains.
* VIMAGE: Assumes the vnet pointer has been set.
*/
static void
mld_dispatch_queue(struct mbufq *mq, int limit)
{
struct mbuf *m;
while ((m = mbufq_dequeue(mq)) != NULL) {
CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m);
mld_dispatch_packet(m);
if (--limit == 0)
break;
}
}
/*
* Filter outgoing MLD report state by group.
*
* Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1)
* and node-local addresses. However, kernel and socket consumers
* always embed the KAME scope ID in the address provided, so strip it
* when performing comparison.
* Note: This is not the same as the *multicast* scope.
*
* Return zero if the given group is one for which MLD reports
* should be suppressed, or non-zero if reports should be issued.
*/
static __inline int
mld_is_addr_reported(const struct in6_addr *addr)
{
KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__));
if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL)
return (0);
if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) {
struct in6_addr tmp = *addr;
in6_clearscope(&tmp);
if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes))
return (0);
}
return (1);
}
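/*
 * Worked examples of the rule above: ff01::1 has node-local multicast
 * scope, so it is never reported; ff02::1 (all-nodes) is suppressed
 * once any embedded zone is stripped; a solicited-node group such as
 * ff02::1:ff00:1 is link-local but not all-nodes and is therefore
 * reported, as is any wider-scope group such as ff0e::101.
 */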
/*
* Attach MLD when PF_INET6 is attached to an interface.
*
* SMPng: Normally called with IF_AFDATA_LOCK held.
*/
struct mld_ifsoftc *
mld_domifattach(struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK();
mli = mli_alloc_locked(ifp);
if (!(ifp->if_flags & IFF_MULTICAST))
mli->mli_flags |= MLIF_SILENT;
if (mld_use_allow)
mli->mli_flags |= MLIF_USEALLOW;
MLD_UNLOCK();
return (mli);
}
/*
* VIMAGE: assume curvnet set by caller.
*/
static struct mld_ifsoftc *
mli_alloc_locked(/*const*/ struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
MLD_LOCK_ASSERT();
mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO);
if (mli == NULL)
goto out;
mli->mli_ifp = ifp;
mli->mli_version = MLD_VERSION_2;
mli->mli_flags = 0;
mli->mli_rv = MLD_RV_INIT;
mli->mli_qi = MLD_QI_INIT;
mli->mli_qri = MLD_QRI_INIT;
mli->mli_uri = MLD_URI_INIT;
SLIST_INIT(&mli->mli_relinmhead);
mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
LIST_INSERT_HEAD(&V_mli_head, mli, mli_link);
CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)",
ifp, if_name(ifp));
out:
return (mli);
}
/*
* Hook for ifdetach.
*
* NOTE: Some finalization tasks need to run before the protocol domain
* is detached, but also before the link layer does its cleanup.
* Run before link-layer cleanup; clean up groups, but do not free MLD state.
*
* SMPng: Caller must hold IN6_MULTI_LOCK().
* Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator.
* XXX This routine is also bitten by unlocked ifma_protospec access.
*/
void
mld_ifdetach(struct ifnet *ifp)
{
struct mld_ifsoftc *mli;
struct ifmultiaddr *ifma;
struct in6_multi *inm, *tinm;
CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp,
if_name(ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK();
mli = MLD_IFINFO(ifp);
if (mli->mli_version == MLD_VERSION_2) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
if (inm->in6m_state == MLD_LEAVING_MEMBER) {
SLIST_INSERT_HEAD(&mli->mli_relinmhead,
inm, in6m_nrele);
}
in6m_clear_recorded(inm);
}
IF_ADDR_RUNLOCK(ifp);
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele,
tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
in6m_release_locked(inm);
}
}
MLD_UNLOCK();
}
/*
* Hook for domifdetach.
* Runs after link-layer cleanup; free MLD state.
*
* SMPng: Normally called with IF_AFDATA_LOCK held.
*/
void
mld_domifdetach(struct ifnet *ifp)
{
CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK();
mli_delete_locked(ifp);
MLD_UNLOCK();
}
static void
mli_delete_locked(const struct ifnet *ifp)
{
struct mld_ifsoftc *mli, *tmli;
CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)",
__func__, ifp, if_name(ifp));
MLD_LOCK_ASSERT();
LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) {
if (mli->mli_ifp == ifp) {
/*
* Free deferred General Query responses.
*/
mbufq_drain(&mli->mli_gq);
LIST_REMOVE(mli, mli_link);
KASSERT(SLIST_EMPTY(&mli->mli_relinmhead),
("%s: there are dangling in_multi references",
__func__));
free(mli, M_MLD);
return;
}
}
}
/*
* Process a received MLDv1 general or address-specific query.
* Assumes that the query header has been pulled up to sizeof(mld_hdr).
*
* NOTE: Can't be fully const correct as we temporarily embed scope ID in
* mld_addr. This is OK as we own the mbuf chain.
*/
static int
mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
/*const*/ struct mld_hdr *mld)
{
struct ifmultiaddr *ifma;
struct mld_ifsoftc *mli;
struct in6_multi *inm;
int is_general_query;
uint16_t timer;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
is_general_query = 0;
if (!mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
return (0);
}
/*
* RFC3810 Section 6.2: MLD queries must originate from
* a router's link-local address.
*/
if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (0);
}
/*
* Do address field validation upfront before we accept
* the query.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
/*
* MLDv1 General Query.
* If this was not sent to the all-nodes group, ignore it.
*/
struct in6_addr dst;
dst = ip6->ip6_dst;
in6_clearscope(&dst);
if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes))
return (EINVAL);
is_general_query = 1;
} else {
/*
* Embed scope ID of receiving interface in MLD query for
* lookup whilst we don't hold other locks.
*/
in6_setscope(&mld->mld_addr, ifp, NULL);
}
IN6_MULTI_LOCK();
MLD_LOCK();
/*
* Switch to MLDv1 host compatibility mode.
*/
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
mld_set_version(mli, MLD_VERSION_1);
timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE;
if (timer == 0)
timer = 1;
IF_ADDR_RLOCK(ifp);
if (is_general_query) {
/*
* For each reporting group joined on this
* interface, kick the report timer.
*/
CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)",
ifp, if_name(ifp));
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
mld_v1_update_group(inm, timer);
}
} else {
/*
* MLDv1 Group-Specific Query.
* If this is a group-specific MLDv1 query, we need only
* look up the single group to process it.
*/
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm != NULL) {
CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
mld_v1_update_group(inm, timer);
}
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
}
IF_ADDR_RUNLOCK(ifp);
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (0);
}
/*
* Update the report timer on a group in response to an MLDv1 query.
*
* If we are becoming the reporting member for this group, start the timer.
* If we already are the reporting member for this group, and timer is
* below the threshold, reset it.
*
* We may be updating the group for the first time since we switched
* to MLDv2. If we are, then we must clear any recorded source lists,
* and transition to REPORTING state; the group timer is overloaded
* for group and group-source query responses.
*
* Unlike MLDv2, the delay per group should be jittered
* to avoid bursts of MLDv1 reports.
*/
static void
mld_v1_update_group(struct in6_multi *inm, const int timer)
{
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp), timer);
IN6_MULTI_LOCK_ASSERT();
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
break;
case MLD_REPORTING_MEMBER:
if (inm->in6m_timer != 0 &&
inm->in6m_timer <= timer) {
CTR1(KTR_MLD, "%s: REPORTING and timer running, "
"skipping.", __func__);
break;
}
/* FALLTHROUGH */
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
CTR1(KTR_MLD, "%s: ->REPORTING", __func__);
inm->in6m_state = MLD_REPORTING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
break;
case MLD_SLEEPING_MEMBER:
CTR1(KTR_MLD, "%s: ->AWAKENING", __func__);
inm->in6m_state = MLD_AWAKENING_MEMBER;
break;
case MLD_LEAVING_MEMBER:
break;
}
}
/*
* Process a received MLDv2 general, group-specific or
* group-and-source-specific query.
*
* Assumes that the query header has been pulled up to sizeof(mldv2_query).
*
* Return 0 if successful, otherwise an appropriate error code is returned.
*/
static int
mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
struct mbuf *m, const int off, const int icmp6len)
{
struct mld_ifsoftc *mli;
struct mldv2_query *mld;
struct in6_multi *inm;
uint32_t maxdelay, nsrc, qqi;
int is_general_query;
uint16_t timer;
uint8_t qrv;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
is_general_query = 0;
/*
* RFC3810 Section 6.2: MLD queries must originate from
* a router's link-local address.
*/
if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (0);
}
CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp));
mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off);
maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */
if (maxdelay >= 32768) {
maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) <<
(MLD_MRC_EXP(maxdelay) + 3);
}
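/*
 * A worked example of the decoding above: per RFC 3810 5.1.3, values
 * of 32768 or more carry a 3-bit exponent and 12-bit mantissa and
 * decode as (mant | 0x1000) << (exp + 3).  The smallest such encoding
 * (exp = 0, mant = 0) therefore yields 0x1000 << 3 = 32768 tenths of a
 * second, which is exactly where the linear range ends; exp = 2,
 * mant = 0x400 yields (0x400 | 0x1000) << 5 = 163840 tenths, i.e.
 * 16384 seconds.
 */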
timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE;
if (timer == 0)
timer = 1;
qrv = MLD_QRV(mld->mld_misc);
if (qrv < 2) {
CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__,
qrv, MLD_RV_INIT);
qrv = MLD_RV_INIT;
}
qqi = mld->mld_qqi;
if (qqi >= 128) {
qqi = MLD_QQIC_MANT(mld->mld_qqi) <<
(MLD_QQIC_EXP(mld->mld_qqi) + 3);
}
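/*
 * A worked example of the QQIC decoding above: per RFC 3810 5.1.9,
 * values of 128 or more carry a 3-bit exponent and 4-bit mantissa and
 * decode as (mant | 0x10) << (exp + 3).  Thus exp = 0, mant = 0 gives
 * 0x10 << 3 = 128 seconds (the start of the exponential range), and
 * exp = 7, mant = 0xf gives 0x1f << 10 = 31744 seconds, the largest
 * representable querier's query interval.
 */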
nsrc = ntohs(mld->mld_numsrc);
if (nsrc > MLD_MAX_GS_SOURCES)
return (EMSGSIZE);
if (icmp6len < sizeof(struct mldv2_query) +
(nsrc * sizeof(struct in6_addr)))
return (EMSGSIZE);
/*
* Do further input validation upfront to avoid resetting timers
* should we need to discard this query.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
/*
* A general query with a source list has undefined
* behaviour; discard it.
*/
if (nsrc > 0)
return (EINVAL);
is_general_query = 1;
} else {
/*
* Embed scope ID of receiving interface in MLD query for
* lookup whilst we don't hold other locks (due to KAME
* locking lameness). We own this mbuf chain just now.
*/
in6_setscope(&mld->mld_addr, ifp, NULL);
}
IN6_MULTI_LOCK();
MLD_LOCK();
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
/*
* Discard the v2 query if we're in Compatibility Mode.
* The RFC is pretty clear that hosts need to stay in MLDv1 mode
* until the Old Version Querier Present timer expires.
*/
if (mli->mli_version != MLD_VERSION_2)
goto out_locked;
mld_set_version(mli, MLD_VERSION_2);
mli->mli_rv = qrv;
mli->mli_qi = qqi;
mli->mli_qri = maxdelay;
CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi,
maxdelay);
if (is_general_query) {
/*
* MLDv2 General Query.
*
* Schedule a current-state report on this ifp for
* all groups, possibly containing source lists.
*
* If there is a pending General Query response
* scheduled earlier than the selected delay, do
* not schedule any other reports.
* Otherwise, reset the interface timer.
*/
CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)",
ifp, if_name(ifp));
if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
V_interface_timers_running6 = 1;
}
} else {
/*
* MLDv2 Group-specific or Group-and-source-specific Query.
*
* Group-source-specific queries are throttled on
* a per-group basis to defeat denial-of-service attempts.
* Queries for groups we are not a member of on this
* link are simply ignored.
*/
IF_ADDR_RLOCK(ifp);
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm == NULL) {
IF_ADDR_RUNLOCK(ifp);
goto out_locked;
}
if (nsrc > 0) {
if (!ratecheck(&inm->in6m_lastgsrtv,
&V_mld_gsrdelay)) {
CTR1(KTR_MLD, "%s: GS query throttled.",
__func__);
IF_ADDR_RUNLOCK(ifp);
goto out_locked;
}
}
CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)",
ifp, if_name(ifp));
/*
* If there is a pending General Query response
* scheduled sooner than the selected delay, no
* further report need be scheduled.
* Otherwise, prepare to respond to the
* group-specific or group-and-source query.
*/
if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer)
mld_v2_process_group_query(inm, mli, timer, m, off);
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
IF_ADDR_RUNLOCK(ifp);
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
return (0);
}
/*
* Process a received MLDv2 group-specific or group-and-source-specific
* query.
* Return <0 if any error occurred. Currently this is ignored.
*/
static int
mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli,
int timer, struct mbuf *m0, const int off)
{
struct mldv2_query *mld;
int retval;
uint16_t nsrc;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
retval = 0;
mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off);
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LEAVING_MEMBER:
return (retval);
break;
case MLD_REPORTING_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
break;
}
nsrc = ntohs(mld->mld_numsrc);
/*
* Deal with group-specific queries upfront.
* If any group query is already pending, purge any recorded
* source-list state if it exists, and schedule a query response
* for this group-specific query.
*/
if (nsrc == 0) {
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
in6m_clear_recorded(inm);
timer = min(inm->in6m_timer, timer);
}
inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
return (retval);
}
/*
* Deal with the case where a group-and-source-specific query has
* been received but a group-specific query is already pending.
*/
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
timer = min(inm->in6m_timer, timer);
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
return (retval);
}
/*
* Finally, deal with the case where a group-and-source-specific
* query has been received, where a response to a previous g-s-r
* query exists, or none exists.
* In this case, we need to parse the source-list which the Querier
* has provided us with and check if we have any source list filter
* entries at T1 for these sources. If we do not, there is no need to
* schedule a report and the query may be dropped.
* If we do, we must record them and schedule a current-state
* report for those sources.
*/
if (inm->in6m_nsrc > 0) {
struct mbuf *m;
uint8_t *sp;
int i, nrecorded;
int soff;
m = m0;
soff = off + sizeof(struct mldv2_query);
nrecorded = 0;
for (i = 0; i < nsrc; i++) {
sp = mtod(m, uint8_t *) + soff;
retval = in6m_record_source(inm,
(const struct in6_addr *)sp);
if (retval < 0)
break;
nrecorded += retval;
soff += sizeof(struct in6_addr);
if (soff >= m->m_len) {
soff = soff - m->m_len;
m = m->m_next;
if (m == NULL)
break;
}
}
if (nrecorded > 0) {
CTR1(KTR_MLD,
"%s: schedule response to SG query", __func__);
inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
inm->in6m_timer = MLD_RANDOM_DELAY(timer);
V_current_state_timers_running6 = 1;
}
}
return (retval);
}
/*
* Process a received MLDv1 host membership report.
* Assumes mld points to mld_hdr in pulled up mbuf chain.
*
* NOTE: Can't be fully const correct as we temporarily embed scope ID in
* mld_addr. This is OK as we own the mbuf chain.
*/
static int
mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
/*const*/ struct mld_hdr *mld)
{
struct in6_addr src, dst;
struct in6_ifaddr *ia;
struct in6_multi *inm;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
if (!mld_v1enable) {
CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
return (0);
}
if (ifp->if_flags & IFF_LOOPBACK)
return (0);
/*
* MLDv1 reports must originate from a host's link-local address,
* or the unspecified address (when booting).
*/
src = ip6->ip6_src;
in6_clearscope(&src);
if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) {
CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_src),
ifp, if_name(ifp));
return (EINVAL);
}
/*
* RFC2710 Section 4: MLDv1 reports must pertain to a multicast
* group, and must be directed to the group itself.
*/
dst = ip6->ip6_dst;
in6_clearscope(&dst);
if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) ||
!IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) {
CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &ip6->ip6_dst),
ifp, if_name(ifp));
return (EINVAL);
}
/*
* Make sure we don't hear our own membership report, as fast
* leave requires knowing that we are the only member of a
* group. Assume we used the link-local address if available,
* otherwise look for ::.
*
* XXX Note that scope ID comparison is needed for the address
* returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be
* performed for the on-wire address.
*/
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) ||
(ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (0);
}
if (ia != NULL)
ifa_free(&ia->ia_ifa);
CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp));
/*
* Embed scope ID of receiving interface in MLD query for lookup
* whilst we don't hold other locks (due to KAME locking lameness).
*/
if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr))
in6_setscope(&mld->mld_addr, ifp, NULL);
IN6_MULTI_LOCK();
MLD_LOCK();
IF_ADDR_RLOCK(ifp);
/*
* MLDv1 report suppression.
* If we are a member of this group, and our membership should be
* reported, and our group timer is pending or about to be reset,
* stop our group timer by transitioning to the 'lazy' state.
*/
inm = in6m_lookup_locked(ifp, &mld->mld_addr);
if (inm != NULL) {
struct mld_ifsoftc *mli;
mli = inm->in6m_mli;
KASSERT(mli != NULL,
("%s: no mli for ifp %p", __func__, ifp));
/*
* If we are in MLDv2 host mode, do not allow the
* other host's MLDv1 report to suppress our reports.
*/
if (mli->mli_version == MLD_VERSION_2)
goto out_locked;
inm->in6m_timer = 0;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_AWAKENING_MEMBER:
CTR3(KTR_MLD,
"report suppressed for %s on ifp %p(%s)",
ip6_sprintf(ip6tbuf, &mld->mld_addr),
ifp, if_name(ifp));
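/* FALLTHROUGH */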
case MLD_LAZY_MEMBER:
inm->in6m_state = MLD_LAZY_MEMBER;
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
out_locked:
IF_ADDR_RUNLOCK(ifp);
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
/* XXX Clear embedded scope ID as userland won't expect it. */
in6_clearscope(&mld->mld_addr);
return (0);
}
/*
* MLD input path.
*
* Assume query messages which fit in a single ICMPv6 message header
* have been pulled up.
* Assume that userland will want to see the message, even if it
* otherwise fails kernel input validation; do not free it.
* Pullup may however free the mbuf chain m if it fails.
*
* Return IPPROTO_DONE if we freed m. Otherwise, return 0.
*/
int
mld_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp;
struct ip6_hdr *ip6;
struct mld_hdr *mld;
int mldlen;
CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off);
ifp = m->m_pkthdr.rcvif;
ip6 = mtod(m, struct ip6_hdr *);
/* Pullup to appropriate size. */
mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
if (mld->mld_type == MLD_LISTENER_QUERY &&
icmp6len >= sizeof(struct mldv2_query)) {
mldlen = sizeof(struct mldv2_query);
} else {
mldlen = sizeof(struct mld_hdr);
}
IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen);
if (mld == NULL) {
ICMP6STAT_INC(icp6s_badlen);
return (IPPROTO_DONE);
}
/*
* Userland needs to see all of this traffic for implementing
* the endpoint discovery portion of multicast routing.
*/
switch (mld->mld_type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(ifp, ifs6_in_mldquery);
if (icmp6len == sizeof(struct mld_hdr)) {
if (mld_v1_input_query(ifp, ip6, mld) != 0)
return (0);
} else if (icmp6len >= sizeof(struct mldv2_query)) {
if (mld_v2_input_query(ifp, ip6, m, off,
icmp6len) != 0)
return (0);
}
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
if (mld_v1_input_report(ifp, ip6, mld) != 0)
return (0);
break;
case MLDV2_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(ifp, ifs6_in_mlddone);
break;
default:
break;
}
return (0);
}
/*
* Fast timeout handler (global).
* VIMAGE: Timeout handlers are expected to service all vimages.
*/
void
mld_fasttimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
mld_fasttimo_vnet();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
* Fast timeout handler (per-vnet).
*
* VIMAGE: Assume caller has set up our curvnet.
*/
static void
mld_fasttimo_vnet(void)
{
struct mbufq scq; /* State-change packets */
struct mbufq qrq; /* Query response packets */
struct ifnet *ifp;
struct mld_ifsoftc *mli;
struct ifmultiaddr *ifma;
struct in6_multi *inm, *tinm;
int uri_fasthz;
uri_fasthz = 0;
/*
* Quick check to see if any work needs to be done, in order to
* minimize the overhead of fasttimo processing.
* SMPng: XXX Unlocked reads.
*/
if (!V_current_state_timers_running6 &&
!V_interface_timers_running6 &&
!V_state_change_timers_running6)
return;
IN6_MULTI_LOCK();
MLD_LOCK();
/*
* MLDv2 General Query response timer processing.
*/
if (V_interface_timers_running6) {
CTR1(KTR_MLD, "%s: interface timers running", __func__);
V_interface_timers_running6 = 0;
LIST_FOREACH(mli, &V_mli_head, mli_link) {
if (mli->mli_v2_timer == 0) {
/* Do nothing. */
} else if (--mli->mli_v2_timer == 0) {
mld_v2_dispatch_general_query(mli);
} else {
V_interface_timers_running6 = 1;
}
}
}
if (!V_current_state_timers_running6 &&
!V_state_change_timers_running6)
goto out_locked;
V_current_state_timers_running6 = 0;
V_state_change_timers_running6 = 0;
CTR1(KTR_MLD, "%s: state change timers running", __func__);
/*
* MLD host report and state-change timer processing.
* Note: Processing a v2 group timer may remove a node.
*/
LIST_FOREACH(mli, &V_mli_head, mli_link) {
ifp = mli->mli_ifp;
if (mli->mli_version == MLD_VERSION_2) {
uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri *
PR_FASTHZ);
mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS);
mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS);
}
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
switch (mli->mli_version) {
case MLD_VERSION_1:
mld_v1_process_group_timer(mli, inm);
break;
case MLD_VERSION_2:
mld_v2_process_group_timers(mli, &qrq,
&scq, inm, uri_fasthz);
break;
}
}
IF_ADDR_RUNLOCK(ifp);
switch (mli->mli_version) {
case MLD_VERSION_1:
/*
* Transmit reports for this lifecycle. This
* is done while not holding IF_ADDR_LOCK,
* since this can call
* in6ifa_ifpforlinklocal(), which takes
* IF_ADDR_LOCK internally, as well as
* ip6_output() to transmit a packet.
*/
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
in6m_nrele);
(void)mld_v1_transmit_report(inm,
MLD_LISTENER_REPORT);
}
break;
case MLD_VERSION_2:
mld_dispatch_queue(&qrq, 0);
mld_dispatch_queue(&scq, 0);
/*
* Free the in_multi reference(s) for
* this lifecycle.
*/
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
in6m_nrele);
in6m_release_locked(inm);
}
break;
}
}
out_locked:
MLD_UNLOCK();
IN6_MULTI_UNLOCK();
}
/*
* Update host report group timer.
* Will update the global pending timer flags.
*/
static void
mld_v1_process_group_timer(struct mld_ifsoftc *mli, struct in6_multi *inm)
{
int report_timer_expired;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
if (inm->in6m_timer == 0) {
report_timer_expired = 0;
} else if (--inm->in6m_timer == 0) {
report_timer_expired = 1;
} else {
V_current_state_timers_running6 = 1;
return;
}
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
break;
case MLD_REPORTING_MEMBER:
if (report_timer_expired) {
inm->in6m_state = MLD_IDLE_MEMBER;
SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
in6m_nrele);
}
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
/*
* Update a group's timers for MLDv2.
* Will update the global pending timer flags.
* Note: Unlocked read from mli.
*/
static void
mld_v2_process_group_timers(struct mld_ifsoftc *mli,
struct mbufq *qrq, struct mbufq *scq,
struct in6_multi *inm, const int uri_fasthz)
{
int query_response_timer_expired;
int state_change_retransmit_timer_expired;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
query_response_timer_expired = 0;
state_change_retransmit_timer_expired = 0;
/*
* During a transition from compatibility mode back to MLDv2,
* a group record in REPORTING state may still have its group
* timer active. This is a no-op in this function; it is easier
* to deal with it here than to complicate the slow-timeout path.
*/
if (inm->in6m_timer == 0) {
query_response_timer_expired = 0;
} else if (--inm->in6m_timer == 0) {
query_response_timer_expired = 1;
} else {
V_current_state_timers_running6 = 1;
}
if (inm->in6m_sctimer == 0) {
state_change_retransmit_timer_expired = 0;
} else if (--inm->in6m_sctimer == 0) {
state_change_retransmit_timer_expired = 1;
} else {
V_state_change_timers_running6 = 1;
}
/* We are in fasttimo, so be quick about it. */
if (!state_change_retransmit_timer_expired &&
!query_response_timer_expired)
return;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_AWAKENING_MEMBER:
case MLD_IDLE_MEMBER:
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
/*
* Respond to a previously pending Group-Specific
* or Group-and-Source-Specific query by enqueueing
* the appropriate Current-State report for
* immediate transmission.
*/
if (query_response_timer_expired) {
int retval;
retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1,
(inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER),
0);
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
inm->in6m_state = MLD_REPORTING_MEMBER;
in6m_clear_recorded(inm);
}
/* FALLTHROUGH */
case MLD_REPORTING_MEMBER:
case MLD_LEAVING_MEMBER:
if (state_change_retransmit_timer_expired) {
/*
* State-change retransmission timer fired.
* If there are any further pending retransmissions,
* set the global pending state-change flag, and
* reset the timer.
*/
if (--inm->in6m_scrv > 0) {
inm->in6m_sctimer = uri_fasthz;
V_state_change_timers_running6 = 1;
}
/*
* Retransmit the previously computed state-change
* report. If there are no further pending
* retransmissions, the mbuf queue will be consumed.
* Update T0 state to T1 as we have now sent
* a state-change.
*/
(void)mld_v2_merge_state_changes(inm, scq);
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
/*
* If we are leaving the group for good, make sure
* we release MLD's reference to it.
* This release must be deferred using a SLIST,
* as we are called from a loop which traverses
* the in_ifmultiaddr TAILQ.
*/
if (inm->in6m_state == MLD_LEAVING_MEMBER &&
inm->in6m_scrv == 0) {
inm->in6m_state = MLD_NOT_MEMBER;
SLIST_INSERT_HEAD(&mli->mli_relinmhead,
inm, in6m_nrele);
}
}
break;
}
}
/*
* Switch to a different version on the given interface,
* as per RFC 3810 Section 9.12.
*/
static void
mld_set_version(struct mld_ifsoftc *mli, const int version)
{
int old_version_timer;
MLD_LOCK_ASSERT();
CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__,
version, mli->mli_ifp, if_name(mli->mli_ifp));
if (version == MLD_VERSION_1) {
/*
* Compute the "Older Version Querier Present" timer as per
* Section 9.12.
*/
old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
old_version_timer *= PR_SLOWHZ;
mli->mli_v1_timer = old_version_timer;
}
if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) {
mli->mli_version = MLD_VERSION_1;
mld_v2_cancel_link_timers(mli);
}
}
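/*
 * A worked example of the timeout above, using purely illustrative
 * numbers: with a robustness variable of 2, a query interval of 125
 * and a query response interval of 10 (in whatever units the querier
 * advertised them), the Older Version Querier Present timeout is
 * (2 * 125 + 10) * PR_SLOWHZ = 260 * 2 = 520 slowtimo ticks before
 * mld_v1_process_querier_timers() lets the link revert to MLDv2.
 */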
/*
* Cancel pending MLDv2 timers for the given link and all groups
* joined on it; state-change, general-query, and group-query timers.
*/
static void
mld_v2_cancel_link_timers(struct mld_ifsoftc *mli)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
struct in6_multi *inm, *tinm;
CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__,
mli->mli_ifp, if_name(mli->mli_ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
/*
* Fast-track this potentially expensive operation
* by checking all the global 'timer pending' flags.
*/
if (!V_interface_timers_running6 &&
!V_state_change_timers_running6 &&
!V_current_state_timers_running6)
return;
mli->mli_v2_timer = 0;
ifp = mli->mli_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
break;
case MLD_LEAVING_MEMBER:
/*
* If we are leaving the group and switching
* version, we need to release the final
* reference held for issuing the INCLUDE {}.
*/
SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
in6m_nrele);
/* FALLTHROUGH */
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
in6m_clear_recorded(inm);
/* FALLTHROUGH */
case MLD_REPORTING_MEMBER:
inm->in6m_sctimer = 0;
inm->in6m_timer = 0;
inm->in6m_state = MLD_REPORTING_MEMBER;
/*
* Free any pending MLDv2 state-change records.
*/
mbufq_drain(&inm->in6m_scq);
break;
}
}
IF_ADDR_RUNLOCK(ifp);
SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) {
SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
in6m_release_locked(inm);
}
}
/*
* Global slowtimo handler.
* VIMAGE: Timeout handlers are expected to service all vimages.
*/
void
mld_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
mld_slowtimo_vnet();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
* Per-vnet slowtimo handler.
*/
static void
mld_slowtimo_vnet(void)
{
struct mld_ifsoftc *mli;
MLD_LOCK();
LIST_FOREACH(mli, &V_mli_head, mli_link) {
mld_v1_process_querier_timers(mli);
}
MLD_UNLOCK();
}
/*
* Update the Older Version Querier Present timers for a link.
* See Section 9.12 of RFC 3810.
*/
static void
mld_v1_process_querier_timers(struct mld_ifsoftc *mli)
{
MLD_LOCK_ASSERT();
if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) {
/*
* MLDv1 Querier Present timer expired; revert to MLDv2.
*/
CTR5(KTR_MLD,
"%s: transition from v%d -> v%d on %p(%s)",
__func__, mli->mli_version, MLD_VERSION_2,
mli->mli_ifp, if_name(mli->mli_ifp));
mli->mli_version = MLD_VERSION_2;
}
}
/*
* Transmit an MLDv1 report immediately.
*/
static int
mld_v1_transmit_report(struct in6_multi *in6m, const int type)
{
struct ifnet *ifp;
struct in6_ifaddr *ia;
struct ip6_hdr *ip6;
struct mbuf *mh, *md;
struct mld_hdr *mld;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
ifp = in6m->in6m_ifp;
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
/* ia may be NULL if link-local address is tentative. */
mh = m_gethdr(M_NOWAIT, MT_DATA);
if (mh == NULL) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (ENOMEM);
}
md = m_get(M_NOWAIT, MT_DATA);
if (md == NULL) {
m_free(mh);
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (ENOMEM);
}
mh->m_next = md;
/*
* FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
* that ether_output() does not need to allocate another mbuf
* for the header in the most common case.
*/
M_ALIGN(mh, sizeof(struct ip6_hdr));
mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
mh->m_len = sizeof(struct ip6_hdr);
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = in6m->in6m_addr;
md->m_len = sizeof(struct mld_hdr);
mld = mtod(md, struct mld_hdr *);
mld->mld_type = type;
mld->mld_code = 0;
mld->mld_cksum = 0;
mld->mld_maxdelay = 0;
mld->mld_reserved = 0;
mld->mld_addr = in6m->in6m_addr;
in6_clearscope(&mld->mld_addr);
mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
mld_save_context(mh, ifp);
mh->m_flags |= M_MLDV1;
mld_dispatch_packet(mh);
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (0);
}
/*
* Process a state change from the upper layer for the given IPv6 group.
*
* Each socket holds a reference on the in_multi in its own ip_moptions.
* The socket layer will have made the necessary updates to the group
* state, it is now up to MLD to issue a state change report if there
* has been any change between T0 (when the last state-change was issued)
* and T1 (now).
*
* We use the MLDv2 state machine at the group level. The MLD module
* however makes the decision as to which MLD protocol version to speak.
* A state change *from* INCLUDE {} always means an initial join.
* A state change *to* INCLUDE {} always means a final leave.
*
* If delay is non-zero, and the state change is an initial multicast
* join, the state change report will be delayed by 'delay' ticks
* in units of PR_FASTHZ if MLDv1 is active on the link; otherwise
* the initial MLDv2 state change report will be delayed by whichever
* is sooner, a pending state-change timer or delay itself.
*
* VIMAGE: curvnet should have been set by caller, as this routine
* is called from the socket option handlers.
*/
int
mld_change_state(struct in6_multi *inm, const int delay)
{
struct mld_ifsoftc *mli;
struct ifnet *ifp;
int error;
IN6_MULTI_LOCK_ASSERT();
error = 0;
/*
* Try to detect if the upper layer just asked us to change state
* for an interface which has now gone away.
*/
KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
ifp = inm->in6m_ifma->ifma_ifp;
if (ifp != NULL) {
/*
* Sanity check that netinet6's notion of ifp is the
* same as net's.
*/
KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
}
MLD_LOCK();
mli = MLD_IFINFO(ifp);
KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
/*
* If we detect a state transition to or from MCAST_UNDEFINED
* for this group, then we are starting or finishing an MLD
* life cycle for this group.
*/
if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) {
CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__,
inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode);
if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) {
CTR1(KTR_MLD, "%s: initial join", __func__);
error = mld_initial_join(inm, mli, delay);
goto out_locked;
} else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) {
CTR1(KTR_MLD, "%s: final leave", __func__);
mld_final_leave(inm, mli);
goto out_locked;
}
} else {
CTR1(KTR_MLD, "%s: filter set change", __func__);
}
error = mld_handle_state_change(inm, mli);
out_locked:
MLD_UNLOCK();
return (error);
}
/*
* Perform the initial join for an MLD group.
*
* When joining a group:
* If the group should have its MLD traffic suppressed, do nothing.
* MLDv1 starts sending MLDv1 host membership reports.
* MLDv2 will schedule an MLDv2 state-change report containing the
* initial state of the membership.
*
* If the delay argument is non-zero, then we must delay sending the
* initial state change for delay ticks (in units of PR_FASTHZ).
*/
static int
mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli,
const int delay)
{
struct ifnet *ifp;
struct mbufq *mq;
int error, retval, syncstates;
int odelay;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
error = 0;
syncstates = 1;
ifp = inm->in6m_ifp;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__));
/*
* Groups joined on loopback or marked as 'not reported',
* enter the MLD_SILENT_MEMBER state and
* are never reported in any protocol exchanges.
* All other groups enter the appropriate state machine
* for the version in use on this link.
* A link marked as MLIF_SILENT causes MLD to be completely
* disabled for the link.
*/
if ((ifp->if_flags & IFF_LOOPBACK) ||
(mli->mli_flags & MLIF_SILENT) ||
!mld_is_addr_reported(&inm->in6m_addr)) {
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
inm->in6m_state = MLD_SILENT_MEMBER;
inm->in6m_timer = 0;
} else {
/*
* Deal with overlapping in_multi lifecycle.
* If this group was LEAVING, then make sure
* we drop the reference we picked up to keep the
* group around for the final INCLUDE {} enqueue.
*/
if (mli->mli_version == MLD_VERSION_2 &&
inm->in6m_state == MLD_LEAVING_MEMBER)
in6m_release_locked(inm);
inm->in6m_state = MLD_REPORTING_MEMBER;
switch (mli->mli_version) {
case MLD_VERSION_1:
/*
* If a delay was provided, only use it if
* it is greater than the delay normally
* used for an MLDv1 state change report,
* and delay sending the initial MLDv1 report
* by not transitioning to the IDLE state.
*/
odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ);
if (delay) {
inm->in6m_timer = max(delay, odelay);
V_current_state_timers_running6 = 1;
} else {
inm->in6m_state = MLD_IDLE_MEMBER;
error = mld_v1_transmit_report(inm,
MLD_LISTENER_REPORT);
if (error == 0) {
inm->in6m_timer = odelay;
V_current_state_timers_running6 = 1;
}
}
break;
case MLD_VERSION_2:
/*
* Defer update of T0 to T1, until the first copy
* of the state change has been transmitted.
*/
syncstates = 0;
/*
* Immediately enqueue a State-Change Report for
* this interface, freeing any previous reports.
* Don't kick the timers if there is nothing to do,
* or if an error occurred.
*/
mq = &inm->in6m_scq;
mbufq_drain(mq);
retval = mld_v2_enqueue_group_record(mq, inm, 1,
0, 0, (mli->mli_flags & MLIF_USEALLOW));
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
if (retval <= 0) {
error = retval * -1;
break;
}
/*
* Schedule transmission of pending state-change
* report up to RV times for this link. The timer
* will fire at the next mld_fasttimo (~200ms),
* giving us an opportunity to merge the reports.
*
* If a delay was provided to this function, only
* use this delay if sooner than the existing one.
*/
KASSERT(mli->mli_rv > 1,
("%s: invalid robustness %d", __func__,
mli->mli_rv));
inm->in6m_scrv = mli->mli_rv;
if (delay) {
if (inm->in6m_sctimer > 1) {
inm->in6m_sctimer =
min(inm->in6m_sctimer, delay);
} else
inm->in6m_sctimer = delay;
} else
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
error = 0;
break;
}
}
/*
* Only update the T0 state if state change is atomic,
* i.e. we don't need to wait for a timer to fire before we
* can consider the state change to have been communicated.
*/
if (syncstates) {
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
}
return (error);
}
/*
* Issue an intermediate state change during the life-cycle.
*/
static int
mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli)
{
struct ifnet *ifp;
int retval;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
ifp = inm->in6m_ifp;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli && mli->mli_ifp == ifp,
("%s: inconsistent ifp", __func__));
if ((ifp->if_flags & IFF_LOOPBACK) ||
(mli->mli_flags & MLIF_SILENT) ||
!mld_is_addr_reported(&inm->in6m_addr) ||
(mli->mli_version != MLD_VERSION_2)) {
if (!mld_is_addr_reported(&inm->in6m_addr)) {
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
}
CTR1(KTR_MLD, "%s: nothing to do", __func__);
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
return (0);
}
mbufq_drain(&inm->in6m_scq);
retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
(mli->mli_flags & MLIF_USEALLOW));
CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval);
if (retval <= 0)
return (-retval);
/*
* If record(s) were enqueued, start the state-change
* report timer for this group.
*/
inm->in6m_scrv = mli->mli_rv;
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
return (0);
}
/*
* Perform the final leave for a multicast address.
*
* When leaving a group:
* MLDv1 sends a DONE message, if and only if we are the reporter.
* MLDv2 enqueues a state-change report containing a transition
* to INCLUDE {} for immediate transmission.
*/
static void
mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli)
{
int syncstates;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
syncstates = 1;
CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
inm->in6m_ifp, if_name(inm->in6m_ifp));
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
case MLD_LEAVING_MEMBER:
/* Already leaving or left; do nothing. */
CTR1(KTR_MLD,
"%s: not kicking state machine for silent group", __func__);
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
if (mli->mli_version == MLD_VERSION_1) {
#ifdef INVARIANTS
if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER)
panic("%s: MLDv2 state reached, not MLDv2 mode",
__func__);
#endif
mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
inm->in6m_state = MLD_NOT_MEMBER;
V_current_state_timers_running6 = 1;
} else if (mli->mli_version == MLD_VERSION_2) {
/*
* Stop group timer and all pending reports.
* Immediately enqueue a state-change report
* TO_IN {} to be sent on the next fast timeout,
* giving us an opportunity to merge reports.
*/
mbufq_drain(&inm->in6m_scq);
inm->in6m_timer = 0;
inm->in6m_scrv = mli->mli_rv;
CTR4(KTR_MLD, "%s: Leaving %s/%s with %d "
"pending retransmissions.", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp), inm->in6m_scrv);
if (inm->in6m_scrv == 0) {
inm->in6m_state = MLD_NOT_MEMBER;
inm->in6m_sctimer = 0;
} else {
int retval;
in6m_acquire_locked(inm);
retval = mld_v2_enqueue_group_record(
&inm->in6m_scq, inm, 1, 0, 0,
(mli->mli_flags & MLIF_USEALLOW));
KASSERT(retval != 0,
("%s: enqueue record = %d", __func__,
retval));
inm->in6m_state = MLD_LEAVING_MEMBER;
inm->in6m_sctimer = 1;
V_state_change_timers_running6 = 1;
syncstates = 0;
}
break;
}
break;
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
/* Our reports are suppressed; do nothing. */
break;
}
if (syncstates) {
in6m_commit(inm);
CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s",
__func__, &inm->in6m_addr, if_name(inm->in6m_ifp));
}
}
/*
* Enqueue an MLDv2 group record to the given output queue.
*
* If is_state_change is zero, a current-state record is appended.
* If is_state_change is non-zero, a state-change report is appended.
*
* If is_group_query is non-zero, an mbuf packet chain is allocated.
* If is_group_query is zero, and if there is a packet with free space
* at the tail of the queue, it will be appended to, provided there
* is enough free space.
* Otherwise a new mbuf packet chain is allocated.
*
* If is_source_query is non-zero, each source is checked to see if
* it was recorded for a Group-Source query, and will be omitted if
* it is not both in-mode and recorded.
*
* If use_block_allow is non-zero, state change reports for initial join
* and final leave, on an inclusive mode group with a source list, will be
* rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively.
*
* The function will attempt to allocate leading space in the packet
* for the IPv6+ICMP headers to be prepended without fragmenting the chain.
*
* If successful the size of all data appended to the queue is returned,
* otherwise an error code less than zero is returned, or zero if
* no record(s) were appended.
*/
static int
mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm,
const int is_state_change, const int is_group_query,
const int is_source_query, const int use_block_allow)
{
struct mldv2_record mr;
struct mldv2_record *pmr;
struct ifnet *ifp;
struct ip6_msource *ims, *nims;
struct mbuf *m0, *m, *md;
- int error, is_filter_list_change;
+ int is_filter_list_change;
int minrec0len, m0srcs, msrcs, nbytes, off;
int record_has_sources;
int now;
int type;
uint8_t mode;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
- error = 0;
ifp = inm->in6m_ifp;
is_filter_list_change = 0;
m = NULL;
m0 = NULL;
m0srcs = 0;
msrcs = 0;
nbytes = 0;
nims = NULL;
record_has_sources = 1;
pmr = NULL;
type = MLD_DO_NOTHING;
mode = inm->in6m_st[1].iss_fmode;
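/*
* Note: in6m_st[0] holds the committed filter state at t0 and
* in6m_st[1] the pending state at t1; in6m_commit() later copies
* T1 to T0 once the change has been communicated (see the
* "T1 -> T0" traces elsewhere in this file).
*/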
/*
* If we did not transition out of ASM mode during t0->t1,
* and there are no source nodes to process, we can skip
* the generation of source records.
*/
if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 &&
inm->in6m_nsrc == 0)
record_has_sources = 0;
if (is_state_change) {
/*
* Queue a state change record.
* If the mode did not change, and there are non-ASM
* listeners or source filters present,
* we potentially need to issue two records for the group.
* If there are ASM listeners, and there was no filter
* mode transition of any kind, do nothing.
*
* If we are transitioning to MCAST_UNDEFINED, we need
* not send any sources. A transition to/from this state is
* considered inclusive with some special treatment.
*
* If we are rewriting initial joins/leaves to use
* ALLOW/BLOCK, and the group's membership is inclusive,
* we need to send sources in all cases.
*/
if (mode != inm->in6m_st[0].iss_fmode) {
if (mode == MCAST_EXCLUDE) {
CTR1(KTR_MLD, "%s: change to EXCLUDE",
__func__);
type = MLD_CHANGE_TO_EXCLUDE_MODE;
} else {
CTR1(KTR_MLD, "%s: change to INCLUDE",
__func__);
if (use_block_allow) {
/*
* XXX
* Here we're interested in state
* edges either direction between
* MCAST_UNDEFINED and MCAST_INCLUDE.
* Perhaps we should just check
* the group state, rather than
* the filter mode.
*/
if (mode == MCAST_UNDEFINED) {
type = MLD_BLOCK_OLD_SOURCES;
} else {
type = MLD_ALLOW_NEW_SOURCES;
}
} else {
type = MLD_CHANGE_TO_INCLUDE_MODE;
if (mode == MCAST_UNDEFINED)
record_has_sources = 0;
}
}
} else {
if (record_has_sources) {
is_filter_list_change = 1;
} else {
type = MLD_DO_NOTHING;
}
}
} else {
/*
* Queue a current state record.
*/
if (mode == MCAST_EXCLUDE) {
type = MLD_MODE_IS_EXCLUDE;
} else if (mode == MCAST_INCLUDE) {
type = MLD_MODE_IS_INCLUDE;
KASSERT(inm->in6m_st[1].iss_asm == 0,
("%s: inm %p is INCLUDE but ASM count is %d",
__func__, inm, inm->in6m_st[1].iss_asm));
}
}
/*
* Generate the filter list changes using a separate function.
*/
if (is_filter_list_change)
return (mld_v2_enqueue_filter_change(mq, inm));
if (type == MLD_DO_NOTHING) {
CTR3(KTR_MLD, "%s: nothing to do for %s/%s",
__func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
return (0);
}
/*
* If any sources are present, we must be able to fit at least
* one in the trailing space of the tail packet's mbuf,
* ideally more.
*/
minrec0len = sizeof(struct mldv2_record);
if (record_has_sources)
minrec0len += sizeof(struct in6_addr);
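/*
* Illustrative sizing (assuming the usual record layout): a group
* record header is 4 bytes of fixed fields plus a 16-byte group
* address, so minrec0len is roughly 20 bytes, or about 36 bytes
* once a single 16-byte source address must also fit.
*/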
CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__,
mld_rec_type_to_str(type),
ip6_sprintf(ip6tbuf, &inm->in6m_addr),
if_name(inm->in6m_ifp));
/*
* Check if we have a packet in the tail of the queue for this
* group into which the first group record for this group will fit.
* Otherwise allocate a new packet.
* Always allocate leading space for IP6+RA+ICMPV6+REPORT.
* Note: Group records for G/GSR query responses MUST be sent
* in their own packet.
*/
m0 = mbufq_last(mq);
if (!is_group_query &&
m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) &&
(m0->m_pkthdr.len + minrec0len) <
(ifp->if_mtu - MLD_MTUSPACE)) {
m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
m = m0;
CTR1(KTR_MLD, "%s: use existing packet", __func__);
} else {
if (mbufq_full(mq)) {
CTR1(KTR_MLD, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
m = NULL;
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
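/*
* Illustrative arithmetic (assuming MLD_MTUSPACE is roughly 56
* bytes for the IPv6 header, Router Alert option and report
* header): with a 1500-byte MTU, about (1500 - 56 - 20) / 16,
* i.e. ~89 source addresses, fit alongside the first record.
*/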
if (!is_state_change && !is_group_query)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (-ENOMEM);
mld_save_context(m, ifp);
CTR1(KTR_MLD, "%s: allocated first packet", __func__);
}
/*
* Append group record.
* If we have sources, we don't know how many yet.
*/
mr.mr_type = type;
mr.mr_datalen = 0;
mr.mr_numsrc = 0;
mr.mr_addr = inm->in6m_addr;
in6_clearscope(&mr.mr_addr);
if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
return (-ENOMEM);
}
nbytes += sizeof(struct mldv2_record);
/*
* Append as many sources as will fit in the first packet.
* If we are appending to a new packet, the chain allocation
* may potentially use clusters; use m_getptr() in this case.
* If we are appending to an existing packet, we need to obtain
* a pointer to the group record after m_append(), in case a new
* mbuf was allocated.
*
* Only append sources which are in-mode at t1. If we are
* transitioning to MCAST_UNDEFINED state on the group, and
* use_block_allow is zero, do not include source entries.
* Otherwise, we need to include this source in the report.
*
* Only report recorded sources in our filter set when responding
* to a group-source query.
*/
if (record_has_sources) {
if (m == m0) {
md = m_last(m);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
md->m_len - nbytes);
} else {
md = m_getptr(m, 0, &off);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
off);
}
msrcs = 0;
RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs,
nims) {
CTR2(KTR_MLD, "%s: visit node %s", __func__,
ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
CTR2(KTR_MLD, "%s: node is %d", __func__, now);
if ((now != mode) ||
(now == mode &&
(!use_block_allow && mode == MCAST_UNDEFINED))) {
CTR1(KTR_MLD, "%s: skip node", __func__);
continue;
}
if (is_source_query && ims->im6s_stp == 0) {
CTR1(KTR_MLD, "%s: skip unrecorded node",
__func__);
continue;
}
CTR1(KTR_MLD, "%s: append node", __func__);
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.",
__func__);
return (-ENOMEM);
}
nbytes += sizeof(struct in6_addr);
++msrcs;
if (msrcs == m0srcs)
break;
}
CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__,
msrcs);
pmr->mr_numsrc = htons(msrcs);
nbytes += (msrcs * sizeof(struct in6_addr));
}
if (is_source_query && msrcs == 0) {
CTR1(KTR_MLD, "%s: no recorded sources to report", __func__);
if (m != m0)
m_freem(m);
return (0);
}
/*
* We are good to go with first packet.
*/
if (m != m0) {
CTR1(KTR_MLD, "%s: enqueueing first packet", __func__);
m->m_pkthdr.PH_vt.vt_nrecs = 1;
mbufq_enqueue(mq, m);
} else
m->m_pkthdr.PH_vt.vt_nrecs++;
/*
* No further work needed if no source list in packet(s).
*/
if (!record_has_sources)
return (nbytes);
/*
* Whilst sources remain to be announced, we need to allocate
* a new packet and fill out as many sources as will fit.
* Always try for a cluster first.
*/
while (nims != NULL) {
if (mbufq_full(mq)) {
CTR1(KTR_MLD, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (-ENOMEM);
mld_save_context(m, ifp);
md = m_getptr(m, 0, &off);
pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off);
CTR1(KTR_MLD, "%s: allocated next packet", __func__);
if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
return (-ENOMEM);
}
m->m_pkthdr.PH_vt.vt_nrecs = 1;
nbytes += sizeof(struct mldv2_record);
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
msrcs = 0;
RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
CTR2(KTR_MLD, "%s: visit node %s",
__func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
if ((now != mode) ||
(now == mode &&
(!use_block_allow && mode == MCAST_UNDEFINED))) {
CTR1(KTR_MLD, "%s: skip node", __func__);
continue;
}
if (is_source_query && ims->im6s_stp == 0) {
CTR1(KTR_MLD, "%s: skip unrecorded node",
__func__);
continue;
}
CTR1(KTR_MLD, "%s: append node", __func__);
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD, "%s: m_append() failed.",
__func__);
return (-ENOMEM);
}
++msrcs;
if (msrcs == m0srcs)
break;
}
pmr->mr_numsrc = htons(msrcs);
nbytes += (msrcs * sizeof(struct in6_addr));
CTR1(KTR_MLD, "%s: enqueueing next packet", __func__);
mbufq_enqueue(mq, m);
}
return (nbytes);
}
/*
* Type used to mark record pass completion.
* We exploit the fact we can cast to this easily from the
* current filter modes on each ip_msource node.
*/
typedef enum {
REC_NONE = 0x00, /* MCAST_UNDEFINED */
REC_ALLOW = 0x01, /* MCAST_INCLUDE */
REC_BLOCK = 0x02, /* MCAST_EXCLUDE */
REC_FULL = REC_ALLOW | REC_BLOCK
} rectype_t;
/*
* Enqueue an MLDv2 filter list change to the given output queue.
*
* Source list filter state is held in an RB-tree. When the filter list
* for a group is changed without changing its mode, we need to compute
* the deltas between T0 and T1 for each source in the filter set,
* and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
*
* As we may potentially queue two record types, and the entire R-B tree
* needs to be walked at once, we break this out into its own function
* so we can generate a tightly packed queue of packets.
*
* XXX This could be written to only use one tree walk, although that makes
* serializing into the mbuf chains a bit harder. For now we do two walks
* which makes things easier on us, and it may or may not be harder on
* the L2 cache.
*
* If successful the size of all data appended to the queue is returned,
* otherwise an error code less than zero is returned, or zero if
* no record(s) were appended.
*/
static int
mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm)
{
static const int MINRECLEN =
sizeof(struct mldv2_record) + sizeof(struct in6_addr);
struct ifnet *ifp;
struct mldv2_record mr;
struct mldv2_record *pmr;
struct ip6_msource *ims, *nims;
struct mbuf *m, *m0, *md;
int m0srcs, nbytes, npbytes, off, rsrcs, schanged;
int nallow, nblock;
uint8_t mode, now, then;
rectype_t crt, drt, nrt;
#ifdef KTR
char ip6tbuf[INET6_ADDRSTRLEN];
#endif
IN6_MULTI_LOCK_ASSERT();
if (inm->in6m_nsrc == 0 ||
(inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
return (0);
ifp = inm->in6m_ifp; /* interface */
mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */
crt = REC_NONE; /* current group record type */
drt = REC_NONE; /* mask of completed group record types */
nrt = REC_NONE; /* record type for current node */
m0srcs = 0; /* # source which will fit in current mbuf chain */
npbytes = 0; /* # of bytes appended this packet */
nbytes = 0; /* # of bytes appended to group's state-change queue */
rsrcs = 0; /* # sources encoded in current record */
schanged = 0; /* # nodes encoded in overall filter change */
nallow = 0; /* # of source entries in ALLOW_NEW */
nblock = 0; /* # of source entries in BLOCK_OLD */
nims = NULL; /* next tree node pointer */
/*
* For each possible filter record mode.
* The first kind of source we encounter tells us which
* is the first kind of record we start appending.
* If a node transitioned to UNDEFINED at t1, its mode is treated
* as the inverse of the group's filter mode.
*/
while (drt != REC_FULL) {
do {
m0 = mbufq_last(mq);
if (m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
MLD_V2_REPORT_MAXRECS) &&
(m0->m_pkthdr.len + MINRECLEN) <
(ifp->if_mtu - MLD_MTUSPACE)) {
m = m0;
m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
CTR1(KTR_MLD,
"%s: use previous packet", __func__);
} else {
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
CTR1(KTR_MLD,
"%s: m_get*() failed", __func__);
return (-ENOMEM);
}
m->m_pkthdr.PH_vt.vt_nrecs = 0;
mld_save_context(m, ifp);
m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
sizeof(struct mldv2_record)) /
sizeof(struct in6_addr);
npbytes = 0;
CTR1(KTR_MLD,
"%s: allocated new packet", __func__);
}
/*
* Append the MLD group record header to the
* current packet's data area.
* Recalculate pointer to free space for next
* group record, in case m_append() allocated
* a new mbuf or cluster.
*/
memset(&mr, 0, sizeof(mr));
mr.mr_addr = inm->in6m_addr;
in6_clearscope(&mr.mr_addr);
if (!m_append(m, sizeof(mr), (void *)&mr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD,
"%s: m_append() failed", __func__);
return (-ENOMEM);
}
npbytes += sizeof(struct mldv2_record);
if (m != m0) {
/* new packet; offset in chain */
md = m_getptr(m, npbytes -
sizeof(struct mldv2_record), &off);
pmr = (struct mldv2_record *)(mtod(md,
uint8_t *) + off);
} else {
/* current packet; offset from last append */
md = m_last(m);
pmr = (struct mldv2_record *)(mtod(md,
uint8_t *) + md->m_len -
sizeof(struct mldv2_record));
}
/*
* Begin walking the tree for this record type
* pass, or continue from where we left off
* previously if we had to allocate a new packet.
* Only report deltas in-mode at t1.
* We need not report included sources as allowed
* if we are in inclusive mode on the group,
* however the converse is not true.
*/
rsrcs = 0;
if (nims == NULL) {
nims = RB_MIN(ip6_msource_tree,
&inm->in6m_srcs);
}
RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
CTR2(KTR_MLD, "%s: visit node %s", __func__,
ip6_sprintf(ip6tbuf, &ims->im6s_addr));
now = im6s_get_mode(inm, ims, 1);
then = im6s_get_mode(inm, ims, 0);
CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d",
__func__, then, now);
if (now == then) {
CTR1(KTR_MLD,
"%s: skip unchanged", __func__);
continue;
}
if (mode == MCAST_EXCLUDE &&
now == MCAST_INCLUDE) {
CTR1(KTR_MLD,
"%s: skip IN src on EX group",
__func__);
continue;
}
nrt = (rectype_t)now;
if (nrt == REC_NONE)
nrt = (rectype_t)(~mode & REC_FULL);
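/*
* Example: a source that vanished (went to MCAST_UNDEFINED) at t1
* is reported as the inverse of the group filter mode. On an
* include-mode group (~REC_ALLOW & REC_FULL) == REC_BLOCK, so the
* removed source lands in a BLOCK_OLD record; on an exclude-mode
* group it becomes REC_ALLOW instead.
*/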
if (schanged++ == 0) {
crt = nrt;
} else if (crt != nrt)
continue;
if (!m_append(m, sizeof(struct in6_addr),
(void *)&ims->im6s_addr)) {
if (m != m0)
m_freem(m);
CTR1(KTR_MLD,
"%s: m_append() failed", __func__);
return (-ENOMEM);
}
nallow += !!(crt == REC_ALLOW);
nblock += !!(crt == REC_BLOCK);
if (++rsrcs == m0srcs)
break;
}
/*
* If we did not append any tree nodes on this
* pass, back out of allocations.
*/
if (rsrcs == 0) {
npbytes -= sizeof(struct mldv2_record);
if (m != m0) {
CTR1(KTR_MLD,
"%s: m_free(m)", __func__);
m_freem(m);
} else {
CTR1(KTR_MLD,
"%s: m_adj(m, -mr)", __func__);
m_adj(m, -((int)sizeof(
struct mldv2_record)));
}
continue;
}
npbytes += (rsrcs * sizeof(struct in6_addr));
if (crt == REC_ALLOW)
pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
else if (crt == REC_BLOCK)
pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
pmr->mr_numsrc = htons(rsrcs);
/*
* Count the new group record, and enqueue this
* packet if it wasn't already queued.
*/
m->m_pkthdr.PH_vt.vt_nrecs++;
if (m != m0)
mbufq_enqueue(mq, m);
nbytes += npbytes;
} while (nims != NULL);
drt |= crt;
crt = (~crt & REC_FULL);
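/*
* The record type just completed is folded into drt and crt flips
* to the remaining type for the second tree walk; once both bits
* are set (REC_FULL) the outer loop terminates.
*/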
}
CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
nallow, nblock);
return (nbytes);
}
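/*
* Merge the group's pending state-change queue into the provided
* per-interface state-change queue (scq). If further retransmissions
* remain for the group, each message is copied rather than dequeued
* so it stays available for the next pass.
* Returns 0 on success, or ENOMEM if a copy could not be made.
*/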
static int
mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq)
{
struct mbufq *gq;
struct mbuf *m; /* pending state-change */
struct mbuf *m0; /* copy of pending state-change */
struct mbuf *mt; /* last state-change in packet */
int docopy, domerge;
u_int recslen;
docopy = 0;
domerge = 0;
recslen = 0;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
/*
* If there are further pending retransmissions, make a writable
* copy of each queued state-change message before merging.
*/
if (inm->in6m_scrv > 0)
docopy = 1;
gq = &inm->in6m_scq;
#ifdef KTR
if (mbufq_first(gq) == NULL) {
CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty",
__func__, inm);
}
#endif
m = mbufq_first(gq);
while (m != NULL) {
/*
* Only merge the report into the current packet if
* there is sufficient space to do so; an MLDv2 report
* packet may only contain 65,535 group records.
* Always use a simple mbuf chain concatenation to do this,
* as large state changes for single groups may have
* allocated clusters.
*/
domerge = 0;
mt = mbufq_last(scq);
if (mt != NULL) {
recslen = m_length(m, NULL);
if ((mt->m_pkthdr.PH_vt.vt_nrecs +
m->m_pkthdr.PH_vt.vt_nrecs <=
MLD_V2_REPORT_MAXRECS) &&
(mt->m_pkthdr.len + recslen <=
(inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
domerge = 1;
}
if (!domerge && mbufq_full(gq)) {
CTR2(KTR_MLD,
"%s: outbound queue full, skipping whole packet %p",
__func__, m);
mt = m->m_nextpkt;
if (!docopy)
m_freem(m);
m = mt;
continue;
}
if (!docopy) {
CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m);
m0 = mbufq_dequeue(gq);
m = m0->m_nextpkt;
} else {
CTR2(KTR_MLD, "%s: copying %p", __func__, m);
m0 = m_dup(m, M_NOWAIT);
if (m0 == NULL)
return (ENOMEM);
m0->m_nextpkt = NULL;
m = m->m_nextpkt;
}
if (!domerge) {
CTR3(KTR_MLD, "%s: queueing %p to scq %p)",
__func__, m0, scq);
mbufq_enqueue(scq, m0);
} else {
struct mbuf *mtl; /* last mbuf of packet mt */
CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)",
__func__, m0, mt);
mtl = m_last(mt);
m0->m_flags &= ~M_PKTHDR;
mt->m_pkthdr.len += recslen;
mt->m_pkthdr.PH_vt.vt_nrecs +=
m0->m_pkthdr.PH_vt.vt_nrecs;
mtl->m_next = m0;
}
}
return (0);
}
/*
* Respond to a pending MLDv2 General Query.
*/
static void
mld_v2_dispatch_general_query(struct mld_ifsoftc *mli)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
struct in6_multi *inm;
int retval;
IN6_MULTI_LOCK_ASSERT();
MLD_LOCK_ASSERT();
KASSERT(mli->mli_version == MLD_VERSION_2,
("%s: called when version %d", __func__, mli->mli_version));
/*
* Check that there are some packets queued. If so, send them first.
* For a large number of groups the reply to a general query can take
* many packets; we should finish sending them before starting to
* queue the new reply.
*/
if (mbufq_len(&mli->mli_gq) != 0)
goto send;
ifp = mli->mli_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6 ||
ifma->ifma_protospec == NULL)
continue;
inm = (struct in6_multi *)ifma->ifma_protospec;
KASSERT(ifp == inm->in6m_ifp,
("%s: inconsistent ifp", __func__));
switch (inm->in6m_state) {
case MLD_NOT_MEMBER:
case MLD_SILENT_MEMBER:
break;
case MLD_REPORTING_MEMBER:
case MLD_IDLE_MEMBER:
case MLD_LAZY_MEMBER:
case MLD_SLEEPING_MEMBER:
case MLD_AWAKENING_MEMBER:
inm->in6m_state = MLD_REPORTING_MEMBER;
retval = mld_v2_enqueue_group_record(&mli->mli_gq,
inm, 0, 0, 0, 0);
CTR2(KTR_MLD, "%s: enqueue record = %d",
__func__, retval);
break;
case MLD_G_QUERY_PENDING_MEMBER:
case MLD_SG_QUERY_PENDING_MEMBER:
case MLD_LEAVING_MEMBER:
break;
}
}
IF_ADDR_RUNLOCK(ifp);
send:
mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST);
/*
* Slew transmission of bursts over 500ms intervals.
*/
if (mbufq_first(&mli->mli_gq) != NULL) {
mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
MLD_RESPONSE_BURST_INTERVAL);
V_interface_timers_running6 = 1;
}
}
/*
* Transmit the next pending message in the output queue.
*
* VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
* MRT: Nothing needs to be done, as MLD traffic is always local to
* a link and uses a link-scope multicast address.
*/
static void
mld_dispatch_packet(struct mbuf *m)
{
struct ip6_moptions im6o;
struct ifnet *ifp;
struct ifnet *oifp;
struct mbuf *m0;
struct mbuf *md;
struct ip6_hdr *ip6;
struct mld_hdr *mld;
int error;
int off;
int type;
uint32_t ifindex;
CTR2(KTR_MLD, "%s: transmit %p", __func__, m);
/*
* Set VNET image pointer from enqueued mbuf chain
* before doing anything else. Whilst we use interface
* indexes to guard against interface detach, they are
* unique to each VIMAGE and must be retrieved.
*/
ifindex = mld_restore_context(m);
/*
* Check if the ifnet still exists. This limits the scope of
* any race in the absence of a global ifp lock for low cost
* (an array lookup).
*/
ifp = ifnet_byindex(ifindex);
if (ifp == NULL) {
CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.",
__func__, m, ifindex);
m_freem(m);
IP6STAT_INC(ip6s_noroute);
goto out;
}
im6o.im6o_multicast_hlim = 1;
im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL);
im6o.im6o_multicast_ifp = ifp;
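/*
* MLD messages are sent with an IPv6 hop limit of 1; loop the
* packet back only when an IPv6 multicast router is running on
* this host.
*/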
if (m->m_flags & M_MLDV1) {
m0 = m;
} else {
m0 = mld_v2_encap_report(ifp, m);
if (m0 == NULL) {
CTR2(KTR_MLD, "%s: dropped %p", __func__, m);
IP6STAT_INC(ip6s_odropped);
goto out;
}
}
mld_scrub_context(m0);
m_clrprotoflags(m);
m0->m_pkthdr.rcvif = V_loif;
ip6 = mtod(m0, struct ip6_hdr *);
#if 0
(void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */
#else
/*
* XXX XXX Break some KPI rules to prevent an LOR which would
* occur if we called in6_setscope() at transmission.
* See comments at top of file.
*/
MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index);
#endif
/*
* Retrieve the ICMPv6 type before handoff to ip6_output(),
* so we can bump the stats.
*/
md = m_getptr(m0, sizeof(struct ip6_hdr), &off);
mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off);
type = mld->mld_type;
error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o,
&oifp, NULL);
if (error) {
CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error);
goto out;
}
ICMP6STAT_INC(icp6s_outhist[type]);
if (oifp != NULL) {
icmp6_ifstat_inc(oifp, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_REPORT:
case MLDV2_LISTENER_REPORT:
icmp6_ifstat_inc(oifp, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(oifp, ifs6_out_mlddone);
break;
}
}
out:
return;
}
/*
* Encapsulate an MLDv2 report.
*
* KAME IPv6 requires that hop-by-hop options be passed separately,
* and that the IPv6 header be prepended in a separate mbuf.
*
* Returns a pointer to the new mbuf chain head, or NULL if the
* allocation failed.
*/
static struct mbuf *
mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m)
{
struct mbuf *mh;
struct mldv2_report *mld;
struct ip6_hdr *ip6;
struct in6_ifaddr *ia;
int mldreclen;
KASSERT(ifp != NULL, ("%s: null ifp", __func__));
KASSERT((m->m_flags & M_PKTHDR),
("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
/*
* RFC3590: OK to send as :: or tentative during DAD.
*/
ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if (ia == NULL)
CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__);
mh = m_gethdr(M_NOWAIT, MT_DATA);
if (mh == NULL) {
if (ia != NULL)
ifa_free(&ia->ia_ifa);
m_freem(m);
return (NULL);
}
M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
mldreclen = m_length(m, NULL);
CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen);
mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report);
mh->m_pkthdr.len = sizeof(struct ip6_hdr) +
sizeof(struct mldv2_report) + mldreclen;
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
if (ia != NULL)
ifa_free(&ia->ia_ifa);
ip6->ip6_dst = in6addr_linklocal_allv2routers;
/* scope ID will be set in netisr */
mld = (struct mldv2_report *)(ip6 + 1);
mld->mld_type = MLDV2_LISTENER_REPORT;
mld->mld_code = 0;
mld->mld_cksum = 0;
mld->mld_v2_reserved = 0;
mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs);
m->m_pkthdr.PH_vt.vt_nrecs = 0;
mh->m_next = m;
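/*
* The checksum starts right after the IPv6 header and covers the
* report header plus all appended group records; in6_cksum() also
* folds in the IPv6 pseudo-header.
*/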
mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
return (mh);
}
#ifdef KTR
static char *
mld_rec_type_to_str(const int type)
{
switch (type) {
case MLD_CHANGE_TO_EXCLUDE_MODE:
return "TO_EX";
break;
case MLD_CHANGE_TO_INCLUDE_MODE:
return "TO_IN";
break;
case MLD_MODE_IS_EXCLUDE:
return "MODE_EX";
break;
case MLD_MODE_IS_INCLUDE:
return "MODE_IN";
break;
case MLD_ALLOW_NEW_SOURCES:
return "ALLOW_NEW";
break;
case MLD_BLOCK_OLD_SOURCES:
return "BLOCK_OLD";
break;
default:
break;
}
return "unknown";
}
#endif
static void
mld_init(void *unused __unused)
{
CTR1(KTR_MLD, "%s: initializing", __func__);
MLD_LOCK_INIT();
ip6_initpktopts(&mld_po);
mld_po.ip6po_hlim = 1;
mld_po.ip6po_hbh = &mld_ra.hbh;
mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
mld_po.ip6po_flags = IP6PO_DONTFRAG;
}
SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL);
static void
mld_uninit(void *unused __unused)
{
CTR1(KTR_MLD, "%s: tearing down", __func__);
MLD_LOCK_DESTROY();
}
SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL);
static void
vnet_mld_init(const void *unused __unused)
{
CTR1(KTR_MLD, "%s: initializing", __func__);
LIST_INIT(&V_mli_head);
}
VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init,
NULL);
static void
vnet_mld_uninit(const void *unused __unused)
{
/* This can happen if we shutdown the network stack. */
CTR1(KTR_MLD, "%s: tearing down", __func__);
}
VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit,
NULL);
static int
mld_modevent(module_t mod, int type, void *unused __unused)
{
switch (type) {
case MOD_LOAD:
case MOD_UNLOAD:
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t mld_mod = {
"mld",
mld_modevent,
0
};
DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY);
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c (revision 327172)
+++ head/sys/netinet6/nd6.c (revision 327173)
@@ -1,2765 +1,2760 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/iso88025.h>
#include <net/fddi.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <net/if_llatbl.h>
#include <netinet/if_ether.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/icmp6.h>
#include <netinet6/send.h>
#include <sys/limits.h>
#include <security/mac/mac_framework.h>
#define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
#define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
#define SIN6(s) ((const struct sockaddr_in6 *)(s))
MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
/* timer values */
VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 second */
VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 seconds */
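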
VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */
VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */
VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for
* local traffic */
VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage
* collection timer */
/* preventing too many loops in ND option parsing */
static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper
* layer hints */
static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved
* ND entries */
#define V_nd6_maxndopt VNET(nd6_maxndopt)
#define V_nd6_maxqueuelen VNET(nd6_maxqueuelen)
#ifdef ND6_DEBUG
VNET_DEFINE(int, nd6_debug) = 1;
#else
VNET_DEFINE(int, nd6_debug) = 0;
#endif
static eventhandler_tag lle_event_eh, iflladdr_event_eh;
VNET_DEFINE(struct nd_drhead, nd_defrouter);
VNET_DEFINE(struct nd_prhead, nd_prefix);
VNET_DEFINE(struct rwlock, nd6_lock);
VNET_DEFINE(uint64_t, nd6_list_genid);
VNET_DEFINE(struct mtx, nd6_onlink_mtx);
VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
#define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval)
int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *,
struct ifnet *);
static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
static void nd6_slowtimo(void *);
static int regen_tmpaddr(struct in6_ifaddr *);
static void nd6_free(struct llentry **, int);
static void nd6_free_redirect(const struct llentry *);
static void nd6_llinfo_timer(void *);
static void nd6_llinfo_settimer_locked(struct llentry *, long);
static void clear_llinfo_pqueue(struct llentry *);
static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *,
const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **);
static int nd6_need_cache(struct ifnet *);
static VNET_DEFINE(struct callout, nd6_slowtimo_ch);
#define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch)
VNET_DEFINE(struct callout, nd6_timer_ch);
#define V_nd6_timer_ch VNET(nd6_timer_ch)
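/*
* Translate llentry resolution/expiry events into RTM_ADD/RTM_DELETE
* routing-socket messages so userland can track changes to the IPv6
* neighbor cache.
*/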
static void
nd6_lle_event(void *arg __unused, struct llentry *lle, int evt)
{
struct rt_addrinfo rtinfo;
struct sockaddr_in6 dst;
struct sockaddr_dl gw;
struct ifnet *ifp;
int type;
int fibnum;
LLE_WLOCK_ASSERT(lle);
if (lltable_get_af(lle->lle_tbl) != AF_INET6)
return;
switch (evt) {
case LLENTRY_RESOLVED:
type = RTM_ADD;
KASSERT(lle->la_flags & LLE_VALID,
("%s: %p resolved but not valid?", __func__, lle));
break;
case LLENTRY_EXPIRED:
type = RTM_DELETE;
break;
default:
return;
}
ifp = lltable_get_ifp(lle->lle_tbl);
bzero(&dst, sizeof(dst));
bzero(&gw, sizeof(gw));
bzero(&rtinfo, sizeof(rtinfo));
lltable_fill_sa_entry(lle, (struct sockaddr *)&dst);
dst.sin6_scope_id = in6_getscopezone(ifp,
in6_addrscope(&dst.sin6_addr));
gw.sdl_len = sizeof(struct sockaddr_dl);
gw.sdl_family = AF_LINK;
gw.sdl_alen = ifp->if_addrlen;
gw.sdl_index = ifp->if_index;
gw.sdl_type = ifp->if_type;
if (evt == LLENTRY_RESOLVED)
bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen);
rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst;
rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw;
rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY;
fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib;
rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | (
type == RTM_ADD ? RTF_UP: 0), 0, fibnum);
}
/*
* A handler for interface link layer address change event.
*/
static void
nd6_iflladdr(void *arg __unused, struct ifnet *ifp)
{
lltable_update_ifaddr(LLTABLE6(ifp));
}
void
nd6_init(void)
{
mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF);
rw_init(&V_nd6_lock, "nd6 list");
LIST_INIT(&V_nd_prefix);
TAILQ_INIT(&V_nd_defrouter);
/* Start timers. */
callout_init(&V_nd6_slowtimo_ch, 0);
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, curvnet);
callout_init(&V_nd6_timer_ch, 0);
callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet);
nd6_dad_init();
if (IS_DEFAULT_VNET(curvnet)) {
lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event,
NULL, EVENTHANDLER_PRI_ANY);
iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event,
nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
}
}
#ifdef VIMAGE
void
nd6_destroy()
{
callout_drain(&V_nd6_slowtimo_ch);
callout_drain(&V_nd6_timer_ch);
if (IS_DEFAULT_VNET(curvnet)) {
EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh);
}
rw_destroy(&V_nd6_lock);
mtx_destroy(&V_nd6_onlink_mtx);
}
#endif
struct nd_ifinfo *
nd6_ifattach(struct ifnet *ifp)
{
struct nd_ifinfo *nd;
nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO);
nd->initialized = 1;
nd->chlim = IPV6_DEFHLIM;
nd->basereachable = REACHABLE_TIME;
nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
nd->retrans = RETRANS_TIMER;
nd->flags = ND6_IFF_PERFORMNUD;
/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
* XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
* default regardless of the V_ip6_auto_linklocal configuration to
* give a reasonable default behavior.
*/
if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) ||
(ifp->if_flags & IFF_LOOPBACK))
nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
/*
* A loopback interface does not need to accept RTADV.
* XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
* default regardless of the V_ip6_accept_rtadv configuration to
* prevent the interface from accepting RA messages arrived
* on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
*/
if (V_ip6_accept_rtadv &&
!(ifp->if_flags & IFF_LOOPBACK) &&
(ifp->if_type != IFT_BRIDGE))
nd->flags |= ND6_IFF_ACCEPT_RTADV;
if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK))
nd->flags |= ND6_IFF_NO_RADR;
/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
nd6_setmtu0(ifp, nd);
return nd;
}
void
nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd)
{
struct ifaddr *ifa, *next;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
/* stop DAD processing */
nd6_dad_stop(ifa);
}
IF_ADDR_RUNLOCK(ifp);
free(nd, M_IP6NDP);
}
/*
* Reset ND level link MTU. This function is called when the physical MTU
* changes, which means we might have to adjust the ND level MTU.
*/
void
nd6_setmtu(struct ifnet *ifp)
{
if (ifp->if_afdata[AF_INET6] == NULL)
return;
nd6_setmtu0(ifp, ND_IFINFO(ifp));
}
/* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */
void
nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
{
u_int32_t omaxmtu;
omaxmtu = ndi->maxmtu;
switch (ifp->if_type) {
case IFT_ARCNET:
ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
break;
case IFT_FDDI:
ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
break;
case IFT_ISO88025:
ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
break;
default:
ndi->maxmtu = ifp->if_mtu;
break;
}
/*
* Decreasing the interface MTU below the IPv6 minimum MTU may cause
* an undesirable situation. We thus notify the operator of the change
* explicitly. The check for omaxmtu is necessary to restrict the
* log to the case of changing the MTU, not initializing it.
*/
if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
log(LOG_NOTICE, "nd6_setmtu0: "
"new link MTU on %s (%lu) is too small for IPv6\n",
if_name(ifp), (unsigned long)ndi->maxmtu);
}
if (ndi->maxmtu > V_in6_maxmtu)
in6_setmaxmtu(); /* check all interfaces just in case */
}
void
nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
{
bzero(ndopts, sizeof(*ndopts));
ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
ndopts->nd_opts_last
= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
if (icmp6len == 0) {
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
}
/*
* Take one ND option.
*/
struct nd_opt_hdr *
nd6_option(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int olen;
KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
__func__));
if (ndopts->nd_opts_search == NULL)
return NULL;
if (ndopts->nd_opts_done)
return NULL;
nd_opt = ndopts->nd_opts_search;
/* make sure nd_opt_len is inside the buffer */
if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
olen = nd_opt->nd_opt_len << 3;
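/* The ND option length field counts units of 8 octets (RFC 4861). */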
if (olen == 0) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
/* option overruns the end of buffer, invalid */
bzero(ndopts, sizeof(*ndopts));
return NULL;
} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
/* reached the end of options chain */
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
return nd_opt;
}
/*
* Parse multiple ND options.
* This function is much easier to use for ND routines that do not need
* multiple options of the same type.
*/
int
nd6_options(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int i = 0;
KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
__func__));
if (ndopts->nd_opts_search == NULL)
return 0;
while (1) {
nd_opt = nd6_option(ndopts);
if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
ICMP6STAT_INC(icp6s_nd_badopt);
bzero(ndopts, sizeof(*ndopts));
return -1;
}
if (nd_opt == NULL)
goto skip1;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LINKADDR:
case ND_OPT_TARGET_LINKADDR:
case ND_OPT_MTU:
case ND_OPT_REDIRECTED_HEADER:
case ND_OPT_NONCE:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
nd6log((LOG_INFO,
"duplicated ND6 option found (type=%d)\n",
nd_opt->nd_opt_type));
/* XXX bark? */
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
break;
case ND_OPT_PREFIX_INFORMATION:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
ndopts->nd_opts_pi_end =
(struct nd_opt_prefix_info *)nd_opt;
break;
/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
case ND_OPT_RDNSS: /* RFC 6106 */
case ND_OPT_DNSSL: /* RFC 6106 */
/*
* Silently ignore options we know and do not care about
* in the kernel.
*/
break;
default:
/*
* Unknown options must be silently ignored,
* to accommodate future extension to the protocol.
*/
nd6log((LOG_DEBUG,
"nd6_options: unsupported option %d - "
"option ignored\n", nd_opt->nd_opt_type));
}
skip1:
i++;
if (i > V_nd6_maxndopt) {
ICMP6STAT_INC(icp6s_nd_toomanyopt);
nd6log((LOG_INFO, "too many loop in nd opt\n"));
break;
}
if (ndopts->nd_opts_done)
break;
}
return 0;
}
/*
* ND6 timer routine to handle ND6 entries
*/
static void
nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
{
int canceled;
LLE_WLOCK_ASSERT(ln);
if (tick < 0) {
ln->la_expire = 0;
ln->ln_ntick = 0;
canceled = callout_stop(&ln->lle_timer);
} else {
ln->la_expire = time_uptime + tick / hz;
LLE_ADDREF(ln);
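/*
* Timeouts larger than INT_MAX ticks are split: the remainder is
* parked in ln_ntick and re-armed in INT_MAX chunks from
* nd6_llinfo_timer().
*/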
if (tick > INT_MAX) {
ln->ln_ntick = tick - INT_MAX;
canceled = callout_reset(&ln->lle_timer, INT_MAX,
nd6_llinfo_timer, ln);
} else {
ln->ln_ntick = 0;
canceled = callout_reset(&ln->lle_timer, tick,
nd6_llinfo_timer, ln);
}
}
if (canceled > 0)
LLE_REMREF(ln);
}
/*
* Gets source address of the first packet in hold queue
* and stores it in @src.
* Returns pointer to @src (if hold queue is not empty) or NULL.
*
* Set noinline to be dtrace-friendly
*/
static __noinline struct in6_addr *
nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src)
{
struct ip6_hdr hdr;
struct mbuf *m;
if (ln->la_hold == NULL)
return (NULL);
/*
* assume every packet in la_hold has the same IP header
*/
m = ln->la_hold;
if (sizeof(hdr) > m->m_len)
return (NULL);
m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
*src = hdr.ip6_src;
return (src);
}
/*
* Checks if we need to switch from STALE state.
*
* RFC 4861 requires switching from STALE to DELAY state
* on the first packet matching the entry, waiting V_nd6_delay and
* then transitioning to PROBE state (if upper-layer confirmation
* was not received).
*
* This code behaves a bit differently:
* on a packet hit we don't change state (though the desired state
* can be inferred by the control plane). However, after V_nd6_delay
* seconds the code will transition to PROBE state (so the DELAY state
* is effectively skipped in most situations).
*
* Typically, V_nd6_gctimer is bigger than V_nd6_delay, so
* we perform the following upon entering STALE state:
*
* 1) Arm the timer to run every V_nd6_delay seconds so that, if a
* packet was transmitted at the start of a given interval, we can
* still switch to PROBE state within V_nd6_delay seconds, as the
* user expects.
*
* 2) Reschedule the timer until the original V_nd6_gctimer interval
* expires, keeping the lle in STALE state (the remaining timer
* value is stored in lle_remtime).
*
* 3) Reschedule the timer if the packet was transmitted less than
* V_nd6_delay seconds ago.
*
* Returns a non-zero value if the entry is still STALE (storing
* the next timer interval in @pdelay).
*
* Returns zero if the original timer expired or we need to switch to
* PROBE (stored in the @do_switch variable).
*/
static int
nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch)
{
int nd_delay, nd_gctimer, r_skip_req;
time_t lle_hittime;
long delay;
*do_switch = 0;
nd_gctimer = V_nd6_gctimer;
nd_delay = V_nd6_delay;
LLE_REQ_LOCK(lle);
r_skip_req = lle->r_skip_req;
lle_hittime = lle->lle_hittime;
LLE_REQ_UNLOCK(lle);
if (r_skip_req > 0) {
/*
* Nonzero r_skip_req value was set upon entering
* STALE state. Since value was not changed, no
* packets were passed using this lle. Ask for
* timer reschedule and keep STALE state.
*/
delay = (long)(MIN(nd_gctimer, nd_delay));
delay *= hz;
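/*
* Illustrative example with the defaults above (nd6_delay = 5,
* nd6_gctimer = 1 day): the timer re-arms every 5 seconds while
* lle_remtime draws down the remaining portion of the day.
*/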
if (lle->lle_remtime > delay)
lle->lle_remtime -= delay;
else {
delay = lle->lle_remtime;
lle->lle_remtime = 0;
}
if (delay == 0) {
/*
* The original nd6_gctimer timeout ended,
* no more rescheduling.
*/
return (0);
}
*pdelay = delay;
return (1);
}
/*
* Packet received. Verify timestamp
*/
delay = (long)(time_uptime - lle_hittime);
if (delay < nd_delay) {
/*
* V_nd6_delay has still not passed since the first
* hit in STALE state.
* Reschedule the timer and return.
*/
*pdelay = (long)(nd_delay - delay) * hz;
return (1);
}
/* Request switching to probe */
*do_switch = 1;
return (0);
}
/*
* Switch @lle state to new state optionally arming timers.
*
* Set noinline to be dtrace-friendly
*/
__noinline void
nd6_llinfo_setstate(struct llentry *lle, int newstate)
{
struct ifnet *ifp;
int nd_gctimer, nd_delay;
long delay, remtime;
delay = 0;
remtime = 0;
switch (newstate) {
case ND6_LLINFO_INCOMPLETE:
ifp = lle->lle_tbl->llt_ifp;
delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000;
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(lle)) {
ifp = lle->lle_tbl->llt_ifp;
delay = (long)ND_IFINFO(ifp)->reachable * hz;
}
break;
case ND6_LLINFO_STALE:
/*
* Notify fast path that we want to know if any packet
* is transmitted by setting r_skip_req.
*/
LLE_REQ_LOCK(lle);
lle->r_skip_req = 1;
LLE_REQ_UNLOCK(lle);
nd_delay = V_nd6_delay;
nd_gctimer = V_nd6_gctimer;
delay = (long)(MIN(nd_gctimer, nd_delay)) * hz;
remtime = (long)nd_gctimer * hz - delay;
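/*
* remtime is the portion of the gctimer interval not covered by
* the first short timeout; nd6_is_stale() draws it down on each
* subsequent reschedule.
*/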
break;
case ND6_LLINFO_DELAY:
lle->la_asked = 0;
delay = (long)V_nd6_delay * hz;
break;
}
if (delay > 0)
nd6_llinfo_settimer_locked(lle, delay);
lle->lle_remtime = remtime;
lle->ln_state = newstate;
}
/*
* Timer-dependent part of nd state machine.
*
* Set noinline to be dtrace-friendly
*/
static __noinline void
nd6_llinfo_timer(void *arg)
{
struct llentry *ln;
struct in6_addr *dst, *pdst, *psrc, src;
struct ifnet *ifp;
struct nd_ifinfo *ndi;
int do_switch, send_ns;
long delay;
KASSERT(arg != NULL, ("%s: arg NULL", __func__));
ln = (struct llentry *)arg;
ifp = lltable_get_ifp(ln->lle_tbl);
CURVNET_SET(ifp->if_vnet);
ND6_RLOCK();
LLE_WLOCK(ln);
if (callout_pending(&ln->lle_timer)) {
/*
* We are a bit odd here in the treatment of active/pending.
* If the pending bit is set, the callout got rescheduled before
* this handler ran, so we just want to bail: callout_reset()
* would have returned 1 and our reference would already have
* been removed by nd6_llinfo_settimer_locked() above, since
* canceled would have been 1.
* The active bit we ignore: if the callout was stopped in
* ll_tablefree() while it was currently running, callout_stop()
* would have returned 0, so that code could not stop it and did
* not delete the entry; we therefore want to go through with
* the delete here now.
*/
LLE_WUNLOCK(ln);
ND6_RUNLOCK();
CURVNET_RESTORE();
return;
}
ndi = ND_IFINFO(ifp);
send_ns = 0;
dst = &ln->r_l3addr.addr6;
pdst = dst;
if (ln->ln_ntick > 0) {
if (ln->ln_ntick > INT_MAX) {
ln->ln_ntick -= INT_MAX;
nd6_llinfo_settimer_locked(ln, INT_MAX);
} else {
ln->ln_ntick = 0;
nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
}
goto done;
}
if (ln->la_flags & LLE_STATIC) {
goto done;
}
if (ln->la_flags & LLE_DELETED) {
nd6_free(&ln, 0);
goto done;
}
switch (ln->ln_state) {
case ND6_LLINFO_INCOMPLETE:
if (ln->la_asked < V_nd6_mmaxtries) {
ln->la_asked++;
send_ns = 1;
/* Send NS to multicast address */
pdst = NULL;
} else {
struct mbuf *m = ln->la_hold;
if (m) {
struct mbuf *m0;
/*
* assuming every packet in la_hold has the
* same IP header. Send error after unlock.
*/
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
ln->la_hold = m0;
clear_llinfo_pqueue(ln);
}
nd6_free(&ln, 0);
if (m != NULL)
icmp6_error2(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0, ifp);
}
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(ln))
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
break;
case ND6_LLINFO_STALE:
if (nd6_is_stale(ln, &delay, &do_switch) != 0) {
/*
* No packet has used this entry and the GC timeout
* has not yet passed. Reschedule the timer and
* return.
*/
nd6_llinfo_settimer_locked(ln, delay);
break;
}
if (do_switch == 0) {
/*
* GC timer has ended and entry hasn't been used.
* Run Garbage collector (RFC 4861, 5.3)
*/
if (!ND6_LLINFO_PERMANENT(ln))
nd6_free(&ln, 1);
break;
}
/* Entry has been used AND delay timer has ended. */
/* FALLTHROUGH */
case ND6_LLINFO_DELAY:
if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
/* We need NUD */
ln->la_asked = 1;
nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE);
send_ns = 1;
} else
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */
break;
case ND6_LLINFO_PROBE:
if (ln->la_asked < V_nd6_umaxtries) {
ln->la_asked++;
send_ns = 1;
} else {
nd6_free(&ln, 0);
}
break;
default:
panic("%s: paths in a dark night can be confusing: %d",
__func__, ln->ln_state);
}
done:
if (ln != NULL)
ND6_RUNLOCK();
if (send_ns != 0) {
nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
psrc = nd6_llinfo_get_holdsrc(ln, &src);
LLE_FREE_LOCKED(ln);
ln = NULL;
nd6_ns_output(ifp, psrc, pdst, dst, NULL);
}
if (ln != NULL)
LLE_FREE_LOCKED(ln);
CURVNET_RESTORE();
}
/*
* ND6 timer routine to expire default route list and prefix list
*/
void
nd6_timer(void *arg)
{
CURVNET_SET((struct vnet *) arg);
struct nd_drhead drq;
struct nd_prhead prl;
struct nd_defrouter *dr, *ndr;
struct nd_prefix *pr, *npr;
struct in6_ifaddr *ia6, *nia6;
uint64_t genid;
TAILQ_INIT(&drq);
LIST_INIT(&prl);
ND6_WLOCK();
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr)
if (dr->expire && dr->expire < time_uptime)
defrouter_unlink(dr, &drq);
ND6_WUNLOCK();
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
/*
* Expire interface addresses.
* In the past the loop was inside prefix expiry processing.
* However, from a stricter spec-conformance standpoint, we should
* rather separate address lifetimes and prefix lifetimes.
*
* XXXRW: in6_ifaddrhead locking.
*/
addrloop:
TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
/* check address lifetime */
if (IFA6_IS_INVALID(ia6)) {
int regen = 0;
/*
* If the expiring address is temporary, try
* regenerating a new one. This would be useful when
* we suspended a laptop PC, then turned it on after a
* period that could invalidate all temporary
* addresses. Although we may have to restart the
* loop (see below), it must be after purging the
* address. Otherwise, we'd see an infinite loop of
* regeneration.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
if (regen_tmpaddr(ia6) == 0)
regen = 1;
}
in6_purgeaddr(&ia6->ia_ifa);
if (regen)
goto addrloop; /* XXX: see below */
} else if (IFA6_IS_DEPRECATED(ia6)) {
int oldflags = ia6->ia6_flags;
ia6->ia6_flags |= IN6_IFF_DEPRECATED;
/*
* If a temporary address has just become deprecated,
* regenerate a new one if possible.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
(oldflags & IN6_IFF_DEPRECATED) == 0) {
if (regen_tmpaddr(ia6) == 0) {
/*
* A new temporary address is
* generated.
* XXX: this means the address chain
* has changed while we are still in
* the loop. Although the change
* would not cause disaster (because
* it's not a deletion, but an
* addition,) we'd rather restart the
* loop just for safety. Or does this
* significantly reduce performance??
*/
goto addrloop;
}
}
} else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) {
/*
* Schedule DAD for a tentative address. This happens
* if the interface was down or not running
* when the address was configured.
*/
int delay;
delay = arc4random() %
(MAX_RTR_SOLICITATION_DELAY * hz);
nd6_dad_start((struct ifaddr *)ia6, delay);
} else {
/*
* Check status of the interface. If it is down,
* mark the address as tentative for future DAD.
*/
if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 ||
(ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING)
== 0 ||
(ND_IFINFO(ia6->ia_ifp)->flags &
ND6_IFF_IFDISABLED) != 0) {
ia6->ia6_flags &= ~IN6_IFF_DUPLICATED;
ia6->ia6_flags |= IN6_IFF_TENTATIVE;
}
/*
* A new RA might have made a deprecated address
* preferred.
*/
ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
}
}
ND6_WLOCK();
restart:
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
/*
* Expire prefixes. Since the pltime is only used for
* autoconfigured addresses, pltime processing for prefixes is
* not necessary.
*
* Only unlink after all derived addresses have expired. This
* may not occur until two hours after the prefix has expired
* per RFC 4862. If the prefix expires before its derived
* addresses, mark it off-link. This will be done automatically
* after unlinking if no address references remain.
*/
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME ||
time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime)
continue;
if (pr->ndpr_addrcnt == 0) {
nd6_prefix_unlink(pr, &prl);
continue;
}
if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
genid = V_nd6_list_genid;
nd6_prefix_ref(pr);
ND6_WUNLOCK();
ND6_ONLINK_LOCK();
(void)nd6_prefix_offlink(pr);
ND6_ONLINK_UNLOCK();
ND6_WLOCK();
nd6_prefix_rele(pr);
if (genid != V_nd6_list_genid)
goto restart;
}
}
ND6_WUNLOCK();
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
nd6_prefix_del(pr);
}
callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
nd6_timer, curvnet);
CURVNET_RESTORE();
}
/*
* ia6 - deprecated/invalidated temporary address
*/
static int
regen_tmpaddr(struct in6_ifaddr *ia6)
{
struct ifaddr *ifa;
struct ifnet *ifp;
struct in6_ifaddr *public_ifa6 = NULL;
ifp = ia6->ia_ifa.ifa_ifp;
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct in6_ifaddr *it6;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
it6 = (struct in6_ifaddr *)ifa;
/* ignore non-autoconf addresses. */
if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
/* ignore autoconf addresses with different prefixes. */
if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
continue;
/*
* Now we are looking at an autoconf address with the same
* prefix as ours. If the address is temporary and is still
* preferred, do not create another one. It would be rare, but
* could happen, for example, when we resume a laptop PC after
* a long period.
*/
if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
!IFA6_IS_DEPRECATED(it6)) {
public_ifa6 = NULL;
break;
}
/*
* This is a public autoconf address that has the same prefix
* as ours. If it is preferred, keep it. We can't break the
* loop here, because there may be a still-preferred temporary
* address with the prefix.
*/
if (!IFA6_IS_DEPRECATED(it6))
public_ifa6 = it6;
}
if (public_ifa6 != NULL)
ifa_ref(&public_ifa6->ia_ifa);
IF_ADDR_RUNLOCK(ifp);
if (public_ifa6 != NULL) {
int e;
if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
ifa_free(&public_ifa6->ia_ifa);
log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
" tmp addr,errno=%d\n", e);
return (-1);
}
ifa_free(&public_ifa6->ia_ifa);
return (0);
}
return (-1);
}
/*
* Remove prefix and default router list entries corresponding to ifp. Neighbor
* cache entries are freed in in6_domifdetach().
*/
void
nd6_purge(struct ifnet *ifp)
{
struct nd_drhead drq;
struct nd_prhead prl;
struct nd_defrouter *dr, *ndr;
struct nd_prefix *pr, *npr;
TAILQ_INIT(&drq);
LIST_INIT(&prl);
/*
* Nuke default router list entries toward ifp.
* We defer removal of default router list entries that are installed
* in the routing table, in order to keep additional side effects as
* small as possible.
*/
ND6_WLOCK();
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
if (dr->installed)
continue;
if (dr->ifp == ifp)
defrouter_unlink(dr, &drq);
}
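/* Second pass: unlink the entries that are installed in the routing table. */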
TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
if (!dr->installed)
continue;
if (dr->ifp == ifp)
defrouter_unlink(dr, &drq);
}
/*
* Remove prefixes on ifp. We should have already removed addresses on
* this interface, so no addresses should be referencing these prefixes.
*/
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
if (pr->ndpr_ifp == ifp)
nd6_prefix_unlink(pr, &prl);
}
ND6_WUNLOCK();
/* Delete the unlinked router and prefix objects. */
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
nd6_prefix_del(pr);
}
/* cancel default outgoing interface setting */
if (V_nd6_defifindex == ifp->if_index)
nd6_setdefaultiface(0);
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
/* Refresh default router list. */
defrouter_select_fib(ifp->if_fib);
}
}
/*
* The caller acquires and releases the lock on the lltbls.
* Returns the llentry locked.
*/
struct llentry *
nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
IF_AFDATA_LOCK_ASSERT(ifp);
ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6);
return (ln);
}
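/*
* Allocate an unlinked neighbor cache entry for @addr6 in the
* ND6_LLINFO_NOSTATE state. The caller is responsible for linking it
* into the interface's lltable.
*/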
struct llentry *
nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6);
if (ln != NULL)
ln->ln_state = ND6_LLINFO_NOSTATE;
return (ln);
}
/*
* Test whether a given IPv6 address is a neighbor or not, ignoring
* the actual neighbor cache. The neighbor cache is ignored in order
* to not reenter the routing code from within itself.
*/
static int
nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct nd_prefix *pr;
struct ifaddr *ifa;
struct rt_addrinfo info;
struct sockaddr_in6 rt_key;
const struct sockaddr *dst6;
uint64_t genid;
int error, fibnum;
/*
* A link-local address is always a neighbor.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
struct sockaddr_in6 sin6_copy;
u_int32_t zone;
/*
* We need sin6_copy since sa6_recoverscope() may modify the
* content (XXX).
*/
sin6_copy = *addr;
if (sa6_recoverscope(&sin6_copy))
return (0); /* XXX: should be impossible */
if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
return (0);
if (sin6_copy.sin6_scope_id == zone)
return (1);
else
return (0);
}
bzero(&rt_key, sizeof(rt_key));
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key;
/*
* If the address matches one of our addresses,
* it should be a neighbor.
* If the address matches one of our on-link prefixes, it should be a
* neighbor.
*/
ND6_RLOCK();
restart:
LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
if (pr->ndpr_ifp != ifp)
continue;
if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
dst6 = (const struct sockaddr *)&pr->ndpr_prefix;
/*
* We only need to check all FIBs if add_addr_allfibs
* is unset. If set, checking any FIB will suffice.
*/
fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0;
for (; fibnum < rt_numfibs; fibnum++) {
genid = V_nd6_list_genid;
ND6_RUNLOCK();
/*
* Restore length field before
* retrying lookup
*/
rt_key.sin6_len = sizeof(rt_key);
error = rib_lookup_info(fibnum, dst6, 0, 0,
&info);
ND6_RLOCK();
if (genid != V_nd6_list_genid)
goto restart;
if (error == 0)
break;
}
if (error != 0)
continue;
/*
* This is the case where multiple interfaces
* have the same prefix, but only one is installed
* into the routing table and that prefix entry
* is not the one being examined here. In the case
* where RADIX_MPATH is enabled, multiple route
* entries (of the same rt_key value) will be
* installed because the interface addresses all
* differ.
*/
if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
&rt_key.sin6_addr))
continue;
}
if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
&addr->sin6_addr, &pr->ndpr_mask)) {
ND6_RUNLOCK();
return (1);
}
}
ND6_RUNLOCK();
/*
* If the address is assigned to the node on the other side of
* a p2p interface, the address should be a neighbor.
*/
if (ifp->if_flags & IFF_POINTOPOINT) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sin6_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
IF_ADDR_RUNLOCK(ifp);
return 1;
}
}
IF_ADDR_RUNLOCK(ifp);
}
/*
* If the default router list is empty, all addresses are regarded
* as on-link, and thus, as a neighbor.
*/
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
TAILQ_EMPTY(&V_nd_defrouter) &&
V_nd6_defifindex == ifp->if_index) {
return (1);
}
return (0);
}
/*
* Detect if a given IPv6 address identifies a neighbor on a given link.
* XXX: should take care of the destination of a p2p link?
*/
int
nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct llentry *lle;
int rc = 0;
IF_AFDATA_UNLOCK_ASSERT(ifp);
if (nd6_is_new_addr_neighbor(addr, ifp))
return (1);
/*
* Even if the address matches none of our addresses, it might be
* in the neighbor cache.
*/
IF_AFDATA_RLOCK(ifp);
if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) {
LLE_RUNLOCK(lle);
rc = 1;
}
IF_AFDATA_RUNLOCK(ifp);
return (rc);
}
/*
* Free an nd6 llinfo entry.
* Since the function would cause significant changes in the kernel, DO NOT
* make it global, unless you have a strong reason for the change, and are sure
* that the change is safe.
*
* Set noinline to be dtrace-friendly
*/
static __noinline void
nd6_free(struct llentry **lnp, int gc)
{
struct ifnet *ifp;
struct llentry *ln;
struct nd_defrouter *dr;
ln = *lnp;
*lnp = NULL;
LLE_WLOCK_ASSERT(ln);
ND6_RLOCK_ASSERT();
ifp = lltable_get_ifp(ln->lle_tbl);
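/*
* Look up a matching default router entry only when the interface
* accepts RAs; dr controls the router-specific cleanup below.
*/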
if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0)
dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp);
else
dr = NULL;
ND6_RUNLOCK();
if ((ln->la_flags & LLE_DELETED) == 0)
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
/*
* we used to have pfctlinput(PRC_HOSTDEAD) here.
* even though it is not harmful, it was not really necessary.
*/
/* cancel timer */
nd6_llinfo_settimer_locked(ln, -1);
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
if (dr != NULL && dr->expire &&
ln->ln_state == ND6_LLINFO_STALE && gc) {
/*
* If the reason for the deletion is just garbage
* collection, and the neighbor is an active default
* router, do not delete it. Instead, reset the GC
* timer using the router's lifetime.
* Simply deleting the entry would affect default
* router selection, which is not necessarily a good
* thing, especially when we're using router preference
* values.
* XXX: the check for ln_state would be redundant,
* but we intentionally keep it just in case.
*/
if (dr->expire > time_uptime)
nd6_llinfo_settimer_locked(ln,
(dr->expire - time_uptime) * hz);
else
nd6_llinfo_settimer_locked(ln,
(long)V_nd6_gctimer * hz);
LLE_REMREF(ln);
LLE_WUNLOCK(ln);
defrouter_rele(dr);
return;
}
if (dr) {
/*
* Unreachability of a router might affect the default
* router selection and on-link detection of advertised
* prefixes.
*/
/*
* Temporarily fake the state to choose a new default
* router and to perform on-link determination of
* prefixes correctly.
* Below the state will be set correctly,
* or the entry itself will be deleted.
*/
ln->ln_state = ND6_LLINFO_INCOMPLETE;
}
if (ln->ln_router || dr) {
/*
* We need to unlock to avoid a LOR with rt6_flush() with the
* rnh and for the calls to pfxlist_onlink_check() and
* defrouter_select_fib() in the block further down for calls
* into nd6_lookup(). We still hold a ref.
*/
LLE_WUNLOCK(ln);
/*
* rt6_flush must be called whether or not the neighbor
* is in the Default Router List.
* See a corresponding comment in nd6_na_input().
*/
rt6_flush(&ln->r_l3addr.addr6, ifp);
}
if (dr) {
/*
* Since defrouter_select_fib() does not affect the
* on-link determination and MIP6 needs the check
* before the default router selection, we perform
* the check now.
*/
pfxlist_onlink_check();
/*
* Refresh default router list.
*/
defrouter_select_fib(dr->ifp->if_fib);
}
/*
* If this entry was added by an on-link redirect, remove the
* corresponding host route.
*/
if (ln->la_flags & LLE_REDIRECT)
nd6_free_redirect(ln);
if (ln->ln_router || dr)
LLE_WLOCK(ln);
}
/*
* Safe to unlock. We still hold an extra reference and will not
* free(9) in llentry_free() if someone else holds one as well.
*/
LLE_WUNLOCK(ln);
IF_AFDATA_LOCK(ifp);
LLE_WLOCK(ln);
/* Guard against race with other llentry_free(). */
if (ln->la_flags & LLE_LINKED) {
/* Remove callout reference */
LLE_REMREF(ln);
lltable_unlink_entry(ln->lle_tbl, ln);
}
IF_AFDATA_UNLOCK(ifp);
llentry_free(ln);
if (dr != NULL)
defrouter_rele(dr);
}
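/*
* rtrequest filter: match only host routes that were installed
* dynamically, i.e. by a redirect.
*/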
static int
nd6_isdynrte(const struct rtentry *rt, void *xap)
{
if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC))
return (1);
return (0);
}
/*
* Remove the rtentry for the given llentry,
* both of which were installed by a redirect.
*/
static void
nd6_free_redirect(const struct llentry *ln)
{
int fibnum;
struct sockaddr_in6 sin6;
struct rt_addrinfo info;
lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6);
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6;
info.rti_filter = nd6_isdynrte;
for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
}
/*
* Rejuvenate this function for routing-operations-related
* processing.
*/
void
nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
{
struct sockaddr_in6 *gateway;
struct nd_defrouter *dr;
struct ifnet *ifp;
gateway = (struct sockaddr_in6 *)rt->rt_gateway;
ifp = rt->rt_ifp;
switch (req) {
case RTM_ADD:
break;
case RTM_DELETE:
if (!ifp)
return;
/*
* Only indirect routes are interesting.
*/
if ((rt->rt_flags & RTF_GATEWAY) == 0)
return;
/*
* check for default route
*/
if (IN6_ARE_ADDR_EQUAL(&in6addr_any,
&SIN6(rt_key(rt))->sin6_addr)) {
dr = defrouter_lookup(&gateway->sin6_addr, ifp);
if (dr != NULL) {
dr->installed = 0;
defrouter_rele(dr);
}
}
break;
}
}
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
int error = 0;
if (ifp->if_afdata[AF_INET6] == NULL)
return (EPFNOSUPPORT);
switch (cmd) {
case OSIOCGIFINFO_IN6:
#define ND ndi->ndi
/* XXX: old ndp(8) assumes a positive value for linkmtu. */
bzero(&ND, sizeof(ND));
ND.linkmtu = IN6_LINKMTU(ifp);
ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
ND.basereachable = ND_IFINFO(ifp)->basereachable;
ND.reachable = ND_IFINFO(ifp)->reachable;
ND.retrans = ND_IFINFO(ifp)->retrans;
ND.flags = ND_IFINFO(ifp)->flags;
ND.recalctm = ND_IFINFO(ifp)->recalctm;
ND.chlim = ND_IFINFO(ifp)->chlim;
break;
case SIOCGIFINFO_IN6:
ND = *ND_IFINFO(ifp);
break;
case SIOCSIFINFO_IN6:
/*
* Used to change host variables from userland.
* Intended for use on a router to reflect RA configurations.
*/
/* 0 means 'unspecified' */
if (ND.linkmtu != 0) {
if (ND.linkmtu < IPV6_MMTU ||
ND.linkmtu > IN6_LINKMTU(ifp)) {
error = EINVAL;
break;
}
ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
}
if (ND.basereachable != 0) {
int obasereachable = ND_IFINFO(ifp)->basereachable;
ND_IFINFO(ifp)->basereachable = ND.basereachable;
if (ND.basereachable != obasereachable)
ND_IFINFO(ifp)->reachable =
ND_COMPUTE_RTIME(ND.basereachable);
}
if (ND.retrans != 0)
ND_IFINFO(ifp)->retrans = ND.retrans;
if (ND.chlim != 0)
ND_IFINFO(ifp)->chlim = ND.chlim;
/* FALLTHROUGH */
case SIOCSIFINFO_FLAGS:
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
!(ND.flags & ND6_IFF_IFDISABLED)) {
/* ifdisabled 1->0 transition */
/*
* If the interface is marked as ND6_IFF_IFDISABLED and
* has a link-local address with IN6_IFF_DUPLICATED,
* do not clear ND6_IFF_IFDISABLED.
* See RFC 4862, Section 5.4.5.
*/
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
}
IF_ADDR_RUNLOCK(ifp);
if (ifa != NULL) {
/* LLA is duplicated. */
ND.flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "Cannot enable an interface"
" with a link-local address marked"
" duplicate.\n");
} else {
ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
if (ifp->if_flags & IFF_UP)
in6_if_up(ifp);
}
} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
(ND.flags & ND6_IFF_IFDISABLED)) {
/* ifdisabled 0->1 transition */
/* Mark all IPv6 addresses as tentative. */
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
if (V_ip6_dad_count > 0 &&
(ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) {
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead,
ifa_link) {
if (ifa->ifa_addr->sa_family !=
AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
ia->ia6_flags |= IN6_IFF_TENTATIVE;
}
IF_ADDR_RUNLOCK(ifp);
}
}
if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
/* auto_linklocal 0->1 transition */
/* If no link-local address on ifp, configure */
ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
in6_ifattach(ifp, NULL);
} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
ifp->if_flags & IFF_UP) {
/*
* When the interface already has
* ND6_IFF_AUTO_LINKLOCAL set, is IFF_UP, and
* no link-local address is assigned, try to
* assign one.
*/
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead,
ifa_link) {
if (ifa->ifa_addr->sa_family !=
AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
break;
}
IF_ADDR_RUNLOCK(ifp);
if (ifa != NULL)
/* No LLA is configured. */
in6_ifattach(ifp, NULL);
}
}
}
ND_IFINFO(ifp)->flags = ND.flags;
break;
#undef ND
case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */
/* sync kernel routing table with the default router list */
defrouter_reset();
defrouter_select();
break;
case SIOCSPFXFLUSH_IN6:
{
/* flush all the prefixes advertised by routers */
struct in6_ifaddr *ia, *ia_next;
struct nd_prefix *pr, *next;
struct nd_prhead prl;
LIST_INIT(&prl);
ND6_WLOCK();
LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue; /* XXX */
nd6_prefix_unlink(pr, &prl);
}
ND6_WUNLOCK();
while ((pr = LIST_FIRST(&prl)) != NULL) {
LIST_REMOVE(pr, ndpr_entry);
/* XXXRW: in6_ifaddrhead locking. */
TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
ia_next) {
if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
if (ia->ia6_ndpr == pr)
in6_purgeaddr(&ia->ia_ifa);
}
nd6_prefix_del(pr);
}
break;
}
case SIOCSRTRFLUSH_IN6:
{
/* flush all the default routers */
struct nd_drhead drq;
struct nd_defrouter *dr;
TAILQ_INIT(&drq);
defrouter_reset();
ND6_WLOCK();
while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL)
defrouter_unlink(dr, &drq);
ND6_WUNLOCK();
while ((dr = TAILQ_FIRST(&drq)) != NULL) {
TAILQ_REMOVE(&drq, dr, dr_entry);
defrouter_del(dr);
}
defrouter_select();
break;
}
case SIOCGNBRINFO_IN6:
{
struct llentry *ln;
struct in6_addr nb_addr = nbi->addr; /* make local for safety */
if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
return (error);
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&nb_addr, 0, ifp);
IF_AFDATA_RUNLOCK(ifp);
if (ln == NULL) {
error = EINVAL;
break;
}
nbi->state = ln->ln_state;
nbi->asked = ln->la_asked;
nbi->isrouter = ln->ln_router;
if (ln->la_expire == 0)
nbi->expire = 0;
else
nbi->expire = ln->la_expire + ln->lle_remtime / hz +
(time_second - time_uptime);
LLE_RUNLOCK(ln);
break;
}
case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
ndif->ifindex = V_nd6_defifindex;
break;
case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
return (nd6_setdefaultiface(ndif->ifindex));
}
return (error);
}
/*
* Calculates new isRouter value based on provided parameters and
* returns it.
*/
static int
nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr,
int ln_router)
{
/*
* ICMP6 type dependent behavior.
*
* NS: clear IsRouter if new entry
* RS: clear IsRouter
* RA: set IsRouter if there's lladdr
* redir: clear IsRouter if new entry
*
* RA case, (1):
* The spec says that we must set IsRouter in the following cases:
* - If lladdr exists, set IsRouter. This means (1-5).
* - If it is an old entry (!newentry), set IsRouter. This means (7).
* So, based on the spec, in cases (1-5) and (7) we must set IsRouter.
* A question arises for case (1): it has no lladdr in the
* neighbor cache, which makes it similar to (6).
* This case is rare, but we figured that we MUST NOT set IsRouter.
*
* is_new old_addr new_addr NS RS RA redir
* D R
* 0 n n (1) c ? s
* 0 y n (2) c s s
* 0 n y (3) c s s
* 0 y y (4) c s s
* 0 y y (5) c s s
* 1 -- n (6) c c c s
* 1 -- y (7) c c s c s
*
* (c=clear s=set)
*/
switch (type & 0xff) {
case ND_NEIGHBOR_SOLICIT:
/*
* New entry must have is_router flag cleared.
*/
if (is_new) /* (6-7) */
ln_router = 0;
break;
case ND_REDIRECT:
/*
* If the icmp is a redirect to a better router, always set the
* is_router flag. Otherwise, if the entry is newly created,
* clear the flag. [RFC 2461, sec 8.3]
*/
if (code == ND_REDIRECT_ROUTER)
ln_router = 1;
else {
if (is_new) /* (6-7) */
ln_router = 0;
}
break;
case ND_ROUTER_SOLICIT:
/*
* is_router flag must always be cleared.
*/
ln_router = 0;
break;
case ND_ROUTER_ADVERT:
/*
* Mark an entry with lladdr as a router.
*/
if ((!is_new && (old_addr || new_addr)) || /* (2-5) */
(is_new && new_addr)) { /* (7) */
ln_router = 1;
}
break;
}
return (ln_router);
}
/*
* Create neighbor cache entry and cache link-layer address,
* on reception of inbound ND6 packets. (RS/RA/NS/redirect)
*
* type - ICMP6 type
* code - type dependent information
*
*/
void
nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
int lladdrlen, int type, int code)
{
struct llentry *ln = NULL, *ln_tmp;
int is_newentry;
int do_update;
int olladdr;
int llchange;
int flags;
uint16_t router = 0;
struct sockaddr_in6 sin6;
struct mbuf *chain = NULL;
u_char linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
int lladdr_off;
IF_AFDATA_UNLOCK_ASSERT(ifp);
KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
KASSERT(from != NULL, ("%s: from == NULL", __func__));
/* nothing must be updated for unspecified address */
if (IN6_IS_ADDR_UNSPECIFIED(from))
return;
/*
* Validation about ifp->if_addrlen and lladdrlen must be done in
* the caller.
*
* XXX If the link does not have a link-layer address, what should
* we do? (ifp->if_addrlen == 0)
* The spec says nothing in the sections for RA, RS and NA. There's a
* small description of it in the NS section (RFC 2461 7.2.3).
*/
flags = lladdr ? LLE_EXCLUSIVE : 0;
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(from, flags, ifp);
IF_AFDATA_RUNLOCK(ifp);
is_newentry = 0;
if (ln == NULL) {
flags |= LLE_EXCLUSIVE;
ln = nd6_alloc(from, 0, ifp);
if (ln == NULL)
return;
/*
* Since we already know all the data for the new entry,
* fill it before insertion.
*/
if (lladdr != NULL) {
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off);
}
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(ln);
/* Prefer any existing lle over newly-created one */
ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp);
if (ln_tmp == NULL)
lltable_link_entry(LLTABLE6(ifp), ln);
IF_AFDATA_WUNLOCK(ifp);
if (ln_tmp == NULL) {
/* No existing lle, mark as new entry (6,7) */
is_newentry = 1;
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
if (lladdr != NULL) /* (7) */
EVENTHANDLER_INVOKE(lle_event, ln,
LLENTRY_RESOLVED);
} else {
lltable_free_entry(LLTABLE6(ifp), ln);
ln = ln_tmp;
ln_tmp = NULL;
}
}
/* do nothing if static ndp is set */
if ((ln->la_flags & LLE_STATIC)) {
if (flags & LLE_EXCLUSIVE)
LLE_WUNLOCK(ln);
else
LLE_RUNLOCK(ln);
return;
}
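/*
* Determine whether the entry already had a valid link-layer address
* (olladdr) and whether the newly supplied one differs from it
* (llchange).
*/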
olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
if (olladdr && lladdr) {
llchange = bcmp(lladdr, ln->ll_addr,
ifp->if_addrlen);
} else if (!olladdr && lladdr)
llchange = 1;
else
llchange = 0;
/*
* newentry olladdr lladdr llchange (*=record)
* 0 n n -- (1)
* 0 y n -- (2)
* 0 n y y (3) * STALE
* 0 y y n (4) *
* 0 y y y (5) * STALE
* 1 -- n -- (6) NOSTATE(= PASSIVE)
* 1 -- y -- (7) * STALE
*/
do_update = 0;
if (is_newentry == 0 && llchange != 0) {
do_update = 1; /* (3,5) */
/*
* Record source link-layer address
* XXX is it dependent on ifp->if_type?
*/
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off) == 0) {
/* Entry was deleted */
return;
}
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
if (ln->la_hold != NULL)
nd6_grab_holdchain(ln, &chain, &sin6);
}
/* Calculates new router status */
router = nd6_is_router(type, code, is_newentry, olladdr,
lladdr != NULL ? 1 : 0, ln->ln_router);
ln->ln_router = router;
/* Mark non-router redirects with special flag */
if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER)
ln->la_flags |= LLE_REDIRECT;
if (flags & LLE_EXCLUSIVE)
LLE_WUNLOCK(ln);
else
LLE_RUNLOCK(ln);
if (chain != NULL)
- nd6_flush_holdchain(ifp, ifp, chain, &sin6);
+ nd6_flush_holdchain(ifp, chain, &sin6);
/*
* When the link-layer address of a router changes, select the
* best router again. In particular, when the neighbor entry is newly
* created, it might affect the selection policy.
* Question: can we restrict the first condition to the "is_newentry"
* case?
* XXX: when we hear an RA from a new router with the link-layer
* address option, defrouter_select_fib() is called twice, since
* defrtrlist_update called the function as well. However, I believe
* we can compromise the overhead, since it only happens the first
* time.
* XXX: although defrouter_select_fib() should not have a bad effect
* on hosts that are not autoconfigured, we explicitly avoid such
* cases for safety.
*/
if ((do_update || is_newentry) && router &&
ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
/*
* guaranteed recursion
*/
defrouter_select_fib(ifp->if_fib);
}
}
static void
nd6_slowtimo(void *arg)
{
CURVNET_SET((struct vnet *) arg);
struct nd_ifinfo *nd6if;
struct ifnet *ifp;
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, curvnet);
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_afdata[AF_INET6] == NULL)
continue;
nd6if = ND_IFINFO(ifp);
if (nd6if->basereachable && /* already initialized */
(nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
/*
* Since reachable time rarely changes by router
* advertisements, we SHOULD ensure that a new random
* value gets recomputed at least once every few hours.
* (RFC 2461, 6.3.4)
*/
nd6if->recalctm = V_nd6_recalc_reachtm_interval;
nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
}
}
IFNET_RUNLOCK_NOSLEEP();
CURVNET_RESTORE();
}
void
nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain,
struct sockaddr_in6 *sin6)
{
LLE_WLOCK_ASSERT(ln);
*chain = ln->la_hold;
ln->la_hold = NULL;
lltable_fill_sa_entry(ln, (struct sockaddr *)sin6);
if (ln->ln_state == ND6_LLINFO_STALE) {
/*
* The first time we send a packet to a
* neighbor whose entry is STALE, we have
* to change the state to DELAY and set
* a timer to expire in DELAY_FIRST_PROBE_TIME
* seconds so that neighbor unreachability
* detection is performed on expiration.
* (RFC 2461 7.3.3)
*/
nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY);
}
}
int
nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
struct sockaddr_in6 *dst, struct route *ro)
{
int error;
int ip6len;
struct ip6_hdr *ip6;
struct m_tag *mtag;
#ifdef MAC
mac_netinet6_nd6_send(ifp, m);
#endif
/*
* If called from nd6_ns_output() (NS), nd6_na_output() (NA),
* icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
* as handled by rtsol and rtadvd), mbufs will be tagged for SeND
* to be diverted to user space. When re-injected into the kernel,
* send_output() will directly dispatch them to the outgoing interface.
*/
if (send_sendso_input_hook != NULL) {
mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
if (mtag != NULL) {
ip6 = mtod(m, struct ip6_hdr *);
ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
/* Use the SEND socket */
error = send_sendso_input_hook(m, ifp, SND_OUT,
ip6len);
/* -1 == no app on SEND socket */
if (error == 0 || error != -1)
return (error);
}
}
m_clrprotoflags(m); /* Avoid confusing lower layers. */
IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
mtod(m, struct ip6_hdr *));
if ((ifp->if_flags & IFF_LOOPBACK) == 0)
origifp = ifp;
error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro);
return (error);
}
/*
* Look up the link header for the @sa_dst address. Stores the found
* data in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* If the destination LLE does not exist or an lle state modification
* is required, the "slow" version is called.
*
* Return values:
* - 0 on success (address copied to buffer).
* - EWOULDBLOCK (no local error, but address is still unresolved)
* - other errors (alloc failure, etc)
*/
int
nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags,
struct llentry **plle)
{
struct llentry *ln = NULL;
const struct sockaddr_in6 *dst6;
if (pflags != NULL)
*pflags = 0;
dst6 = (const struct sockaddr_in6 *)sa_dst;
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return (ENETDOWN); /* better error? */
}
if (m != NULL && m->m_flags & M_MCAST) {
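/*
* Multicast destinations need no neighbor resolution; map the
* IPv6 group address directly to a link-layer multicast address
* on Ethernet-like interfaces.
*/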
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_ISO88025:
ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr,
desten);
return (0);
default:
m_freem(m);
return (EAFNOSUPPORT);
}
}
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED,
ifp);
if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) {
/* Entry found, let's copy lle info */
bcopy(ln->r_linkdata, desten, ln->r_hdrlen);
if (pflags != NULL)
*pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR);
/* Check if we have feedback request from nd6 timer */
if (ln->r_skip_req != 0) {
LLE_REQ_LOCK(ln);
ln->r_skip_req = 0; /* Notify that entry was used */
ln->lle_hittime = time_uptime;
LLE_REQ_UNLOCK(ln);
}
if (plle) {
LLE_ADDREF(ln);
*plle = ln;
LLE_WUNLOCK(ln);
}
IF_AFDATA_RUNLOCK(ifp);
return (0);
} else if (plle && ln)
LLE_WUNLOCK(ln);
IF_AFDATA_RUNLOCK(ifp);
return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle));
}
/*
* Do L2 address resolution for the @sa_dst address. Stores the found
* address in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* Heavy version.
* The function assumes that the destination LLE does not exist,
* or is invalid or stale, so the LLE_EXCLUSIVE lock needs to be acquired.
*
* Set noinline to be dtrace-friendly
*/
static __noinline int
nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m,
const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags,
struct llentry **plle)
{
struct llentry *lle = NULL, *lle_tmp;
struct in6_addr *psrc, src;
int send_ns, ll_len;
char *lladdr;
/*
* Address resolution or Neighbor Unreachability Detection
* for the next hop.
* At this point, the destination of the packet must be a unicast
* or an anycast address (i.e. not a multicast).
*/
if (lle == NULL) {
IF_AFDATA_RLOCK(ifp);
lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
IF_AFDATA_RUNLOCK(ifp);
if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) {
/*
* Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
* the condition below is not very efficient. But we believe
* it is tolerable, because this should be a rare case.
*/
lle = nd6_alloc(&dst->sin6_addr, 0, ifp);
if (lle == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"nd6_output: can't allocate llinfo for %s "
"(ln=%p)\n",
ip6_sprintf(ip6buf, &dst->sin6_addr), lle);
m_freem(m);
return (ENOBUFS);
}
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(lle);
/* Prefer any existing entry over newly-created one */
lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp);
if (lle_tmp == NULL)
lltable_link_entry(LLTABLE6(ifp), lle);
IF_AFDATA_WUNLOCK(ifp);
if (lle_tmp != NULL) {
lltable_free_entry(LLTABLE6(ifp), lle);
lle = lle_tmp;
lle_tmp = NULL;
}
}
}
if (lle == NULL) {
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
m_freem(m);
return (ENOBUFS);
}
if (m != NULL)
m_freem(m);
return (ENOBUFS);
}
LLE_WLOCK_ASSERT(lle);
/*
* The first time we send a packet to a neighbor whose entry is
* STALE, we have to change the state to DELAY and set a timer to
* expire in DELAY_FIRST_PROBE_TIME seconds so that
* neighbor unreachability detection is performed on expiration.
* (RFC 2461 7.3.3)
*/
if (lle->ln_state == ND6_LLINFO_STALE)
nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY);
/*
* If the neighbor cache entry has a state other than INCOMPLETE
* (i.e. its link-layer address is already resolved), just
* send the packet.
*/
if (lle->ln_state > ND6_LLINFO_INCOMPLETE) {
if (flags & LLE_ADDRONLY) {
lladdr = lle->ll_addr;
ll_len = ifp->if_addrlen;
} else {
lladdr = lle->r_linkdata;
ll_len = lle->r_hdrlen;
}
bcopy(lladdr, desten, ll_len);
if (pflags != NULL)
*pflags = lle->la_flags;
if (plle) {
LLE_ADDREF(lle);
*plle = lle;
}
LLE_WUNLOCK(lle);
return (0);
}
/*
* There is a neighbor cache entry, but no ethernet address
* response yet. Append this latest packet to the end of the
* hold queue. When the queue length exceeds nd6_maxqueuelen,
* the oldest packet in the queue will be removed.
*/
if (lle->la_hold != NULL) {
struct mbuf *m_hold;
int i;
i = 0;
for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){
i++;
if (m_hold->m_nextpkt == NULL) {
m_hold->m_nextpkt = m;
break;
}
}
while (i >= V_nd6_maxqueuelen) {
m_hold = lle->la_hold;
lle->la_hold = lle->la_hold->m_nextpkt;
m_freem(m_hold);
i--;
}
} else {
lle->la_hold = m;
}
/*
* If there has been no NS for the neighbor after entering the
* INCOMPLETE state, send the first solicitation.
* Note that for newly-created lle la_asked will be 0,
* so we will transition from ND6_LLINFO_NOSTATE to
* ND6_LLINFO_INCOMPLETE state here.
*/
psrc = NULL;
send_ns = 0;
if (lle->la_asked == 0) {
lle->la_asked++;
send_ns = 1;
psrc = nd6_llinfo_get_holdsrc(lle, &src);
nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE);
}
LLE_WUNLOCK(lle);
if (send_ns != 0)
nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL);
return (EWOULDBLOCK);
}
/*
* Do L2 address resolution for the @sa_dst address. Stores the found
* address in the @desten buffer. A copy of the lle ln_flags can also be
* saved in @pflags if @pflags is non-NULL.
*
* Return values:
* - 0 on success (address copied to buffer).
* - EWOULDBLOCK (no local error, but address is still unresolved)
* - other errors (alloc failure, etc)
*/
int
nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
char *desten, uint32_t *pflags)
{
int error;
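/*
* LLE_ADDRONLY requests the bare link-layer address rather than the
* precalculated link-layer header.
*/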
flags |= LLE_ADDRONLY;
error = nd6_resolve_slow(ifp, flags, NULL,
(const struct sockaddr_in6 *)dst, desten, pflags, NULL);
return (error);
}
int
-nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain,
+nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain,
struct sockaddr_in6 *dst)
{
struct mbuf *m, *m_head;
- struct ifnet *outifp;
int error = 0;
m_head = chain;
- if ((ifp->if_flags & IFF_LOOPBACK) != 0)
- outifp = origifp;
- else
- outifp = ifp;
-
+
while (m_head) {
m = m_head;
m_head = m_head->m_nextpkt;
- error = nd6_output_ifp(ifp, origifp, m, dst, NULL);
+ error = nd6_output_ifp(ifp, ifp, m, dst, NULL);
}
/*
* XXX
* note that intermediate errors are blindly ignored
*/
return (error);
-}
+}
static int
nd6_need_cache(struct ifnet *ifp)
{
/*
* XXX: we currently do not maintain a neighbor cache on any interface
* other than ARCnet, Ethernet, FDDI and GIF.
*
* RFC2893 says:
* - unidirectional tunnels need no ND
*/
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
case IFT_L2VLAN:
case IFT_INFINIBAND:
case IFT_BRIDGE:
case IFT_PROPVIRTUAL:
return (1);
default:
return (0);
}
}
/*
* Add a permanent ND6 link-layer record for the given
* interface address.
*
* Very similar to IPv4 arp_ifinit(), but:
* 1) IPv6 DAD is performed in a different place
* 2) It is called by the IPv6 protocol stack, in contrast to
* arp_ifinit(), which is typically called from the SIOCSIFADDR
* driver ioctl handler.
*
*/
int
nd6_add_ifa_lle(struct in6_ifaddr *ia)
{
struct ifnet *ifp;
struct llentry *ln, *ln_tmp;
struct sockaddr *dst;
ifp = ia->ia_ifa.ifa_ifp;
if (nd6_need_cache(ifp) == 0)
return (0);
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
dst = (struct sockaddr *)&ia->ia_addr;
ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst);
if (ln == NULL)
return (ENOBUFS);
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(ln);
/* Unlink any existing entry */
ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst);
if (ln_tmp != NULL)
lltable_unlink_entry(LLTABLE6(ifp), ln_tmp);
lltable_link_entry(LLTABLE6(ifp), ln);
IF_AFDATA_WUNLOCK(ifp);
if (ln_tmp != NULL)
EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED);
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
LLE_WUNLOCK(ln);
if (ln_tmp != NULL)
llentry_free(ln_tmp);
return (0);
}
/*
* Removes either all lle entries for the given @ia, or the lle
* corresponding to the @ia address.
*/
void
nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all)
{
struct sockaddr_in6 mask, addr;
struct sockaddr *saddr, *smask;
struct ifnet *ifp;
ifp = ia->ia_ifa.ifa_ifp;
memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
saddr = (struct sockaddr *)&addr;
smask = (struct sockaddr *)&mask;
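/*
* With @all set, purge every entry covered by the address prefix;
* otherwise delete only the LLE_IFADDR entry for the address itself.
*/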
if (all != 0)
lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC);
else
lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr);
}
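/*
* Free all mbufs held on the llentry's packet queue.
*/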
static void
clear_llinfo_pqueue(struct llentry *ln)
{
struct mbuf *m_hold, *m_hold_next;
for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_freem(m_hold);
}
ln->la_hold = NULL;
}
static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_icmp6);
SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter",
"NDP default router list");
SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, nd6_sysctl_prlist, "S,in6_prefix",
"NDP prefix list");
SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
static int
nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
{
struct in6_defrouter d;
struct nd_defrouter *dr;
int error;
if (req->newptr != NULL)
return (EPERM);
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
bzero(&d, sizeof(d));
d.rtaddr.sin6_family = AF_INET6;
d.rtaddr.sin6_len = sizeof(d.rtaddr);
ND6_RLOCK();
TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
d.rtaddr.sin6_addr = dr->rtaddr;
error = sa6_recoverscope(&d.rtaddr);
if (error != 0)
break;
d.flags = dr->raflags;
d.rtlifetime = dr->rtlifetime;
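/* Convert the uptime-based expiration time to wall-clock time for userland. */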
d.expire = dr->expire + (time_second - time_uptime);
d.if_index = dr->ifp->if_index;
error = SYSCTL_OUT(req, &d, sizeof(d));
if (error != 0)
break;
}
ND6_RUNLOCK();
return (error);
}
static int
nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
{
struct in6_prefix p;
struct sockaddr_in6 s6;
struct nd_prefix *pr;
struct nd_pfxrouter *pfr;
time_t maxexpire;
int error;
char ip6buf[INET6_ADDRSTRLEN];
if (req->newptr)
return (EPERM);
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
bzero(&p, sizeof(p));
p.origin = PR_ORIG_RA;
bzero(&s6, sizeof(s6));
s6.sin6_family = AF_INET6;
s6.sin6_len = sizeof(s6);
ND6_RLOCK();
LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
p.prefix = pr->ndpr_prefix;
if (sa6_recoverscope(&p.prefix)) {
log(LOG_ERR, "scope error in prefix list (%s)\n",
ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
/* XXX: press on... */
}
p.raflags = pr->ndpr_raf;
p.prefixlen = pr->ndpr_plen;
p.vltime = pr->ndpr_vltime;
p.pltime = pr->ndpr_pltime;
p.if_index = pr->ndpr_ifp->if_index;
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
p.expire = 0;
else {
/* XXX: we assume time_t is signed. */
maxexpire = (-1) &
~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
p.expire = pr->ndpr_lastupdate +
pr->ndpr_vltime +
(time_second - time_uptime);
else
p.expire = maxexpire;
}
p.refcnt = pr->ndpr_addrcnt;
p.flags = pr->ndpr_stateflags;
p.advrtrs = 0;
LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
p.advrtrs++;
error = SYSCTL_OUT(req, &p, sizeof(p));
if (error != 0)
break;
LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
s6.sin6_addr = pfr->router->rtaddr;
if (sa6_recoverscope(&s6))
log(LOG_ERR,
"scope error in prefix list (%s)\n",
ip6_sprintf(ip6buf, &pfr->router->rtaddr));
error = SYSCTL_OUT(req, &s6, sizeof(s6));
if (error != 0)
goto out;
}
}
out:
ND6_RUNLOCK();
return (error);
}
Index: head/sys/netinet6/nd6.h
===================================================================
--- head/sys/netinet6/nd6.h (revision 327172)
+++ head/sys/netinet6/nd6.h (revision 327173)
@@ -1,499 +1,499 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $
* $FreeBSD$
*/
#ifndef _NETINET6_ND6_H_
#define _NETINET6_ND6_H_
/* see net/route.h, or net/if_inarp.h */
#ifndef RTF_ANNOUNCE
#define RTF_ANNOUNCE RTF_PROTO2
#endif
#include <sys/queue.h>
#include <sys/callout.h>
struct llentry;
#define ND6_LLINFO_NOSTATE -2
/*
* We don't need the WAITDELETE state any more, but we keep the definition
* in a comment line instead of removing it. This is necessary to avoid
* unintentionally reusing the value for another purpose, which might
* affect backward compatibility with old applications.
* (20000711 jinmei@kame.net)
*/
/* #define ND6_LLINFO_WAITDELETE -1 */
#define ND6_LLINFO_INCOMPLETE 0
#define ND6_LLINFO_REACHABLE 1
#define ND6_LLINFO_STALE 2
#define ND6_LLINFO_DELAY 3
#define ND6_LLINFO_PROBE 4
#define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE)
#define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE))
struct nd_ifinfo {
u_int32_t linkmtu; /* LinkMTU */
u_int32_t maxmtu; /* Upper bound of LinkMTU */
u_int32_t basereachable; /* BaseReachableTime */
u_int32_t reachable; /* Reachable Time */
u_int32_t retrans; /* Retrans Timer */
u_int32_t flags; /* Flags */
int recalctm; /* BaseReachable re-calculation timer */
u_int8_t chlim; /* CurHopLimit */
u_int8_t initialized; /* Flag to see if the entry is initialized */
/* the following 3 members are for privacy extension for addrconf */
u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */
u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */
u_int8_t randomid[8]; /* current random ID */
};
#define ND6_IFF_PERFORMNUD 0x1
#define ND6_IFF_ACCEPT_RTADV 0x2
#define ND6_IFF_PREFER_SOURCE 0x4 /* Not used in FreeBSD. */
#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to
* DAD failure. (XXX: not ND-specific)
*/
#define ND6_IFF_DONT_SET_IFROUTE 0x10
#define ND6_IFF_AUTO_LINKLOCAL 0x20
#define ND6_IFF_NO_RADR 0x40
#define ND6_IFF_NO_PREFER_IFACE 0x80 /* XXX: not related to ND. */
#define ND6_IFF_NO_DAD 0x100
#ifdef _KERNEL
#define ND_IFINFO(ifp) \
(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo)
#define IN6_LINKMTU(ifp) \
((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \
? ND_IFINFO(ifp)->linkmtu \
: ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \
? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu))
#endif
struct in6_nbrinfo {
char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */
struct in6_addr addr; /* IPv6 address of the neighbor */
long asked; /* number of queries already sent for this addr */
int isrouter; /* if it acts as a router */
int state; /* reachability state */
int expire; /* lifetime for NDP state transition */
};
#define DRLSTSIZ 10
#define PRLSTSIZ 10
struct in6_drlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr rtaddr;
u_char flags;
u_short rtlifetime;
u_long expire;
u_short if_index;
} defrouter[DRLSTSIZ];
};
struct in6_defrouter {
struct sockaddr_in6 rtaddr;
u_char flags;
u_short rtlifetime;
u_long expire;
u_short if_index;
};
#ifdef _KERNEL
struct in6_oprlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_long vltime;
u_long pltime;
u_long expire;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
} prefix[PRLSTSIZ];
};
#endif
struct in6_prlist {
char ifname[IFNAMSIZ];
struct {
struct in6_addr prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_int32_t vltime;
u_int32_t pltime;
time_t expire;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */
} prefix[PRLSTSIZ];
};
struct in6_prefix {
struct sockaddr_in6 prefix;
struct prf_ra raflags;
u_char prefixlen;
u_char origin;
u_int32_t vltime;
u_int32_t pltime;
time_t expire;
u_int32_t flags;
int refcnt;
u_short if_index;
u_short advrtrs; /* number of advertisement routers */
/* struct sockaddr_in6 advrtr[] */
};
#ifdef _KERNEL
struct in6_ondireq {
char ifname[IFNAMSIZ];
struct {
u_int32_t linkmtu; /* LinkMTU */
u_int32_t maxmtu; /* Upper bound of LinkMTU */
u_int32_t basereachable; /* BaseReachableTime */
u_int32_t reachable; /* Reachable Time */
u_int32_t retrans; /* Retrans Timer */
u_int32_t flags; /* Flags */
int recalctm; /* BaseReachable re-calculation timer */
u_int8_t chlim; /* CurHopLimit */
u_int8_t receivedra;
} ndi;
};
#endif
struct in6_ndireq {
char ifname[IFNAMSIZ];
struct nd_ifinfo ndi;
};
struct in6_ndifreq {
char ifname[IFNAMSIZ];
u_long ifindex;
};
/* Prefix status */
#define NDPRF_ONLINK 0x1
#define NDPRF_DETACHED 0x2
/* protocol constants */
#define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */
#define RTR_SOLICITATION_INTERVAL 4 /* 4sec */
#define MAX_RTR_SOLICITATIONS 3
#define ND6_INFINITE_LIFETIME 0xffffffff
#ifdef _KERNEL
/* node constants */
#define MAX_REACHABLE_TIME 3600000 /* msec */
#define REACHABLE_TIME 30000 /* msec */
#define RETRANS_TIMER 1000 /* msec */
#define MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */
#define MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */
#define DEF_TEMP_VALID_LIFETIME 604800 /* 1 week */
#define DEF_TEMP_PREFERRED_LIFETIME 86400 /* 1 day */
#define TEMPADDR_REGEN_ADVANCE 5 /* sec */
#define MAX_TEMP_DESYNC_FACTOR 600 /* 10 min */
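/*
* Compute a randomized reachable time, in seconds, from a base value in
* milliseconds; the result is approximately base * [0.5, 1.5), per the
* MIN/MAX_RANDOM_FACTOR recommendation of RFC 4861.
*/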
#define ND_COMPUTE_RTIME(x) \
(((MIN_RANDOM_FACTOR * (x >> 10)) + (arc4random() & \
((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * (x >> 10)))) /1000)
TAILQ_HEAD(nd_drhead, nd_defrouter);
struct nd_defrouter {
TAILQ_ENTRY(nd_defrouter) dr_entry;
struct in6_addr rtaddr;
u_char raflags; /* flags on RA message */
u_short rtlifetime;
u_long expire;
struct ifnet *ifp;
int installed; /* is installed into kernel routing table */
u_int refcnt;
};
struct nd_prefixctl {
struct ifnet *ndpr_ifp;
/* prefix */
struct sockaddr_in6 ndpr_prefix;
u_char ndpr_plen;
u_int32_t ndpr_vltime; /* advertised valid lifetime */
u_int32_t ndpr_pltime; /* advertised preferred lifetime */
struct prf_ra ndpr_flags;
};
LIST_HEAD(nd_prhead, nd_prefix);
struct nd_prefix {
struct ifnet *ndpr_ifp;
LIST_ENTRY(nd_prefix) ndpr_entry;
struct sockaddr_in6 ndpr_prefix; /* prefix */
struct in6_addr ndpr_mask; /* netmask derived from the prefix */
u_int32_t ndpr_vltime; /* advertised valid lifetime */
u_int32_t ndpr_pltime; /* advertised preferred lifetime */
time_t ndpr_expire; /* expiration time of the prefix */
time_t ndpr_preferred; /* preferred time of the prefix */
time_t ndpr_lastupdate; /* reception time of last advertisement */
struct prf_ra ndpr_flags;
u_int32_t ndpr_stateflags; /* actual state flags */
/* list of routers that advertise the prefix: */
LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs;
u_char ndpr_plen;
int ndpr_addrcnt; /* count of derived addresses */
volatile u_int ndpr_refcnt;
};
#define ndpr_raf ndpr_flags
#define ndpr_raf_onlink ndpr_flags.onlink
#define ndpr_raf_auto ndpr_flags.autonomous
#define ndpr_raf_router ndpr_flags.router
/*
* Message format for use in obtaining information about prefixes
* from inet6 sysctl function
*/
struct inet6_ndpr_msghdr {
u_short inpm_msglen; /* to skip over non-understood messages */
u_char inpm_version; /* future binary compatibility */
u_char inpm_type; /* message type */
struct in6_addr inpm_prefix;
u_long prm_vltim;
u_long prm_pltime;
u_long prm_expire;
u_long prm_preferred;
struct in6_prflags prm_flags;
u_short prm_index; /* index for associated ifp */
u_char prm_plen; /* length of prefix in bits */
};
#define prm_raf_onlink prm_flags.prf_ra.onlink
#define prm_raf_auto prm_flags.prf_ra.autonomous
#define prm_statef_onlink prm_flags.prf_state.onlink
#define prm_rrf_decrvalid prm_flags.prf_rr.decrvalid
#define prm_rrf_decrprefd prm_flags.prf_rr.decrprefd
struct nd_pfxrouter {
LIST_ENTRY(nd_pfxrouter) pfr_entry;
struct nd_defrouter *router;
};
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_IP6NDP);
#endif
/* nd6.c */
VNET_DECLARE(int, nd6_prune);
VNET_DECLARE(int, nd6_delay);
VNET_DECLARE(int, nd6_umaxtries);
VNET_DECLARE(int, nd6_mmaxtries);
VNET_DECLARE(int, nd6_useloopback);
VNET_DECLARE(int, nd6_maxnudhint);
VNET_DECLARE(int, nd6_gctimer);
VNET_DECLARE(struct nd_drhead, nd_defrouter);
VNET_DECLARE(struct nd_prhead, nd_prefix);
VNET_DECLARE(int, nd6_debug);
VNET_DECLARE(int, nd6_onlink_ns_rfc4861);
#define V_nd6_prune VNET(nd6_prune)
#define V_nd6_delay VNET(nd6_delay)
#define V_nd6_umaxtries VNET(nd6_umaxtries)
#define V_nd6_mmaxtries VNET(nd6_mmaxtries)
#define V_nd6_useloopback VNET(nd6_useloopback)
#define V_nd6_maxnudhint VNET(nd6_maxnudhint)
#define V_nd6_gctimer VNET(nd6_gctimer)
#define V_nd_defrouter VNET(nd_defrouter)
#define V_nd_prefix VNET(nd_prefix)
#define V_nd6_debug VNET(nd6_debug)
#define V_nd6_onlink_ns_rfc4861 VNET(nd6_onlink_ns_rfc4861)
/* Lock for the prefix and default router lists. */
VNET_DECLARE(struct rwlock, nd6_lock);
VNET_DECLARE(uint64_t, nd6_list_genid);
#define V_nd6_lock VNET(nd6_lock)
#define V_nd6_list_genid VNET(nd6_list_genid)
#define ND6_RLOCK() rw_rlock(&V_nd6_lock)
#define ND6_RUNLOCK() rw_runlock(&V_nd6_lock)
#define ND6_WLOCK() rw_wlock(&V_nd6_lock)
#define ND6_WUNLOCK() rw_wunlock(&V_nd6_lock)
#define ND6_TRY_UPGRADE() rw_try_upgrade(&V_nd6_lock)
#define ND6_WLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_WLOCKED)
#define ND6_RLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_RLOCKED)
#define ND6_LOCK_ASSERT() rw_assert(&V_nd6_lock, RA_LOCKED)
#define ND6_UNLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_UNLOCKED)
/* Mutex for prefix onlink/offlink transitions. */
VNET_DECLARE(struct mtx, nd6_onlink_mtx);
#define V_nd6_onlink_mtx VNET(nd6_onlink_mtx)
#define ND6_ONLINK_LOCK() mtx_lock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_TRYLOCK() mtx_trylock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_UNLOCK() mtx_unlock(&V_nd6_onlink_mtx)
#define ND6_ONLINK_LOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_OWNED)
#define ND6_ONLINK_UNLOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_NOTOWNED)
#define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0)
/* nd6_rtr.c */
VNET_DECLARE(int, nd6_defifindex);
VNET_DECLARE(int, ip6_desync_factor); /* seconds */
VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */
VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */
VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */
#define V_nd6_defifindex VNET(nd6_defifindex)
#define V_ip6_desync_factor VNET(ip6_desync_factor)
#define V_ip6_temp_preferred_lifetime VNET(ip6_temp_preferred_lifetime)
#define V_ip6_temp_valid_lifetime VNET(ip6_temp_valid_lifetime)
#define V_ip6_temp_regen_advance VNET(ip6_temp_regen_advance)
union nd_opts {
struct nd_opt_hdr *nd_opt_array[16]; /* max = ND_OPT_NONCE */
struct {
struct nd_opt_hdr *zero;
struct nd_opt_hdr *src_lladdr;
struct nd_opt_hdr *tgt_lladdr;
struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */
struct nd_opt_rd_hdr *rh;
struct nd_opt_mtu *mtu;
struct nd_opt_hdr *__res6;
struct nd_opt_hdr *__res7;
struct nd_opt_hdr *__res8;
struct nd_opt_hdr *__res9;
struct nd_opt_hdr *__res10;
struct nd_opt_hdr *__res11;
struct nd_opt_hdr *__res12;
struct nd_opt_hdr *__res13;
struct nd_opt_nonce *nonce;
struct nd_opt_hdr *__res15;
struct nd_opt_hdr *search; /* multiple opts */
struct nd_opt_hdr *last; /* multiple opts */
int done;
struct nd_opt_prefix_info *pi_end;/* multiple opts, end */
} nd_opt_each;
};
#define nd_opts_src_lladdr nd_opt_each.src_lladdr
#define nd_opts_tgt_lladdr nd_opt_each.tgt_lladdr
#define nd_opts_pi nd_opt_each.pi_beg
#define nd_opts_pi_end nd_opt_each.pi_end
#define nd_opts_rh nd_opt_each.rh
#define nd_opts_mtu nd_opt_each.mtu
#define nd_opts_nonce nd_opt_each.nonce
#define nd_opts_search nd_opt_each.search
#define nd_opts_last nd_opt_each.last
#define nd_opts_done nd_opt_each.done
/* XXX: need nd6_var.h?? */
/* nd6.c */
void nd6_init(void);
#ifdef VIMAGE
void nd6_destroy(void);
#endif
struct nd_ifinfo *nd6_ifattach(struct ifnet *);
void nd6_ifdetach(struct ifnet *, struct nd_ifinfo *);
int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *);
void nd6_option_init(void *, int, union nd_opts *);
struct nd_opt_hdr *nd6_option(union nd_opts *);
int nd6_options(union nd_opts *);
struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *);
struct llentry *nd6_alloc(const struct in6_addr *, int, struct ifnet *);
void nd6_setmtu(struct ifnet *);
void nd6_llinfo_setstate(struct llentry *lle, int newstate);
void nd6_timer(void *);
void nd6_purge(struct ifnet *);
int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
char *desten, uint32_t *pflags);
int nd6_resolve(struct ifnet *, int, struct mbuf *,
const struct sockaddr *, u_char *, uint32_t *, struct llentry **);
int nd6_ioctl(u_long, caddr_t, struct ifnet *);
void nd6_cache_lladdr(struct ifnet *, struct in6_addr *,
char *, int, int, int);
void nd6_grab_holdchain(struct llentry *, struct mbuf **,
struct sockaddr_in6 *);
-int nd6_flush_holdchain(struct ifnet *, struct ifnet *, struct mbuf *,
+int nd6_flush_holdchain(struct ifnet *, struct mbuf *,
struct sockaddr_in6 *);
int nd6_add_ifa_lle(struct in6_ifaddr *);
void nd6_rem_ifa_lle(struct in6_ifaddr *, int);
int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *,
struct sockaddr_in6 *, struct route *);
/* nd6_nbr.c */
void nd6_na_input(struct mbuf *, int, int);
void nd6_na_output(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, u_long, int, struct sockaddr *);
void nd6_ns_input(struct mbuf *, int, int);
void nd6_ns_output(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, const struct in6_addr *, uint8_t *);
caddr_t nd6_ifptomac(struct ifnet *);
void nd6_dad_init(void);
void nd6_dad_start(struct ifaddr *, int);
void nd6_dad_stop(struct ifaddr *);
/* nd6_rtr.c */
void nd6_rs_input(struct mbuf *, int, int);
void nd6_ra_input(struct mbuf *, int, int);
void defrouter_reset(void);
void defrouter_select_fib(int fibnum);
void defrouter_select(void);
void defrouter_ref(struct nd_defrouter *);
void defrouter_rele(struct nd_defrouter *);
bool defrouter_remove(struct in6_addr *, struct ifnet *);
void defrouter_unlink(struct nd_defrouter *, struct nd_drhead *);
void defrouter_del(struct nd_defrouter *);
int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *,
struct nd_prefix **);
void nd6_prefix_unlink(struct nd_prefix *, struct nd_prhead *);
void nd6_prefix_del(struct nd_prefix *);
void nd6_prefix_ref(struct nd_prefix *);
void nd6_prefix_rele(struct nd_prefix *);
int nd6_prefix_onlink(struct nd_prefix *);
int nd6_prefix_offlink(struct nd_prefix *);
void pfxlist_onlink_check(void);
struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *);
struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *);
struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *);
void rt6_flush(struct in6_addr *, struct ifnet *);
int nd6_setdefaultiface(int);
int in6_tmpifadd(const struct in6_ifaddr *, int, int);
#endif /* _KERNEL */
#endif /* _NETINET6_ND6_H_ */
Index: head/sys/netinet6/nd6_nbr.c
===================================================================
--- head/sys/netinet6/nd6_nbr.c (revision 327172)
+++ head/sys/netinet6/nd6_nbr.c (revision 327173)
@@ -1,1553 +1,1547 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/refcount.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/if_var.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <net/if_llatbl.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_carp.h>
#include <netinet6/send.h>
#define SDL(s) ((struct sockaddr_dl *)s)
struct dadq;
static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *);
static void nd6_dad_add(struct dadq *dp);
static void nd6_dad_del(struct dadq *dp);
static void nd6_dad_rele(struct dadq *);
static void nd6_dad_starttimer(struct dadq *, int, int);
static void nd6_dad_stoptimer(struct dadq *);
static void nd6_dad_timer(struct dadq *);
static void nd6_dad_duplicated(struct ifaddr *, struct dadq *);
static void nd6_dad_ns_output(struct dadq *);
static void nd6_dad_ns_input(struct ifaddr *, struct nd_opt_nonce *);
static void nd6_dad_na_input(struct ifaddr *);
static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, u_long, int, struct sockaddr *, u_int);
static void nd6_ns_output_fib(struct ifnet *, const struct in6_addr *,
const struct in6_addr *, const struct in6_addr *, uint8_t *, u_int);
static VNET_DEFINE(int, dad_enhanced) = 1;
#define V_dad_enhanced VNET(dad_enhanced)
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(dad_enhanced), 0,
"Enable Enhanced DAD, which adds a random nonce to NS messages for DAD.");
static VNET_DEFINE(int, dad_maxtry) = 15; /* max # of *tries* to
transmit DAD packet */
#define V_dad_maxtry VNET(dad_maxtry)
/*
* Input a Neighbor Solicitation Message.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*/
void
nd6_ns_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_solicit *nd_ns;
struct in6_addr saddr6 = ip6->ip6_src;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
struct in6_addr myaddr6;
char *lladdr = NULL;
struct ifaddr *ifa = NULL;
int lladdrlen = 0;
int anycast = 0, proxy = 0, tentative = 0;
int tlladdr;
int rflag;
union nd_opts ndopts;
struct sockaddr_dl proxydl;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* RFC 6980: Nodes MUST silently ignore fragments */
if(m->m_flags & M_FRAGMENTED)
goto freeit;
rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0;
if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif)
rflag = 0;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len);
if (nd_ns == NULL) {
ICMP6STAT_INC(icp6s_tooshort);
return;
}
#endif
ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */
taddr6 = nd_ns->nd_ns_target;
if (in6_setscope(&taddr6, ifp, NULL) != 0)
goto bad;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
/* dst has to be a solicited node multicast address. */
if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL &&
/* don't check ifindex portion */
daddr6.s6_addr32[1] == 0 &&
daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE &&
daddr6.s6_addr8[12] == 0xff) {
; /* good */
} else {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(wrong ip6 dst)\n"));
goto bad;
}
} else if (!V_nd6_onlink_ns_rfc4861) {
struct sockaddr_in6 src_sa6;
/*
* According to recent IETF discussions, it is not a good idea
* to accept an NS from an address that would not be deemed
* to be a neighbor otherwise. This point is expected to be
* clarified in future revisions of the specification.
*/
bzero(&src_sa6, sizeof(src_sa6));
src_sa6.sin6_family = AF_INET6;
src_sa6.sin6_len = sizeof(src_sa6);
src_sa6.sin6_addr = saddr6;
if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) {
nd6log((LOG_INFO, "nd6_ns_input: "
"NS packet from non-neighbor\n"));
goto bad;
}
}
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
goto bad;
}
icmp6len -= sizeof(*nd_ns);
nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_ns_input: invalid ND option, ignored\n"));
/* nd6_options() has incremented the stats */
goto freeit;
}
if (ndopts.nd_opts_src_lladdr) {
lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
}
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(link-layer address option)\n"));
goto bad;
}
/*
* Attaching target link-layer address to the NA?
* (RFC 2461 7.2.4)
*
* NS IP dst is unicast/anycast MUST NOT add
* NS IP dst is solicited-node multicast MUST add
*
* In this implementation, we add the target link-layer address by default
* and omit it in the MUST NOT cases.
*/
if (!IN6_IS_ADDR_MULTICAST(&daddr6))
tlladdr = 0;
else
tlladdr = 1;
/*
* Target address (taddr6) must be either:
* (1) Valid unicast/anycast address for my receiving interface,
* (2) Unicast address for which I'm offering proxy service, or
* (3) "tentative" address on which DAD is being performed.
*/
/* (1) and (3) check. */
if (ifp->if_carp)
ifa = (*carp_iamatch6_p)(ifp, &taddr6);
else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/* (2) check. */
if (ifa == NULL) {
struct sockaddr_dl rt_gateway;
struct rt_addrinfo info;
struct sockaddr_in6 dst6;
bzero(&dst6, sizeof(dst6));
dst6.sin6_len = sizeof(struct sockaddr_in6);
dst6.sin6_family = AF_INET6;
dst6.sin6_addr = taddr6;
bzero(&rt_gateway, sizeof(rt_gateway));
rt_gateway.sdl_len = sizeof(rt_gateway);
bzero(&info, sizeof(info));
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway;
if (rib_lookup_info(ifp->if_fib, (struct sockaddr *)&dst6,
0, 0, &info) == 0) {
if ((info.rti_flags & RTF_ANNOUNCE) != 0 &&
rt_gateway.sdl_family == AF_LINK) {
/*
* proxy NDP for single entry
*/
proxydl = *SDL(&rt_gateway);
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(
ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if (ifa)
proxy = 1;
}
}
}
if (ifa == NULL) {
/*
* We've got an NS packet, and we don't have that address
* assigned to us. We MUST silently ignore it.
* See RFC2461 7.2.3.
*/
goto freeit;
}
myaddr6 = *IFA_IN6(ifa);
anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST;
tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED)
goto freeit;
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s "
"(if %d, NS packet %d)\n",
ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &saddr6)));
goto freeit;
}
/*
* We have a neighbor solicitation packet whose target address equals
* one of my tentative addresses.
*
* src addr how to process?
* --- ---
* multicast of course, invalid (rejected in ip6_input)
* unicast somebody is doing address resolution -> ignore
* unspec dup address detection
*
* The processing is defined in RFC 2462.
*/
if (tentative) {
/*
* If source address is unspecified address, it is for
* duplicate address detection.
*
* If not, the packet is for address resolution;
* silently ignore it.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
nd6_dad_ns_input(ifa, ndopts.nd_opts_nonce);
goto freeit;
}
/*
* If the source address is the unspecified address, entries must not
* be created or updated.
* It looks like the sender is performing DAD. Output an NA toward the
* all-nodes multicast address to tell the sender that I'm using the
* address.
* The S bit ("solicited") must be zero.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
struct in6_addr in6_all;
in6_all = in6addr_linklocal_allnodes;
if (in6_setscope(&in6_all, ifp, NULL) != 0)
goto bad;
nd6_na_output_fib(ifp, &in6_all, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL,
M_GETFIB(m));
goto freeit;
}
nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
ND_NEIGHBOR_SOLICIT, 0);
nd6_na_output_fib(ifp, &saddr6, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
rflag | ND_NA_FLAG_SOLICITED, tlladdr,
proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m));
freeit:
if (ifa != NULL)
ifa_free(ifa);
m_freem(m);
return;
bad:
nd6log((LOG_ERR, "nd6_ns_input: src=%s\n",
ip6_sprintf(ip6bufs, &saddr6)));
nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n",
ip6_sprintf(ip6bufs, &daddr6)));
nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n",
ip6_sprintf(ip6bufs, &taddr6)));
ICMP6STAT_INC(icp6s_badns);
if (ifa != NULL)
ifa_free(ifa);
m_freem(m);
}
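The solicited-node multicast destination that nd6_ns_input() verifies for DAD packets (and that the output path below constructs) follows the ff02::1:ffXX:XXXX pattern. A minimal standalone userland sketch of that construction, not part of the kernel sources and using only the portable s6_addr accessor:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/*
 * Build the solicited-node multicast address (ff02::1:ffXX:XXXX) for a
 * target address, keeping only the low 24 bits of the target.
 */
static struct in6_addr
solicited_node(const struct in6_addr *target)
{
	struct in6_addr dst;

	memset(&dst, 0, sizeof(dst));
	dst.s6_addr[0] = 0xff;			/* ff02:: link-local scope */
	dst.s6_addr[1] = 0x02;
	dst.s6_addr[11] = 0x01;			/* ::1:ff00:0/104 prefix */
	dst.s6_addr[12] = 0xff;
	dst.s6_addr[13] = target->s6_addr[13];	/* low 24 bits of target */
	dst.s6_addr[14] = target->s6_addr[14];
	dst.s6_addr[15] = target->s6_addr[15];
	return (dst);
}

int
main(void)
{
	struct in6_addr tgt, dst;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "fe80::1234:56ff:fe78:9abc", &tgt);
	dst = solicited_node(&tgt);
	printf("%s\n", inet_ntop(AF_INET6, &dst, buf, sizeof(buf)));
	return (0);
}

The example prints ff02::1:ff78:9abc, which is exactly the destination a DAD probe for that target would use.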
/*
* Output a Neighbor Solicitation Message. Caller specifies:
* - ICMP6 header source IP6 address
* - ND6 header target IP6 address
* - ND6 header source datalink address
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* saddr6 - if non-NULL, source address of the prompting packet (used
* for source address selection)
* nonce - If non-NULL, NS is used for duplicate address detection and
* the value (length is ND_OPT_NONCE_LEN) is used as a random nonce.
*/
static void
nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
const struct in6_addr *daddr6, const struct in6_addr *taddr6,
uint8_t *nonce, u_int fibnum)
{
struct mbuf *m;
struct m_tag *mtag;
struct ip6_hdr *ip6;
struct nd_neighbor_solicit *nd_ns;
struct ip6_moptions im6o;
int icmp6len;
int maxlen;
caddr_t mac;
if (IN6_IS_ADDR_MULTICAST(taddr6))
return;
/* estimate the size of message */
maxlen = sizeof(*ip6) + sizeof(*nd_ns);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
"%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
__func__, max_linkhdr, maxlen, MCLBYTES));
if (max_linkhdr + maxlen > MHLEN)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return;
M_SETFIB(m, fibnum);
if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_ns);
m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
/* fill neighbor solicitation packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6->ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (daddr6)
ip6->ip6_dst = *daddr6;
else {
ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
ip6->ip6_dst.s6_addr16[1] = 0;
ip6->ip6_dst.s6_addr32[1] = 0;
ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE;
ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3];
ip6->ip6_dst.s6_addr8[12] = 0xff;
if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
goto bad;
}
if (nonce == NULL) {
struct ifaddr *ifa = NULL;
/*
* RFC2461 7.2.2:
* "If the source address of the packet prompting the
* solicitation is the same as one of the addresses assigned
* to the outgoing interface, that address SHOULD be placed
* in the IP Source Address of the outgoing solicitation.
* Otherwise, any one of the addresses assigned to the
* interface should be used."
*
* We use the source address for the prompting packet
* (saddr6), if saddr6 belongs to the outgoing interface.
* Otherwise, we perform the source address selection as usual.
*/
if (saddr6 != NULL)
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6);
if (ifa != NULL) {
/* ip6_src set already. */
ip6->ip6_src = *saddr6;
ifa_free(ifa);
} else {
int error;
struct in6_addr dst6, src6;
uint32_t scopeid;
in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid);
error = in6_selectsrc_addr(fibnum, &dst6,
scopeid, ifp, &src6, NULL);
if (error) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "%s: source can't be "
"determined: dst=%s, error=%d\n", __func__,
ip6_sprintf(ip6buf, &dst6),
error));
goto bad;
}
ip6->ip6_src = src6;
}
} else {
/*
* The source address of a DAD packet must always be the IPv6
* unspecified address (::).
* We actually don't have to 0-clear the address (we did it
* above), but we do so here explicitly to make the intention
* clearer.
*/
bzero(&ip6->ip6_src, sizeof(ip6->ip6_src));
}
nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1);
nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
nd_ns->nd_ns_code = 0;
nd_ns->nd_ns_reserved = 0;
nd_ns->nd_ns_target = *taddr6;
in6_clearscope(&nd_ns->nd_ns_target); /* XXX */
/*
* Add source link-layer address option.
*
* spec implementation
* --- ---
* DAD packet MUST NOT do not add the option
* there's no link layer address:
* impossible do not add the option
* there's link layer address:
* Multicast NS MUST add one add the option
* Unicast NS SHOULD add one add the option
*/
if (nonce == NULL && (mac = nd6_ifptomac(ifp))) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
/* Round up to 8-byte alignment. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
}
/*
* Add a Nonce option (RFC 3971) to detect looped back NS messages.
* This behavior is documented as Enhanced Duplicate Address
* Detection in RFC 7527.
* net.inet6.ip6.dad_enhanced=0 disables this.
*/
if (V_dad_enhanced != 0 && nonce != NULL) {
int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
/* 8-byte alignment is required. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_NONCE;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN);
}
ip6->ip6_plen = htons((u_short)icmp6len);
nd_ns->nd_ns_cksum = 0;
nd_ns->nd_ns_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len);
if (send_sendso_input_hook != NULL) {
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
*(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type;
m_tag_prepend(m, mtag);
}
ip6_output(m, NULL, NULL, (nonce != NULL) ? IPV6_UNSPECSRC : 0,
&im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);
ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]);
return;
bad:
m_freem(m);
}
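For reference, the (optlen + 7) & ~7 rounding used above expresses every ND option in units of 8 octets, which is the value stored in nd_opt_len. A minimal userland sketch of that arithmetic; the address sizes are illustrative assumptions, not tied to any particular interface type:

#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <stdio.h>

int
main(void)
{
	int addrlens[] = { 6, 8, 16 };	/* illustrative link-layer sizes */
	size_t i;

	for (i = 0; i < sizeof(addrlens) / sizeof(addrlens[0]); i++) {
		int optlen = sizeof(struct nd_opt_hdr) + addrlens[i];

		optlen = (optlen + 7) & ~7;	/* round up to 8 bytes */
		printf("if_addrlen %d -> option %d bytes, nd_opt_len %d\n",
		    addrlens[i], optlen, optlen >> 3);
	}
	return (0);
}

For a 6-byte address the option fits exactly in 8 bytes (nd_opt_len = 1); larger addresses are padded up to the next 8-byte boundary.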
#ifndef BURN_BRIDGES
void
nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6,
const struct in6_addr *daddr6, const struct in6_addr *taddr6,uint8_t *nonce)
{
nd6_ns_output_fib(ifp, saddr6, daddr6, taddr6, nonce, RT_DEFAULT_FIB);
}
#endif
/*
* Neighbor advertisement input handling.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* the following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*/
void
nd6_na_input(struct mbuf *m, int off, int icmp6len)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_advert *nd_na;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
int flags;
int is_router;
int is_solicited;
int is_override;
char *lladdr = NULL;
int lladdrlen = 0;
int checklink = 0;
struct ifaddr *ifa;
struct llentry *ln = NULL;
union nd_opts ndopts;
struct mbuf *chain = NULL;
struct sockaddr_in6 sin6;
u_char linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
int lladdr_off;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* RFC 6980: Nodes MUST silently ignore fragments */
if(m->m_flags & M_FRAGMENTED)
goto freeit;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len);
if (nd_na == NULL) {
ICMP6STAT_INC(icp6s_tooshort);
return;
}
#endif
flags = nd_na->nd_na_flags_reserved;
is_router = ((flags & ND_NA_FLAG_ROUTER) != 0);
is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0);
is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0);
memset(&sin6, 0, sizeof(sin6));
taddr6 = nd_na->nd_na_target;
if (in6_setscope(&taddr6, ifp, NULL))
goto bad; /* XXX: impossible */
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_ERR,
"nd6_na_input: invalid target address %s\n",
ip6_sprintf(ip6bufs, &taddr6)));
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&daddr6))
if (is_solicited) {
nd6log((LOG_ERR,
"nd6_na_input: a solicited adv is multicasted\n"));
goto bad;
}
icmp6len -= sizeof(*nd_na);
nd6_option_init(nd_na + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_na_input: invalid ND option, ignored\n"));
/* nd6_options() has incremented the stats */
goto freeit;
}
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
}
/*
* This effectively disables the DAD check on a non-master CARP
* address.
*/
if (ifp->if_carp)
ifa = (*carp_iamatch6_p)(ifp, &taddr6);
else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/*
* The target address matches one of my interface addresses.
*
* If my address is tentative, this means that there's somebody
* already using the same address as mine. This indicates DAD failure.
* This is defined in RFC 2462.
*
* Otherwise, process as defined in RFC 2461.
*/
if (ifa
&& (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) {
nd6_dad_na_input(ifa);
ifa_free(ifa);
goto freeit;
}
/* Just for safety, maybe unnecessary. */
if (ifa) {
ifa_free(ifa);
log(LOG_ERR,
"nd6_na_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &taddr6));
goto freeit;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s "
"(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
/*
* If no neighbor cache entry is found, NA SHOULD silently be
* discarded.
*/
IF_AFDATA_RLOCK(ifp);
ln = nd6_lookup(&taddr6, LLE_EXCLUSIVE, ifp);
IF_AFDATA_RUNLOCK(ifp);
if (ln == NULL) {
goto freeit;
}
if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
/*
* If the link layer has an address and no lladdr option came,
* discard the packet.
*/
if (ifp->if_addrlen && lladdr == NULL) {
goto freeit;
}
/*
* Record link-layer address, and update the state.
*/
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
lladdr_off) == 0) {
ln = NULL;
goto freeit;
}
EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
if (is_solicited)
nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
else
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
if ((ln->ln_router = is_router) != 0) {
/*
* This means a router's state has changed from
* non-reachable to probably reachable, and might
* affect the status of associated prefixes.
*/
checklink = 1;
}
} else {
int llchange;
/*
* Check if the link-layer address has changed or not.
*/
if (lladdr == NULL)
llchange = 0;
else {
if (ln->la_flags & LLE_VALID) {
if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen))
llchange = 1;
else
llchange = 0;
} else
llchange = 1;
}
/*
* This is VERY complex. Look at it with care.
*
* override solicit lladdr llchange action
* (L: record lladdr)
*
* 0 0 n -- (2c)
* 0 0 y n (2b) L
* 0 0 y y (1) REACHABLE->STALE
* 0 1 n -- (2c) *->REACHABLE
* 0 1 y n (2b) L *->REACHABLE
* 0 1 y y (1) REACHABLE->STALE
* 1 0 n -- (2a)
* 1 0 y n (2a) L
* 1 0 y y (2a) L *->STALE
* 1 1 n -- (2a) *->REACHABLE
* 1 1 y n (2a) L *->REACHABLE
* 1 1 y y (2a) L *->REACHABLE
*/
if (!is_override && (lladdr != NULL && llchange)) { /* (1) */
/*
* If state is REACHABLE, make it STALE.
* no other updates should be done.
*/
if (ln->ln_state == ND6_LLINFO_REACHABLE)
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
goto freeit;
} else if (is_override /* (2a) */
|| (!is_override && (lladdr != NULL && !llchange)) /* (2b) */
|| lladdr == NULL) { /* (2c) */
/*
* Update the link-layer address, if any.
*/
if (lladdr != NULL) {
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
linkhdr, &linkhdrsize, &lladdr_off) != 0)
goto freeit;
if (lltable_try_set_entry_addr(ifp, ln, linkhdr,
linkhdrsize, lladdr_off) == 0) {
ln = NULL;
goto freeit;
}
EVENTHANDLER_INVOKE(lle_event, ln,
LLENTRY_RESOLVED);
}
/*
* If solicited, make the state REACHABLE.
* If not solicited and the link-layer address was
* changed, make it STALE.
*/
if (is_solicited)
nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
else {
if (lladdr != NULL && llchange)
nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
}
}
if (ln->ln_router && !is_router) {
/*
* The peer dropped the router flag.
* Remove the sender from the Default Router List and
* update the Destination Cache entries.
*/
struct ifnet *nd6_ifp;
nd6_ifp = lltable_get_ifp(ln->lle_tbl);
if (!defrouter_remove(&ln->r_l3addr.addr6, nd6_ifp) &&
(ND_IFINFO(nd6_ifp)->flags &
ND6_IFF_ACCEPT_RTADV) != 0)
/*
* Even if the neighbor is not in the default
* router list, the neighbor may be used as a
* next hop for some destinations (e.g. redirect
* case). So we must call rt6_flush explicitly.
*/
rt6_flush(&ip6->ip6_src, ifp);
}
ln->ln_router = is_router;
}
/* XXX - QL
* Does this matter?
* rt->rt_flags &= ~RTF_REJECT;
*/
ln->la_asked = 0;
if (ln->la_hold != NULL)
nd6_grab_holdchain(ln, &chain, &sin6);
freeit:
if (ln != NULL)
LLE_WUNLOCK(ln);
if (chain != NULL)
- nd6_flush_holdchain(ifp, ifp, chain, &sin6);
+ nd6_flush_holdchain(ifp, chain, &sin6);
if (checklink)
pfxlist_onlink_check();
m_freem(m);
return;
bad:
if (ln != NULL)
LLE_WUNLOCK(ln);
ICMP6STAT_INC(icp6s_badna);
m_freem(m);
}
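The (override, solicited, lladdr, llchange) table in nd6_na_input() above can be condensed into a pure function. The following standalone sketch mirrors that logic for illustration only; the enum names are made up here and are not the kernel's ND6_LLINFO_* constants:

#include <stdbool.h>
#include <stdio.h>

/* Simplified states; names are illustrative, not the kernel's. */
enum na_state { NA_INCOMPLETE, NA_REACHABLE, NA_STALE, NA_UNCHANGED };

/*
 * Return the state an existing entry should move to and whether the
 * advertised link-layer address should be recorded.
 */
static enum na_state
na_next_state(enum na_state cur, bool override, bool solicited,
    bool have_lladdr, bool llchange, bool *record)
{
	*record = false;
	if (!override && have_lladdr && llchange)	/* case (1) */
		return (cur == NA_REACHABLE ? NA_STALE : NA_UNCHANGED);
	if (have_lladdr)				/* cases (2a)/(2b) */
		*record = true;
	if (solicited)
		return (NA_REACHABLE);
	if (have_lladdr && llchange)
		return (NA_STALE);
	return (NA_UNCHANGED);
}

int
main(void)
{
	bool rec;
	enum na_state s;

	/* Unsolicited override NA with a new lladdr: record it, go STALE. */
	s = na_next_state(NA_REACHABLE, true, false, true, true, &rec);
	printf("record=%d state=%d\n", rec, s);
	return (0);
}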
/*
* Neighbor advertisement output handling.
*
* Based on RFC 2461
*
* the following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*
* tlladdr - 1 if the target link-layer address should be included
* sdl0 - sockaddr_dl (= proxy NA) or NULL
*/
static void
nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
const struct in6_addr *taddr6, u_long flags, int tlladdr,
struct sockaddr *sdl0, u_int fibnum)
{
struct mbuf *m;
struct m_tag *mtag;
struct ip6_hdr *ip6;
struct nd_neighbor_advert *nd_na;
struct ip6_moptions im6o;
struct in6_addr daddr6, dst6, src6;
uint32_t scopeid;
int icmp6len, maxlen, error;
caddr_t mac = NULL;
daddr6 = *daddr6_0; /* make a local copy for modification */
/* estimate the size of message */
maxlen = sizeof(*ip6) + sizeof(*nd_na);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
"%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
__func__, max_linkhdr, maxlen, MCLBYTES));
if (max_linkhdr + maxlen > MHLEN)
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
else
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return;
M_SETFIB(m, fibnum);
if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_na);
m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */
/* fill neighbor advertisement packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
/* reply to DAD */
daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
daddr6.s6_addr16[1] = 0;
daddr6.s6_addr32[1] = 0;
daddr6.s6_addr32[2] = 0;
daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
if (in6_setscope(&daddr6, ifp, NULL))
goto bad;
flags &= ~ND_NA_FLAG_SOLICITED;
}
ip6->ip6_dst = daddr6;
/*
* Select a source whose scope is the same as that of the dest.
*/
in6_splitscope(&daddr6, &dst6, &scopeid);
error = in6_selectsrc_addr(fibnum, &dst6,
scopeid, ifp, &src6, NULL);
if (error) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
"determined: dst=%s, error=%d\n",
ip6_sprintf(ip6buf, &daddr6), error));
goto bad;
}
ip6->ip6_src = src6;
nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
nd_na->nd_na_code = 0;
nd_na->nd_na_target = *taddr6;
in6_clearscope(&nd_na->nd_na_target); /* XXX */
/*
* "tlladdr" indicates NS's condition for adding tlladdr or not.
* see nd6_ns_input() for details.
* Basically, if NS packet is sent to unicast/anycast addr,
* target lladdr option SHOULD NOT be included.
*/
if (tlladdr) {
/*
* sdl0 != NULL indicates proxy NA. If we do proxy, use
* lladdr in sdl0. If we are not proxying (sending NA for
* my address) use lladdr configured for the interface.
*/
if (sdl0 == NULL) {
if (ifp->if_carp)
mac = (*carp_macmatch6_p)(ifp, m, taddr6);
if (mac == NULL)
mac = nd6_ifptomac(ifp);
} else if (sdl0->sa_family == AF_LINK) {
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)sdl0;
if (sdl->sdl_alen == ifp->if_addrlen)
mac = LLADDR(sdl);
}
}
if (tlladdr && mac) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
/* Round up to 8-byte alignment. */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
} else
flags &= ~ND_NA_FLAG_OVERRIDE;
ip6->ip6_plen = htons((u_short)icmp6len);
nd_na->nd_na_flags_reserved = flags;
nd_na->nd_na_cksum = 0;
nd_na->nd_na_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len);
if (send_sendso_input_hook != NULL) {
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
*(unsigned short *)(mtag + 1) = nd_na->nd_na_type;
m_tag_prepend(m, mtag);
}
ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert);
ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]);
return;
bad:
m_freem(m);
}
#ifndef BURN_BRIDGES
void
nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0,
const struct in6_addr *taddr6, u_long flags, int tlladdr,
struct sockaddr *sdl0)
{
nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0,
RT_DEFAULT_FIB);
}
#endif
caddr_t
nd6_ifptomac(struct ifnet *ifp)
{
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
case IFT_L2VLAN:
case IFT_INFINIBAND:
case IFT_BRIDGE:
case IFT_ISO88025:
return IF_LLADDR(ifp);
default:
return NULL;
}
}
struct dadq {
TAILQ_ENTRY(dadq) dad_list;
struct ifaddr *dad_ifa;
int dad_count; /* max NS to send */
int dad_ns_tcount; /* # of trials to send NS */
int dad_ns_ocount; /* NS sent so far */
int dad_ns_icount;
int dad_na_icount;
int dad_ns_lcount; /* looped back NS */
int dad_loopbackprobe; /* probing state for loopback detection */
struct callout dad_timer_ch;
struct vnet *dad_vnet;
u_int dad_refcnt;
#define ND_OPT_NONCE_LEN32 \
((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t))
uint32_t dad_nonce[ND_OPT_NONCE_LEN32];
};
static VNET_DEFINE(TAILQ_HEAD(, dadq), dadq);
static VNET_DEFINE(struct rwlock, dad_rwlock);
#define V_dadq VNET(dadq)
#define V_dad_rwlock VNET(dad_rwlock)
#define DADQ_RLOCK() rw_rlock(&V_dad_rwlock)
#define DADQ_RUNLOCK() rw_runlock(&V_dad_rwlock)
#define DADQ_WLOCK() rw_wlock(&V_dad_rwlock)
#define DADQ_WUNLOCK() rw_wunlock(&V_dad_rwlock)
static void
nd6_dad_add(struct dadq *dp)
{
DADQ_WLOCK();
TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list);
DADQ_WUNLOCK();
}
static void
nd6_dad_del(struct dadq *dp)
{
DADQ_WLOCK();
TAILQ_REMOVE(&V_dadq, dp, dad_list);
DADQ_WUNLOCK();
nd6_dad_rele(dp);
}
static struct dadq *
nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *n)
{
struct dadq *dp;
DADQ_RLOCK();
TAILQ_FOREACH(dp, &V_dadq, dad_list) {
if (dp->dad_ifa != ifa)
continue;
/*
* Skip if the nonce matches the received one.
* The +2 in the length is required because the type and
* length fields are included in the option header.
*/
if (n != NULL &&
n->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 &&
memcmp(&n->nd_opt_nonce[0], &dp->dad_nonce[0],
ND_OPT_NONCE_LEN) == 0) {
dp->dad_ns_lcount++;
continue;
}
refcount_acquire(&dp->dad_refcnt);
break;
}
DADQ_RUNLOCK();
return (dp);
}
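The nonce comparison above works because the option's length field is expressed in 8-octet units. A tiny sketch of that arithmetic; the 6-byte nonce payload is mirrored as a local define purely for illustration (the kernel's value lives in <netinet/icmp6.h>):

#include <stdio.h>

#define OPT_NONCE_LEN	6	/* illustrative nonce payload size */

int
main(void)
{
	/* Type and length octets precede the nonce, hence the "+2". */
	int optbytes = OPT_NONCE_LEN + 2;

	printf("nonce option: %d bytes -> length field = %d (8-octet units)\n",
	    optbytes, optbytes / 8);
	return (0);
}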
static void
nd6_dad_starttimer(struct dadq *dp, int ticks, int send_ns)
{
if (send_ns != 0)
nd6_dad_ns_output(dp);
callout_reset(&dp->dad_timer_ch, ticks,
(void (*)(void *))nd6_dad_timer, (void *)dp);
}
static void
nd6_dad_stoptimer(struct dadq *dp)
{
callout_drain(&dp->dad_timer_ch);
}
static void
nd6_dad_rele(struct dadq *dp)
{
if (refcount_release(&dp->dad_refcnt)) {
ifa_free(dp->dad_ifa);
free(dp, M_IP6NDP);
}
}
void
nd6_dad_init(void)
{
rw_init(&V_dad_rwlock, "nd6 DAD queue");
TAILQ_INIT(&V_dadq);
}
/*
* Start Duplicate Address Detection (DAD) for the specified interface address.
*/
void
nd6_dad_start(struct ifaddr *ifa, int delay)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct dadq *dp;
char ip6buf[INET6_ADDRSTRLEN];
KASSERT((ia->ia6_flags & IN6_IFF_TENTATIVE) != 0,
("starting DAD on non-tentative address %p", ifa));
/*
* If we don't need DAD, don't do it.
* There are several cases:
* - DAD is disabled globally or on the interface
* - the interface address is anycast
*/
if ((ia->ia6_flags & IN6_IFF_ANYCAST) != 0 ||
V_ip6_dad_count == 0 ||
(ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) != 0) {
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
return;
}
if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 ||
(ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
(ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0)
return;
if ((dp = nd6_dad_find(ifa, NULL)) != NULL) {
/*
* DAD is already in progress. Let the existing entry
* finish it.
*/
nd6_dad_rele(dp);
return;
}
dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT | M_ZERO);
if (dp == NULL) {
log(LOG_ERR, "nd6_dad_start: memory allocation failed for "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
return;
}
callout_init(&dp->dad_timer_ch, 0);
#ifdef VIMAGE
dp->dad_vnet = curvnet;
#endif
nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
/*
* Send NS packet for DAD, ip6_dad_count times.
* Note that we must delay the first transmission, if this is the
* first packet to be sent from the interface after interface
* (re)initialization.
*/
dp->dad_ifa = ifa;
ifa_ref(dp->dad_ifa);
dp->dad_count = V_ip6_dad_count;
dp->dad_ns_icount = dp->dad_na_icount = 0;
dp->dad_ns_ocount = dp->dad_ns_tcount = 0;
dp->dad_ns_lcount = dp->dad_loopbackprobe = 0;
refcount_init(&dp->dad_refcnt, 1);
nd6_dad_add(dp);
nd6_dad_starttimer(dp, delay, 0);
}
/*
* Terminate DAD unconditionally. Used for address removals.
*/
void
nd6_dad_stop(struct ifaddr *ifa)
{
struct dadq *dp;
dp = nd6_dad_find(ifa, NULL);
if (!dp) {
/* DAD wasn't started yet */
return;
}
nd6_dad_stoptimer(dp);
/*
* The DAD queue entry may have been removed by nd6_dad_timer() while
* we were waiting for it to stop, so re-do the lookup.
*/
nd6_dad_rele(dp);
dp = nd6_dad_find(ifa, NULL);
if (dp == NULL)
return;
nd6_dad_del(dp);
nd6_dad_rele(dp);
}
static void
nd6_dad_timer(struct dadq *dp)
{
CURVNET_SET(dp->dad_vnet);
struct ifaddr *ifa = dp->dad_ifa;
struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
char ip6buf[INET6_ADDRSTRLEN];
KASSERT(ia != NULL, ("DAD entry %p with no address", dp));
if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
/* DAD is not needed for an ifdisabled interface. */
log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of "
"ND6_IFF_IFDISABLED.\n", ifp->if_xname);
goto err;
}
if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto err;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto err;
}
/* Stop DAD if the interface is down even after dad_maxtry attempts. */
if ((dp->dad_ns_tcount > V_dad_maxtry) &&
(((ifp->if_flags & IFF_UP) == 0) ||
((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))) {
nd6log((LOG_INFO, "%s: could not run DAD "
"because the interface was down or not running.\n",
if_name(ifa->ifa_ifp)));
goto err;
}
/* Need more checks? */
if (dp->dad_ns_ocount < dp->dad_count) {
/*
* We have more NS to go. Send NS packet for DAD.
*/
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000, 1);
goto done;
} else {
/*
* We have transmitted a sufficient number of DAD packets.
* See what we've got.
*/
if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0)
/* We've seen an NS or NA, which means DAD has failed. */
nd6_dad_duplicated(ifa, dp);
else if (V_dad_enhanced != 0 &&
dp->dad_ns_lcount > 0 &&
dp->dad_ns_lcount > dp->dad_loopbackprobe) {
/*
* Sec. 4.1 in RFC 7527 requires transmission of
* additional probes until the loopback condition
* becomes clear when a looped back probe is detected.
*/
log(LOG_ERR, "%s: a looped back NS message is "
"detected during DAD for %s. "
"Another DAD probes are being sent.\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, IFA_IN6(ifa)));
dp->dad_loopbackprobe = dp->dad_ns_lcount;
/*
* Send an NS immediately and increase dad_count by
* V_nd6_mmaxtries - 1.
*/
dp->dad_count =
dp->dad_ns_ocount + V_nd6_mmaxtries - 1;
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000,
1);
goto done;
} else {
/*
* We are done with DAD. No NA came, no NS came.
* No duplicate address found. Check IFDISABLED flag
* again in case it was changed between the
* beginning of this function and here.
*/
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0)
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
nd6log((LOG_DEBUG,
"%s: DAD complete for %s - no duplicates found\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
if (dp->dad_ns_lcount > 0)
log(LOG_ERR, "%s: DAD completed while "
"a looped back NS message is detected "
"during DAD for %s.\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, IFA_IN6(ifa)));
}
}
err:
nd6_dad_del(dp);
done:
CURVNET_RESTORE();
}
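The end-of-probing branch of nd6_dad_timer() boils down to three outcomes. A hedged standalone sketch of that decision, using plain counters instead of the real struct dadq:

#include <stdio.h>

enum dad_result { DAD_DUPLICATE, DAD_PROBE_AGAIN, DAD_OK };

/*
 * Any received NS/NA means a duplicate; otherwise, with Enhanced DAD,
 * a newly looped-back probe triggers another round of probes.
 */
static enum dad_result
dad_decide(int ns_icount, int na_icount, int dad_enhanced,
    int ns_lcount, int loopbackprobe)
{
	if (ns_icount > 0 || na_icount > 0)
		return (DAD_DUPLICATE);
	if (dad_enhanced != 0 && ns_lcount > 0 && ns_lcount > loopbackprobe)
		return (DAD_PROBE_AGAIN);
	return (DAD_OK);
}

int
main(void)
{
	printf("%d %d %d\n",
	    dad_decide(1, 0, 1, 0, 0),	/* NS seen -> duplicate */
	    dad_decide(0, 0, 1, 2, 0),	/* looped-back NS -> probe again */
	    dad_decide(0, 0, 1, 0, 0));	/* clean -> address usable */
	return (0);
}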
static void
nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct ifnet *ifp;
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: "
"NS in/out/loopback=%d/%d/%d, NA in=%d\n",
if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount,
dp->dad_na_icount);
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
ia->ia6_flags |= IN6_IFF_DUPLICATED;
ifp = ifa->ifa_ifp;
log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n",
if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
log(LOG_ERR, "%s: manual intervention required\n",
if_name(ifp));
/*
* If the address is a link-local address formed from an interface
* identifier based on the hardware address which is supposed to be
* uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
* operation on the interface SHOULD be disabled.
* [RFC 4862, Section 5.4.5]
*/
if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) {
struct in6_addr in6;
/*
* To avoid over-reaction, we only apply this logic when we are
* very sure that hardware addresses are supposed to be unique.
*/
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_ATM:
case IFT_IEEE1394:
case IFT_INFINIBAND:
in6 = ia->ia_addr.sin6_addr;
if (in6_get_hw_ifid(ifp, &in6) == 0 &&
IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) {
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "%s: possible hardware address "
"duplication detected, disable IPv6\n",
if_name(ifp));
}
break;
}
}
}
static void
nd6_dad_ns_output(struct dadq *dp)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)dp->dad_ifa;
struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
int i;
dp->dad_ns_tcount++;
if ((ifp->if_flags & IFF_UP) == 0) {
return;
}
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
return;
}
dp->dad_ns_ocount++;
if (V_dad_enhanced != 0) {
for (i = 0; i < ND_OPT_NONCE_LEN32; i++)
dp->dad_nonce[i] = arc4random();
/*
* XXXHRS: Note that when DupAddrDetectTransmits > 1, multiple
* NS messages with different nonces can be looped back in an
* unexpected order. The current implementation recognizes only
* the latest nonce on the sender side. In practice it should
* work well in almost all cases.
*/
}
nd6_ns_output(ifp, NULL, NULL, &ia->ia_addr.sin6_addr,
(uint8_t *)&dp->dad_nonce[0]);
}
static void
nd6_dad_ns_input(struct ifaddr *ifa, struct nd_opt_nonce *ndopt_nonce)
{
- struct in6_ifaddr *ia;
- struct ifnet *ifp;
- const struct in6_addr *taddr6;
struct dadq *dp;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_ns_input");
- ia = (struct in6_ifaddr *)ifa;
- ifp = ifa->ifa_ifp;
- taddr6 = &ia->ia_addr.sin6_addr;
/* Ignore Nonce option when Enhanced DAD is disabled. */
if (V_dad_enhanced == 0)
ndopt_nonce = NULL;
dp = nd6_dad_find(ifa, ndopt_nonce);
if (dp == NULL)
return;
dp->dad_ns_icount++;
nd6_dad_rele(dp);
}
static void
nd6_dad_na_input(struct ifaddr *ifa)
{
struct dadq *dp;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_na_input");
dp = nd6_dad_find(ifa, NULL);
if (dp != NULL) {
dp->dad_na_icount++;
nd6_dad_rele(dp);
}
}
Index: head/sys/netinet6/raw_ip6.c
===================================================================
--- head/sys/netinet6/raw_ip6.c (revision 327172)
+++ head/sys/netinet6/raw_ip6.c (revision 327173)
@@ -1,909 +1,899 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/raw_ip6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/send.h>
#include <netipsec/ipsec_support.h>
#include <machine/stdarg.h>
#define satosin6(sa) ((struct sockaddr_in6 *)(sa))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
extern u_long rip_sendspace;
extern u_long rip_recvspace;
VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat);
VNET_PCPUSTAT_SYSINIT(rip6stat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(rip6stat);
#endif /* VIMAGE */
/*
* Hooks for multicast routing. They all default to NULL, so leave them
* uninitialized and rely on the BSS being zeroed.
*/
/*
* The socket used to communicate with the multicast routing daemon.
*/
VNET_DEFINE(struct socket *, ip6_mrouter);
/*
* The various mrouter functions.
*/
int (*ip6_mrouter_set)(struct socket *, struct sockopt *);
int (*ip6_mrouter_get)(struct socket *, struct sockopt *);
int (*ip6_mrouter_done)(void);
int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int (*mrt6_ioctl)(u_long, caddr_t);
/*
* Set up generic address and protocol structures for the raw_input
* routine, then pass them along with the mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct ifnet *ifp;
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *in6p;
struct inpcb *last = NULL;
struct mbuf *opts = NULL;
struct sockaddr_in6 fromsa;
RIP6STAT_INC(rip6s_ipackets);
init_sin6(&fromsa, m, 0); /* general init */
ifp = m->m_pkthdr.rcvif;
INP_INFO_RLOCK(&V_ripcbinfo);
LIST_FOREACH(in6p, &V_ripcb, inp_list) {
/* XXX inp locking */
if ((in6p->inp_vflag & INP_IPV6) == 0)
continue;
if (in6p->inp_ip_p &&
in6p->inp_ip_p != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
continue;
if (jailed_without_vnet(in6p->inp_cred)) {
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
prison_check_ip6(in6p->inp_cred,
&ip6->ip6_dst) != 0)
continue;
}
INP_RLOCK(in6p);
if (in6p->in6p_cksum != -1) {
RIP6STAT_INC(rip6s_isum);
if (in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
INP_RUNLOCK(in6p);
RIP6STAT_INC(rip6s_badsum);
continue;
}
}
/*
* If this raw socket has multicast state, and we
* have received a multicast, check if this socket
* should receive it, as multicast filtering is now
* the responsibility of the transport layer.
*/
if (in6p->in6p_moptions &&
IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If the incoming datagram is for MLD, allow it
* through unconditionally to the raw socket.
*
* Use the M_RTALERT_MLD flag to check for MLD
* traffic without having to inspect the mbuf chain
* more deeply, as all MLDv1/v2 host messages MUST
* contain the Router Alert option.
*
* In the case of MLDv1, we may not have explicitly
* joined the group, and may have set IFF_ALLMULTI
* on the interface. im6o_mc_filter() may discard
* control traffic we actually need to see.
*
* Userland multicast routing daemons should continue to
* filter the control traffic appropriately.
*/
int blocked;
blocked = MCAST_PASS;
if ((m->m_flags & M_RTALERT_MLD) == 0) {
struct sockaddr_in6 mcaddr;
bzero(&mcaddr, sizeof(struct sockaddr_in6));
mcaddr.sin6_len = sizeof(struct sockaddr_in6);
mcaddr.sin6_family = AF_INET6;
mcaddr.sin6_addr = ip6->ip6_dst;
blocked = im6o_mc_filter(in6p->in6p_moptions,
ifp,
(struct sockaddr *)&mcaddr,
(struct sockaddr *)&fromsa);
}
if (blocked != MCAST_PASS) {
IP6STAT_INC(ip6s_notmember);
INP_RUNLOCK(in6p);
continue;
}
}
if (last != NULL) {
struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Check AH/ESP integrity.
*/
if (IPSEC_ENABLED(ipv6)) {
if (n != NULL &&
IPSEC_CHECK_POLICY(ipv6, n, last) != 0) {
m_freem(n);
/* Do not inject data into pcb. */
n = NULL;
}
}
#endif /* IPSEC */
if (n) {
if (last->inp_flags & INP_CONTROLOPTS ||
last->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, n, &opts);
/* strip intermediate headers */
m_adj(n, *offp);
if (sbappendaddr(&last->inp_socket->so_rcv,
(struct sockaddr *)&fromsa,
n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
} else
sorwakeup(last->inp_socket);
opts = NULL;
}
INP_RUNLOCK(last);
}
last = in6p;
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Check AH/ESP integrity.
*/
if (IPSEC_ENABLED(ipv6) && last != NULL &&
IPSEC_CHECK_POLICY(ipv6, m, last) != 0) {
m_freem(m);
IP6STAT_DEC(ip6s_delivered);
/* Do not inject data into pcb. */
INP_RUNLOCK(last);
} else
#endif /* IPSEC */
if (last != NULL) {
if (last->inp_flags & INP_CONTROLOPTS ||
last->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, m, &opts);
/* Strip intermediate headers. */
m_adj(m, *offp);
if (sbappendaddr(&last->inp_socket->so_rcv,
(struct sockaddr *)&fromsa, m, opts) == 0) {
m_freem(m);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
} else
sorwakeup(last->inp_socket);
INP_RUNLOCK(last);
} else {
RIP6STAT_INC(rip6s_nosock);
if (m->m_flags & M_MCAST)
RIP6STAT_INC(rip6s_nosockmcast);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxtp - mtod(m, char *));
}
IP6STAT_DEC(ip6s_delivered);
}
return (IPPROTO_DONE);
}
void
rip6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
- struct ip6_hdr *ip6;
- struct mbuf *m;
- int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/*
* If the parameter is from icmp6, decode it.
*/
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
- m = ip6cp->ip6c_m;
- ip6 = ip6cp->ip6c_ip6;
- off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
- m = NULL;
- ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
(void) in6_pcbnotify(&V_ripcbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
/*
* Generate the IPv6 header and pass the packet to ip6_output(). Tack on
* any options the user may have set up with a control call.
*/
int
rip6_output(struct mbuf *m, struct socket *so, ...)
{
struct mbuf *control;
struct m_tag *mtag;
struct sockaddr_in6 *dstsock;
- struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *in6p;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp;
struct ifnet *oifp = NULL;
int type = 0, code = 0; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
int use_defzone = 0;
int hlim = 0;
struct in6_addr in6a;
va_list ap;
va_start(ap, so);
dstsock = va_arg(ap, struct sockaddr_in6 *);
control = va_arg(ap, struct mbuf *);
va_end(ap);
in6p = sotoinpcb(so);
INP_WLOCK(in6p);
- dst = &dstsock->sin6_addr;
if (control != NULL) {
if ((error = ip6_setpktopts(control, &opt,
in6p->in6p_outputopts, so->so_cred,
so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p->in6p_outputopts;
/*
* Check and convert scope zone ID into internal form.
*
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (!optp || !optp->ip6po_pktinfo ||
!optp->ip6po_pktinfo->ipi6_ifindex)
use_defzone = V_ip6_use_defzone;
if (dstsock->sin6_scope_id == 0 && !use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code to update
* statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
}
M_PREPEND(m, sizeof(*ip6), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Source address selection.
*/
error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred,
scope_ambiguous, &in6a, &hlim);
if (error)
goto bad;
error = prison_check_ip6(in6p->inp_cred, &in6a);
if (error != 0)
goto bad;
ip6->ip6_src = in6a;
ip6->ip6_dst = dstsock->sin6_addr;
/*
* Fill in the rest of the IPv6 header fields.
*/
ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
(in6p->inp_flow & IPV6_FLOWINFO_MASK);
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
(IPV6_VERSION & IPV6_VERSION_MASK);
/*
* ip6_plen will be filled in by ip6_output(), so we do not fill it in here.
*/
ip6->ip6_nxt = in6p->inp_ip_p;
ip6->ip6_hlim = hlim;
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p->in6p_cksum != -1) {
struct mbuf *n;
int off;
u_int16_t *p;
/* Compute checksum. */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p->in6p_cksum;
if (plen < off + 1) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
n = m;
while (n && n->m_len <= off) {
off -= n->m_len;
n = n->m_next;
}
if (!n)
goto bad;
p = (u_int16_t *)(mtod(n, caddr_t) + off);
*p = 0;
*p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
}
/*
* Send RA/RS messages to userland for protection before sending
* them to rtadvd/rtsol.
*/
if ((send_sendso_input_hook != NULL) &&
so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
switch (type) {
case ND_ROUTER_ADVERT:
case ND_ROUTER_SOLICIT:
mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
sizeof(unsigned short), M_NOWAIT);
if (mtag == NULL)
goto bad;
m_tag_prepend(m, mtag);
}
}
error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (oifp)
icmp6_ifoutstat_inc(oifp, type, code);
ICMP6STAT_INC(icp6s_outhist[type]);
} else
RIP6STAT_INC(rip6s_opackets);
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control != NULL) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
INP_WUNLOCK(in6p);
return (error);
}
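For ICMPv6 sockets the checksum offset used above is fixed inside the ICMPv6 header, while other raw protocols take it from the IPV6_CHECKSUM socket option (in6p_cksum). A one-line userland check of that offset, offered only as a sketch:

#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	/* The checksum sits 2 bytes into the ICMPv6 header. */
	printf("ICMPv6 checksum offset: %zu\n",
	    offsetof(struct icmp6_hdr, icmp6_cksum));
	return (0);
}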
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp;
int error;
if (sopt->sopt_level == IPPROTO_ICMPV6)
/*
* XXX: is it better to call icmp6_ctloutput() directly
* from protosw?
*/
return (icmp6_ctloutput(so, sopt));
else if (sopt->sopt_level != IPPROTO_IPV6) {
if (sopt->sopt_level == SOL_SOCKET &&
sopt->sopt_name == SO_SETFIB) {
inp = sotoinpcb(so);
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
INP_WUNLOCK(inp);
return (0);
}
return (EINVAL);
}
error = 0;
switch (sopt->sopt_dir) {
case SOPT_GET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
case SOPT_SET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
}
return (error);
}
static int
rip6_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
struct icmp6_filter *filter;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("rip6_attach: inp != NULL"));
error = priv_check(td, PRIV_NETINET_RAW);
if (error)
return (error);
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
if (filter == NULL)
return (ENOMEM);
INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_ripcbinfo);
free(filter, M_PCB);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_ripcbinfo);
inp->inp_vflag |= INP_IPV6;
inp->inp_ip_p = (long)proto;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1;
inp->in6p_icmp6filt = filter;
ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt);
INP_WUNLOCK(inp);
return (0);
}
static void
rip6_detach(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_detach: inp == NULL"));
if (so == V_ip6_mrouter && ip6_mrouter_done)
ip6_mrouter_done();
/* xxx: RSVP */
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
free(inp->in6p_icmp6filt, M_PCB);
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
/* XXXRW: This can't ever be called. */
static void
rip6_abort(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_abort: inp == NULL"));
soisdisconnected(so);
}
static void
rip6_close(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_close: inp == NULL"));
soisdisconnected(so);
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL"));
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
inp->in6p_faddr = in6addr_any;
rip6_abort(so);
return (0);
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ifa = NULL;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_bind: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0)
return (error);
if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6)
return (EADDRNOTAVAIL);
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL)
return (EADDRNOTAVAIL);
if (ifa != NULL &&
((struct in6_ifaddr *)ifa)->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
ifa_free(ifa);
return (EADDRNOTAVAIL);
}
if (ifa != NULL)
ifa_free(ifa);
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
inp->in6p_laddr = addr->sin6_addr;
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr in6a;
int error = 0, scope_ambiguous = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_connect: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (TAILQ_EMPTY(&V_ifnet))
return (EADDRNOTAVAIL);
if (addr->sin6_family != AF_INET6)
return (EAFNOSUPPORT);
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled.  Unfortunately, some
* applications do not behave as they should, so we need a
* workaround.  Even if an appropriate ID is not determined here,
* we will see if we can determine the outgoing interface; if we
* can, the zone ID is determined based on that interface below.
*/
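/*
 * For example, for a link-local destination such as fe80::1 the
 * application is expected to supply the outgoing interface index
 * in sin6_scope_id; sa6_embedscope() then embeds that zone ID
 * into the address in the kernel-internal (KAME-style) form.
 */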
if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
/* Source address selection. XXX: need pcblookup? */
error = in6_selectsrc_socket(addr, inp->in6p_outputopts,
inp, so->so_cred, scope_ambiguous, &in6a, NULL);
if (error) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (error);
}
inp->in6p_faddr = addr->sin6_addr;
inp->in6p_laddr = in6a;
soisconnected(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return (0);
}
static int
rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int ret;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_send: inp == NULL"));
/* Always copy sockaddr to avoid overwrites. */
/* Unlocked read. */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
m_freem(m);
return (EISCONN);
}
/* XXX */
bzero(&tmp, sizeof(tmp));
tmp.sin6_family = AF_INET6;
tmp.sin6_len = sizeof(struct sockaddr_in6);
INP_RLOCK(inp);
bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
sizeof(struct in6_addr));
INP_RUNLOCK(inp);
dst = &tmp;
} else {
if (nam == NULL) {
m_freem(m);
return (ENOTCONN);
}
if (nam->sa_len != sizeof(struct sockaddr_in6)) {
m_freem(m);
return (EINVAL);
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family == AF_UNSPEC) {
/*
* XXX: we allow this case for backward
* compatibility to buggy applications that
* rely on old (and wrong) kernel behavior.
*/
log(LOG_INFO, "rip6 SEND: address family is "
"unspec. Assume AF_INET6\n");
dst->sin6_family = AF_INET6;
} else if (dst->sin6_family != AF_INET6) {
m_freem(m);
return(EAFNOSUPPORT);
}
}
ret = rip6_output(m, so, dst, control);
return (ret);
}
struct pr_usrreqs rip6_usrreqs = {
.pru_abort = rip6_abort,
.pru_attach = rip6_attach,
.pru_bind = rip6_bind,
.pru_connect = rip6_connect,
.pru_control = in6_control,
.pru_detach = rip6_detach,
.pru_disconnect = rip6_disconnect,
.pru_peeraddr = in6_getpeeraddr,
.pru_send = rip6_send,
.pru_shutdown = rip6_shutdown,
.pru_sockaddr = in6_getsockaddr,
.pru_close = rip6_close,
};
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c (revision 327172)
+++ head/sys/netinet6/udp6_usrreq.c (revision 327173)
@@ -1,1322 +1,1320 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* Copyright (c) 2014 Kevin Lo
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
* $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udplite.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/udp6_var.h>
#include <netinet6/scope6_var.h>
#include <netipsec/ipsec_support.h>
#include <security/mac/mac_framework.h>
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct protosw inetsw[];
static void udp6_detach(struct socket *so);
static int
udp6_append(struct inpcb *inp, struct mbuf *n, int off,
struct sockaddr_in6 *fromsa)
{
struct socket *so;
struct mbuf *opts = NULL, *tmp_opts;
struct udpcb *up;
INP_LOCK_ASSERT(inp);
/*
* Engage the tunneling protocol.
*/
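/*
 * If a tunneling callback is registered, hand the datagram to it
 * without holding the inpcb lock: take a reference, drop the read
 * lock around the callback, then re-lock and let
 * in_pcbrele_rlocked() report whether the pcb was freed in the
 * meantime.
 */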
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
in_pcbref(inp);
INP_RUNLOCK(inp);
(*up->u_tun_func)(n, off, inp, (struct sockaddr *)&fromsa[0],
up->u_tun_ctx);
INP_RLOCK(inp);
return (in_pcbrele_rlocked(inp));
}
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/* Check AH/ESP integrity. */
if (IPSEC_ENABLED(ipv6)) {
if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) {
m_freem(n);
return (0);
}
}
#endif /* IPSEC */
#ifdef MAC
if (mac_inpcb_check_deliver(inp, n) != 0) {
m_freem(n);
return (0);
}
#endif
opts = NULL;
if (inp->inp_flags & INP_CONTROLOPTS ||
inp->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(inp, n, &opts);
if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
tmp_opts = sbcreatecontrol((caddr_t)&fromsa[1],
sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR, IPPROTO_IPV6);
if (tmp_opts) {
if (opts) {
tmp_opts->m_next = opts;
opts = tmp_opts;
} else
opts = tmp_opts;
}
}
m_adj(n, off + sizeof(struct udphdr));
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_rcv);
if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n,
opts) == 0) {
SOCKBUF_UNLOCK(&so->so_rcv);
m_freem(n);
if (opts)
m_freem(opts);
UDPSTAT_INC(udps_fullsock);
} else
sorwakeup_locked(so);
return (0);
}
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ifnet *ifp;
struct ip6_hdr *ip6;
struct udphdr *uh;
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct udpcb *up;
int off = *offp;
int cscov_partial;
int plen, ulen;
struct sockaddr_in6 fromsa[2];
struct m_tag *fwd_tag;
uint16_t uh_sum;
uint8_t nxt;
ifp = m->m_pkthdr.rcvif;
ip6 = mtod(m, struct ip6_hdr *);
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh));
if (!uh)
return (IPPROTO_DONE);
#endif
UDPSTAT_INC(udps_ipackets);
/*
* A destination port of 0 is illegal, per RFC 768.
*/
if (uh->uh_dport == 0)
goto badunlocked;
plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
ulen = ntohs((u_short)uh->uh_ulen);
nxt = proto;
cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0;
if (nxt == IPPROTO_UDPLITE) {
/* Zero means checksum over the complete packet. */
if (ulen == 0)
ulen = plen;
if (ulen == plen)
cscov_partial = 0;
if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) {
/* XXX: What is the right UDPLite MIB counter? */
goto badunlocked;
}
if (uh->uh_sum == 0) {
/* XXX: What is the right UDPLite MIB counter? */
goto badunlocked;
}
} else {
if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) {
UDPSTAT_INC(udps_badlen);
goto badunlocked;
}
if (uh->uh_sum == 0) {
UDPSTAT_INC(udps_nosum);
goto badunlocked;
}
}
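/*
 * Checksum verification: when the interface has already validated
 * the checksum (CSUM_DATA_VALID_IPV6) and full coverage is in
 * effect, finish the computation here, adding the pseudo-header
 * sum if the hardware did not (no CSUM_PSEUDO_HDR); otherwise
 * verify in software over the coverage length.  A nonzero result
 * means the datagram is corrupt.
 */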
if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) &&
!cscov_partial) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
uh_sum = m->m_pkthdr.csum_data;
else
uh_sum = in6_cksum_pseudo(ip6, ulen, nxt,
m->m_pkthdr.csum_data);
uh_sum ^= 0xffff;
} else
uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen);
if (uh_sum != 0) {
UDPSTAT_INC(udps_badsum);
goto badunlocked;
}
/*
* Construct sockaddr format source address.
*/
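/*
 * fromsa[0] carries the packet's source address and port and is
 * what gets delivered to the application; fromsa[1] carries the
 * destination address and port so that udp6_append() can report
 * it via IPV6_ORIGDSTADDR when that option is set.
 */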
init_sin6(&fromsa[0], m, 0);
fromsa[0].sin6_port = uh->uh_sport;
init_sin6(&fromsa[1], m, 1);
fromsa[1].sin6_port = uh->uh_dport;
pcbinfo = udp_get_inpcbinfo(nxt);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct inpcb *last;
struct inpcbhead *pcblist;
struct ip6_moptions *imo;
INP_INFO_RLOCK(pcbinfo);
/*
* In the event that laddr should be set to the link-local
* address (this happens in RIPng), the multicast address
* specified in the received packet will not match laddr. To
* handle this situation, matching is relaxed if the
* receiving interface is the same as one specified in the
* socket and if the destination multicast address matches
* one of the multicast groups specified in the socket.
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf
* here. We need udphdr for IPsec processing so we do that
* later.
*/
pcblist = udp_get_pcblist(nxt);
last = NULL;
LIST_FOREACH(inp, pcblist, inp_list) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (inp->inp_lport != uh->uh_dport)
continue;
if (inp->inp_fport != 0 &&
inp->inp_fport != uh->uh_sport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
&ip6->ip6_dst))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
&ip6->ip6_src) ||
inp->inp_fport != uh->uh_sport)
continue;
}
/*
* XXXRW: Because we weren't holding either the inpcb
* or the hash lock when we checked for a match
* before, we should probably recheck now that the
* inpcb lock is (supposed to be) held.
*/
/*
* Handle socket delivery policy for any-source
* and source-specific multicast. [RFC3678]
*/
imo = inp->in6p_moptions;
if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct sockaddr_in6 mcaddr;
int blocked;
INP_RLOCK(inp);
bzero(&mcaddr, sizeof(struct sockaddr_in6));
mcaddr.sin6_len = sizeof(struct sockaddr_in6);
mcaddr.sin6_family = AF_INET6;
mcaddr.sin6_addr = ip6->ip6_dst;
blocked = im6o_mc_filter(imo, ifp,
(struct sockaddr *)&mcaddr,
(struct sockaddr *)&fromsa[0]);
if (blocked != MCAST_PASS) {
if (blocked == MCAST_NOTGMEMBER)
IP6STAT_INC(ip6s_notmember);
if (blocked == MCAST_NOTSMEMBER ||
blocked == MCAST_MUTED)
UDPSTAT_INC(udps_filtermcast);
INP_RUNLOCK(inp); /* XXX */
continue;
}
INP_RUNLOCK(inp);
}
if (last != NULL) {
struct mbuf *n;
if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
NULL) {
INP_RLOCK(last);
UDP_PROBE(receive, NULL, last, ip6,
last, uh);
if (udp6_append(last, n, off, fromsa))
goto inp_lost;
INP_RUNLOCK(last);
}
}
last = inp;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids
* searching through all pcbs in the common case of a
* non-shared port. It assumes that an application
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
if (last == NULL) {
/*
* No matching pcb found; discard datagram. (No need
* to send an ICMP Port Unreachable for a broadcast
* or multicast datagram.)
*/
UDPSTAT_INC(udps_noport);
UDPSTAT_INC(udps_noportmcast);
goto badheadlocked;
}
INP_RLOCK(last);
INP_INFO_RUNLOCK(pcbinfo);
UDP_PROBE(receive, NULL, last, ip6, last, uh);
if (udp6_append(last, m, off, fromsa) == 0)
INP_RUNLOCK(last);
inp_lost:
return (IPPROTO_DONE);
}
/*
* Locate pcb for datagram.
*/
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
if ((m->m_flags & M_IP6_NEXTHOP) &&
(fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
struct sockaddr_in6 *next_hop6;
next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
/*
* Transparently forwarded. Pretend to be the destination.
* Already got one like this?
*/
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m);
if (!inp) {
/*
* It's new. Try to find the ambushing socket.
* Because we've rewritten the destination address,
* any hardware-generated hash is ignored.
*/
inp = in6_pcblookup(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &next_hop6->sin6_addr,
next_hop6->sin6_port ? htons(next_hop6->sin6_port) :
uh->uh_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
}
/* Remove the tag from the packet. We don't need it anymore. */
m_tag_delete(m, fwd_tag);
m->m_flags &= ~M_IP6_NEXTHOP;
} else
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
m->m_pkthdr.rcvif, m);
if (inp == NULL) {
if (udp_log_in_vain) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_INFO,
"Connection attempt to UDP [%s]:%d from [%s]:%d\n",
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ntohs(uh->uh_dport),
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ntohs(uh->uh_sport));
}
UDPSTAT_INC(udps_noport);
if (m->m_flags & M_MCAST) {
printf("UDP6: M_MCAST is set in a unicast packet.\n");
UDPSTAT_INC(udps_noportmcast);
goto badunlocked;
}
if (V_udp_blackhole)
goto badunlocked;
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
return (IPPROTO_DONE);
}
INP_RLOCK_ASSERT(inp);
up = intoudpcb(inp);
if (cscov_partial) {
if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) {
INP_RUNLOCK(inp);
m_freem(m);
return (IPPROTO_DONE);
}
}
UDP_PROBE(receive, NULL, inp, ip6, inp, uh);
if (udp6_append(inp, m, off, fromsa) == 0)
INP_RUNLOCK(inp);
return (IPPROTO_DONE);
badheadlocked:
INP_INFO_RUNLOCK(pcbinfo);
badunlocked:
if (m)
m_freem(m);
return (IPPROTO_DONE);
}
static void
udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d,
struct inpcbinfo *pcbinfo)
{
struct udphdr uh;
struct ip6_hdr *ip6;
struct mbuf *m;
int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
if (ip6) {
/*
* XXX: We assume that when IPV6 is non-NULL,
* M and OFF are valid.
*/
/* Check if we can safely examine src and dst ports. */
if (m->m_pkthdr.len < off + sizeof(*uhp))
return;
bzero(&uh, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh);
if (!PRC_IS_REDIRECT(cmd)) {
/* Check to see if it's tunneled. */
struct inpcb *inp;
inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst,
uh.uh_dport, &ip6->ip6_src, uh.uh_sport,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
m->m_pkthdr.rcvif, m);
if (inp != NULL) {
struct udpcb *up;
up = intoudpcb(inp);
if (up->u_icmp_func) {
/* Yes it is. */
INP_RUNLOCK(inp);
(*up->u_icmp_func)(cmd, (struct sockaddr *)ip6cp->ip6c_src,
d, up->u_tun_ctx);
return;
} else {
/* No tunneling ICMP handler registered. */
INP_RUNLOCK(inp);
}
}
}
(void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport,
(struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd,
cmdarg, notify);
} else
(void)in6_pcbnotify(pcbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
void
udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo));
}
void
udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo));
}
static int
udp6_getcred(SYSCTL_HANDLER_ARGS)
{
struct xucred xuc;
struct sockaddr_in6 addrs[2];
struct inpcb *inp;
int error;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
if (req->newlen != sizeof(addrs))
return (EINVAL);
if (req->oldlen != sizeof(struct xucred))
return (EINVAL);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
(error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
return (error);
}
inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr,
addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port,
INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
INP_RLOCK_ASSERT(inp);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_cred, &xuc);
INP_RUNLOCK(inp);
} else
error = ENOENT;
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
static int
udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
struct mbuf *control, struct thread *td)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr *laddr, *faddr, in6a;
struct sockaddr_in6 *sin6 = NULL;
int cscov_partial = 0;
int scope_ambiguous = 0;
u_short fport;
int error = 0;
uint8_t nxt;
uint16_t cscov = 0;
struct ip6_pktopts *optp, opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
int flags;
struct sockaddr_in6 tmp;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
if (addr6) {
/* addr6 has been validated in udp6_send(). */
sin6 = (struct sockaddr_in6 *)addr6;
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled.  Unfortunately, some
* applications do not behave as they should, so we need a
* workaround.  Even if an appropriate ID is not determined here,
* we will see if we can determine the outgoing interface; if we
* can, the zone ID is determined based on that interface below.
*/
if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return (error);
}
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
inp->in6p_outputopts, td->td_ucred, nxt)) != 0)
goto release;
optp = &opt;
} else
optp = inp->in6p_outputopts;
if (sin6) {
faddr = &sin6->sin6_addr;
/*
* Since we saw no essential reason for calling in_pcbconnect(),
* we got rid of that logic and instead call in6_selectsrc()
* and in6_pcbsetport() to fill in the local address and the
* local port.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, a situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
!IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
/*
* When the remote address is an IPv4-mapped address,
* the local address must not be a plain IPv6 address,
* since there is no way to map an IPv6 source
* address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
error = in6_selectsrc_socket(sin6, optp, inp,
td->td_ucred, scope_ambiguous, &in6a, NULL);
if (error)
goto release;
laddr = &in6a;
} else
laddr = &inp->in6p_laddr; /* XXX */
if (laddr == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->inp_lport == 0 &&
(error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
/* Undo an address bind that may have occurred. */
inp->in6p_laddr = in6addr_any;
goto release;
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = ENOTCONN;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
* connecting to the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &inp->in6p_laddr;
faddr = &inp->in6p_faddr;
fport = inp->inp_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (nxt == IPPROTO_UDPLITE) {
struct udpcb *up;
up = intoudpcb(inp);
cscov = up->u_txcslen;
if (cscov >= plen)
cscov = 0;
udp6->uh_ulen = htons(cscov);
/*
* For UDP-Lite, checksum coverage length of zero means
* the entire UDPLite packet is covered by the checksum.
*/
cscov_partial = (cscov == 0) ? 0 : 1;
} else if (plen <= 0xffff)
udp6->uh_ulen = htons((u_short)plen);
else
udp6->uh_ulen = 0;
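/*
 * Note that uh_ulen is overloaded on the wire: for UDP it carries
 * the datagram length (or 0 when a jumbogram exceeds 65535
 * bytes), while for UDP-Lite it carries the checksum coverage,
 * with 0 meaning the whole packet is covered.
 */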
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons((u_short)plen);
ip6->ip6_nxt = nxt;
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
if (cscov_partial) {
if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
sizeof(struct ip6_hdr), plen, cscov)) == 0)
udp6->uh_sum = 0xffff;
} else {
udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
#ifdef RSS
{
uint32_t hash_val, hash_type;
uint8_t pr;
pr = inp->inp_socket->so_proto->pr_protocol;
/*
* Calculate an appropriate RSS hash for UDP and
* UDP Lite.
*
* The called function will take care of figuring out
* whether a 2-tuple or 4-tuple hash is required based
* on the currently configured scheme.
*
* Later on, connected-socket values should be
* cached in the inpcb and reused, rather than
* recalculated for every packet.
*
* UDP Lite is a different protocol number and will
* likely end up being hashed as a 2-tuple until
* RSS / NICs grow UDP Lite protocol awareness.
*/
if (rss_proto_software_hash_v6(faddr, laddr, fport,
inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
m->m_pkthdr.flowid = hash_val;
M_HASHTYPE_SET(m, hash_type);
}
}
#endif
flags = 0;
#ifdef RSS
/*
* Don't override with the inp cached flowid.
*
* Until the whole UDP path is vetted, it may actually
* be incorrect.
*/
flags |= IP_NODEFAULTFLOWID;
#endif
UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
UDPSTAT_INC(udps_opackets);
error = ip6_output(m, optp, &inp->inp_route6, flags,
inp->in6p_moptions, NULL, inp);
break;
case AF_INET:
error = EAFNOSUPPORT;
goto release;
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
static void
udp6_abort(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(*pru->pru_abort)(so);
return;
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
}
static int
udp6_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
}
INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
if (error) {
INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1; /* just to be sure */
/*
* XXX: ugly!!
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
* which may match an IPv4-mapped IPv6 address.
*/
inp->inp_ip_ttl = V_ip_defttl;
error = udp_newudpcb(inp);
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(pcbinfo);
return (0);
}
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
INP_WLOCK(inp);
INP_HASH_WLOCK(pcbinfo);
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
struct sockaddr_in6 *sin6_p;
sin6_p = (struct sockaddr_in6 *)nam;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr))
inp->inp_vflag |= INP_IPV4;
#ifdef INET
else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6_p);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = in_pcbbind(inp, (struct sockaddr *)&sin,
td->td_ucred);
goto out;
}
#endif
}
error = in6_pcbbind(inp, nam, td->td_ucred);
#ifdef INET
out:
#endif
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
return (error);
}
static void
udp6_close(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(*pru->pru_disconnect)(so);
return;
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct sockaddr_in6 *sin6;
int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
sin6 = (struct sockaddr_in6 *)nam;
KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
/*
* XXXRW: Need to clarify locking of v4/v6 flags.
*/
INP_WLOCK(inp);
#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
struct sockaddr_in sin;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
if ((inp->inp_vflag & INP_IPV4) == 0) {
error = EAFNOSUPPORT;
goto out;
}
if (inp->inp_faddr.s_addr != INADDR_ANY) {
error = EISCONN;
goto out;
}
in6_sin6_2_sin(&sin, sin6);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = prison_remote_ip4(td->td_ucred, &sin.sin_addr);
if (error != 0)
goto out;
INP_HASH_WLOCK(pcbinfo);
error = in_pcbconnect(inp, (struct sockaddr *)&sin,
td->td_ucred);
INP_HASH_WUNLOCK(pcbinfo);
if (error == 0)
soisconnected(so);
goto out;
} else {
if ((inp->inp_vflag & INP_IPV6) == 0) {
error = EAFNOSUPPORT;
goto out;
}
}
#endif
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = EISCONN;
goto out;
}
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
if (error != 0)
goto out;
INP_HASH_WLOCK(pcbinfo);
error = in6_pcbconnect(inp, nam, td->td_ucred);
INP_HASH_WUNLOCK(pcbinfo);
if (error == 0)
soisconnected(so);
out:
INP_WUNLOCK(inp);
return (error);
}
static void
udp6_detach(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
struct udpcb *up;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
static int
udp6_disconnect(struct socket *so)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
- int error;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
INP_WUNLOCK(inp);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
(void)(*pru->pru_disconnect)(so);
return (0);
}
#endif
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
- error = ENOTCONN;
- goto out;
+ INP_WUNLOCK(inp);
+ return (ENOTCONN);
}
INP_HASH_WLOCK(pcbinfo);
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
INP_HASH_WUNLOCK(pcbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
SOCK_UNLOCK(so);
-out:
INP_WUNLOCK(inp);
return (0);
}
static int
udp6_send(struct socket *so, int flags, struct mbuf *m,
struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error = 0;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
INP_WLOCK(inp);
if (addr) {
if (addr->sa_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
goto bad;
}
if (addr->sa_family != AF_INET6) {
error = EAFNOSUPPORT;
goto bad;
}
}
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
int hasv4addr;
struct sockaddr_in6 *sin6 = NULL;
if (addr == NULL)
hasv4addr = (inp->inp_vflag & INP_IPV4);
else {
sin6 = (struct sockaddr_in6 *)addr;
hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
? 1 : 0;
}
if (hasv4addr) {
struct pr_usrreqs *pru;
uint8_t nxt;
nxt = (inp->inp_socket->so_proto->pr_protocol ==
IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE;
/*
* XXXRW: We release UDP-layer locks before calling
* udp_send() in order to avoid recursion. However,
* this does mean there is a short window where inp's
* fields are unstable. Could this lead to a
* potential race in which the factors causing us to
* select the UDPv4 output routine are invalidated?
*/
INP_WUNLOCK(inp);
if (sin6)
in6_sin6_2_sin_in_sock(addr);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
/* addr will just be freed in sendit(). */
return ((*pru->pru_send)(so, flags, m, addr, control,
td));
}
}
#endif
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
INP_HASH_WLOCK(pcbinfo);
error = udp6_output(inp, m, addr, control, td);
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
return (error);
bad:
INP_WUNLOCK(inp);
m_freem(m);
return (error);
}
struct pr_usrreqs udp6_usrreqs = {
.pru_abort = udp6_abort,
.pru_attach = udp6_attach,
.pru_bind = udp6_bind,
.pru_connect = udp6_connect,
.pru_control = in6_control,
.pru_detach = udp6_detach,
.pru_disconnect = udp6_disconnect,
.pru_peeraddr = in6_mapped_peeraddr,
.pru_send = udp6_send,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_soreceive = soreceive_dgram,
.pru_sosend = sosend_dgram,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp6_close
};
Index: head/sys/netipsec/key.c
===================================================================
--- head/sys/netipsec/key.c (revision 327172)
+++ head/sys/netipsec/key.c (revision 327173)
@@ -1,8454 +1,8449 @@
/* $FreeBSD$ */
/* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code refers to RFC 2367.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fnv_hash.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/syslog.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/udp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#include <net/pfkeyv2.h>
#include <netipsec/keydb.h>
#include <netipsec/key.h>
#include <netipsec/keysock.h>
#include <netipsec/key_debug.h>
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/xform.h>
#include <machine/in_cksum.h>
#include <machine/stdarg.h>
/* randomness */
#include <sys/random.h>
#define FULLMASK 0xff
#define _BITS(bytes) ((bytes) << 3)
/*
* Note on SA reference counting:
* - SAs that are not in DEAD state will have (total external references + 1)
*   as the value of the reference count field.  They cannot be freed and are
*   referenced from the SA header.
* - SAs that are in DEAD state will have (total external references)
*   in the reference count field.  They are ready to be freed.  The reference
*   from the SA header is removed in key_delsav(), when the reference count
*   field hits 0 (i.e., no external reference other than from the SA header).
*/
VNET_DEFINE(u_int32_t, key_debug_level) = 0;
static VNET_DEFINE(u_int, key_spi_trycnt) = 1000;
static VNET_DEFINE(u_int32_t, key_spi_minval) = 0x100;
static VNET_DEFINE(u_int32_t, key_spi_maxval) = 0x0fffffff; /* XXX */
static VNET_DEFINE(u_int32_t, policy_id) = 0;
/* interval to initialize randseed, 1(m) */
static VNET_DEFINE(u_int, key_int_random) = 60;
/* interval to expire acquiring, 30(s)*/
static VNET_DEFINE(u_int, key_larval_lifetime) = 30;
/* counter for blocking SADB_ACQUIRE.*/
static VNET_DEFINE(int, key_blockacq_count) = 10;
/* lifetime for blocking SADB_ACQUIRE.*/
static VNET_DEFINE(int, key_blockacq_lifetime) = 20;
/* preferred old sa rather than new sa.*/
static VNET_DEFINE(int, key_preferred_oldsa) = 1;
#define V_key_spi_trycnt VNET(key_spi_trycnt)
#define V_key_spi_minval VNET(key_spi_minval)
#define V_key_spi_maxval VNET(key_spi_maxval)
#define V_policy_id VNET(policy_id)
#define V_key_int_random VNET(key_int_random)
#define V_key_larval_lifetime VNET(key_larval_lifetime)
#define V_key_blockacq_count VNET(key_blockacq_count)
#define V_key_blockacq_lifetime VNET(key_blockacq_lifetime)
#define V_key_preferred_oldsa VNET(key_preferred_oldsa)
static VNET_DEFINE(u_int32_t, acq_seq) = 0;
#define V_acq_seq VNET(acq_seq)
static VNET_DEFINE(uint32_t, sp_genid) = 0;
#define V_sp_genid VNET(sp_genid)
/* SPD */
TAILQ_HEAD(secpolicy_queue, secpolicy);
LIST_HEAD(secpolicy_list, secpolicy);
static VNET_DEFINE(struct secpolicy_queue, sptree[IPSEC_DIR_MAX]);
static VNET_DEFINE(struct secpolicy_queue, sptree_ifnet[IPSEC_DIR_MAX]);
static struct rmlock sptree_lock;
#define V_sptree VNET(sptree)
#define V_sptree_ifnet VNET(sptree_ifnet)
#define SPTREE_LOCK_INIT() rm_init(&sptree_lock, "sptree")
#define SPTREE_LOCK_DESTROY() rm_destroy(&sptree_lock)
#define SPTREE_RLOCK_TRACKER struct rm_priotracker sptree_tracker
#define SPTREE_RLOCK() rm_rlock(&sptree_lock, &sptree_tracker)
#define SPTREE_RUNLOCK() rm_runlock(&sptree_lock, &sptree_tracker)
#define SPTREE_RLOCK_ASSERT() rm_assert(&sptree_lock, RA_RLOCKED)
#define SPTREE_WLOCK() rm_wlock(&sptree_lock)
#define SPTREE_WUNLOCK() rm_wunlock(&sptree_lock)
#define SPTREE_WLOCK_ASSERT() rm_assert(&sptree_lock, RA_WLOCKED)
#define SPTREE_UNLOCK_ASSERT() rm_assert(&sptree_lock, RA_UNLOCKED)
/* Hash table for looking up an SP by its unique id */
static VNET_DEFINE(struct secpolicy_list *, sphashtbl);
static VNET_DEFINE(u_long, sphash_mask);
#define V_sphashtbl VNET(sphashtbl)
#define V_sphash_mask VNET(sphash_mask)
#define SPHASH_NHASH_LOG2 7
#define SPHASH_NHASH (1 << SPHASH_NHASH_LOG2)
#define SPHASH_HASHVAL(id) (key_u32hash(id) & V_sphash_mask)
#define SPHASH_HASH(id) &V_sphashtbl[SPHASH_HASHVAL(id)]
/* SAD */
TAILQ_HEAD(secashead_queue, secashead);
LIST_HEAD(secashead_list, secashead);
static VNET_DEFINE(struct secashead_queue, sahtree);
static struct rmlock sahtree_lock;
#define V_sahtree VNET(sahtree)
#define SAHTREE_LOCK_INIT() rm_init(&sahtree_lock, "sahtree")
#define SAHTREE_LOCK_DESTROY() rm_destroy(&sahtree_lock)
#define SAHTREE_RLOCK_TRACKER struct rm_priotracker sahtree_tracker
#define SAHTREE_RLOCK() rm_rlock(&sahtree_lock, &sahtree_tracker)
#define SAHTREE_RUNLOCK() rm_runlock(&sahtree_lock, &sahtree_tracker)
#define SAHTREE_RLOCK_ASSERT() rm_assert(&sahtree_lock, RA_RLOCKED)
#define SAHTREE_WLOCK() rm_wlock(&sahtree_lock)
#define SAHTREE_WUNLOCK() rm_wunlock(&sahtree_lock)
#define SAHTREE_WLOCK_ASSERT() rm_assert(&sahtree_lock, RA_WLOCKED)
#define SAHTREE_UNLOCK_ASSERT() rm_assert(&sahtree_lock, RA_UNLOCKED)
/* Hash table for lookup in SAD using SA addresses */
static VNET_DEFINE(struct secashead_list *, sahaddrhashtbl);
static VNET_DEFINE(u_long, sahaddrhash_mask);
#define V_sahaddrhashtbl VNET(sahaddrhashtbl)
#define V_sahaddrhash_mask VNET(sahaddrhash_mask)
#define SAHHASH_NHASH_LOG2 7
#define SAHHASH_NHASH (1 << SAHHASH_NHASH_LOG2)
#define SAHADDRHASH_HASHVAL(saidx) \
(key_saidxhash(saidx) & V_sahaddrhash_mask)
#define SAHADDRHASH_HASH(saidx) \
&V_sahaddrhashtbl[SAHADDRHASH_HASHVAL(saidx)]
/* Hash table for lookup in SAD using SPI */
LIST_HEAD(secasvar_list, secasvar);
static VNET_DEFINE(struct secasvar_list *, savhashtbl);
static VNET_DEFINE(u_long, savhash_mask);
#define V_savhashtbl VNET(savhashtbl)
#define V_savhash_mask VNET(savhash_mask)
#define SAVHASH_NHASH_LOG2 7
#define SAVHASH_NHASH (1 << SAVHASH_NHASH_LOG2)
#define SAVHASH_HASHVAL(spi) (key_u32hash(spi) & V_savhash_mask)
#define SAVHASH_HASH(spi) &V_savhashtbl[SAVHASH_HASHVAL(spi)]
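/*
 * The SAD is thus indexed two ways: SAHADDRHASH_HASH() buckets SA
 * heads by their source/destination address pair, while
 * SAVHASH_HASH() buckets individual SAs by SPI.
 */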
static uint32_t
key_saidxhash(const struct secasindex *saidx)
{
uint32_t hval;
hval = fnv_32_buf(&saidx->proto, sizeof(saidx->proto),
FNV1_32_INIT);
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
hval = fnv_32_buf(&saidx->src.sin.sin_addr,
sizeof(in_addr_t), hval);
hval = fnv_32_buf(&saidx->dst.sin.sin_addr,
sizeof(in_addr_t), hval);
break;
#endif
#ifdef INET6
case AF_INET6:
hval = fnv_32_buf(&saidx->src.sin6.sin6_addr,
sizeof(struct in6_addr), hval);
hval = fnv_32_buf(&saidx->dst.sin6.sin6_addr,
sizeof(struct in6_addr), hval);
break;
#endif
default:
hval = 0;
ipseclog((LOG_DEBUG, "%s: unknown address family %d",
__func__, saidx->dst.sa.sa_family));
}
return (hval);
}
static uint32_t
key_u32hash(uint32_t val)
{
return (fnv_32_buf(&val, sizeof(val), FNV1_32_INIT));
}
/* registered list */
static VNET_DEFINE(LIST_HEAD(_regtree, secreg), regtree[SADB_SATYPE_MAX + 1]);
#define V_regtree VNET(regtree)
static struct mtx regtree_lock;
#define REGTREE_LOCK_INIT() \
mtx_init(&regtree_lock, "regtree", "fast ipsec regtree", MTX_DEF)
#define REGTREE_LOCK_DESTROY() mtx_destroy(&regtree_lock)
#define REGTREE_LOCK() mtx_lock(&regtree_lock)
#define REGTREE_UNLOCK() mtx_unlock(&regtree_lock)
#define REGTREE_LOCK_ASSERT() mtx_assert(&regtree_lock, MA_OWNED)
/* Acquiring list */
LIST_HEAD(secacq_list, secacq);
static VNET_DEFINE(struct secacq_list, acqtree);
#define V_acqtree VNET(acqtree)
static struct mtx acq_lock;
#define ACQ_LOCK_INIT() \
mtx_init(&acq_lock, "acqtree", "ipsec SA acquiring list", MTX_DEF)
#define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock)
#define ACQ_LOCK() mtx_lock(&acq_lock)
#define ACQ_UNLOCK() mtx_unlock(&acq_lock)
#define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED)
/* Hash table for lookup in ACQ list using SA addresses */
static VNET_DEFINE(struct secacq_list *, acqaddrhashtbl);
static VNET_DEFINE(u_long, acqaddrhash_mask);
#define V_acqaddrhashtbl VNET(acqaddrhashtbl)
#define V_acqaddrhash_mask VNET(acqaddrhash_mask)
/* Hash table for lookup in ACQ list using SEQ number */
static VNET_DEFINE(struct secacq_list *, acqseqhashtbl);
static VNET_DEFINE(u_long, acqseqhash_mask);
#define V_acqseqhashtbl VNET(acqseqhashtbl)
#define V_acqseqhash_mask VNET(acqseqhash_mask)
#define ACQHASH_NHASH_LOG2 7
#define ACQHASH_NHASH (1 << ACQHASH_NHASH_LOG2)
#define ACQADDRHASH_HASHVAL(saidx) \
(key_saidxhash(saidx) & V_acqaddrhash_mask)
#define ACQSEQHASH_HASHVAL(seq) \
(key_u32hash(seq) & V_acqseqhash_mask)
#define ACQADDRHASH_HASH(saidx) \
&V_acqaddrhashtbl[ACQADDRHASH_HASHVAL(saidx)]
#define ACQSEQHASH_HASH(seq) \
&V_acqseqhashtbl[ACQSEQHASH_HASHVAL(seq)]
/* SP acquiring list */
static VNET_DEFINE(LIST_HEAD(_spacqtree, secspacq), spacqtree);
#define V_spacqtree VNET(spacqtree)
static struct mtx spacq_lock;
#define SPACQ_LOCK_INIT() \
mtx_init(&spacq_lock, "spacqtree", \
"fast ipsec security policy acquire list", MTX_DEF)
#define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock)
#define SPACQ_LOCK() mtx_lock(&spacq_lock)
#define SPACQ_UNLOCK() mtx_unlock(&spacq_lock)
#define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED)
static const int minsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_PROXY */
sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */
sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */
sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */
sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAI */
sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAR */
sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
sizeof(struct sadb_x_sa_replay), /* SADB_X_EXT_SA_REPLAY */
sizeof(struct sadb_address), /* SADB_X_EXT_NEW_ADDRESS_SRC */
sizeof(struct sadb_address), /* SADB_X_EXT_NEW_ADDRESS_DST */
};
_Static_assert(sizeof(minsize)/sizeof(int) == SADB_EXT_MAX + 1, "minsize size mismatch");
static const int maxsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
0, /* SADB_EXT_ADDRESS_SRC */
0, /* SADB_EXT_ADDRESS_DST */
0, /* SADB_EXT_ADDRESS_PROXY */
0, /* SADB_EXT_KEY_AUTH */
0, /* SADB_EXT_KEY_ENCRYPT */
0, /* SADB_EXT_IDENTITY_SRC */
0, /* SADB_EXT_IDENTITY_DST */
0, /* SADB_EXT_SENSITIVITY */
0, /* SADB_EXT_PROPOSAL */
0, /* SADB_EXT_SUPPORTED_AUTH */
0, /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
0, /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
0, /* SADB_X_EXT_NAT_T_OAI */
0, /* SADB_X_EXT_NAT_T_OAR */
sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
sizeof(struct sadb_x_sa_replay), /* SADB_X_EXT_SA_REPLAY */
0, /* SADB_X_EXT_NEW_ADDRESS_SRC */
0, /* SADB_X_EXT_NEW_ADDRESS_DST */
};
_Static_assert(sizeof(maxsize)/sizeof(int) == SADB_EXT_MAX + 1, "maxsize size mismatch");
/*
* Internal values for SA flags:
* SADB_X_EXT_F_CLONED means that SA was cloned by key_updateaddresses,
* thus we will not free most of the SA contents in key_delsav().
*/
#define SADB_X_EXT_F_CLONED 0x80000000
#define SADB_CHECKLEN(_mhp, _ext) \
((_mhp)->extlen[(_ext)] < minsize[(_ext)] || (maxsize[(_ext)] != 0 && \
((_mhp)->extlen[(_ext)] > maxsize[(_ext)])))
#define SADB_CHECKHDR(_mhp, _ext) ((_mhp)->ext[(_ext)] == NULL)
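/*
 * SADB_CHECKHDR() evaluates to true when the given extension is
 * missing from a parsed PF_KEY message, and SADB_CHECKLEN() to
 * true when the extension's length is below minsize[] or, for
 * fixed-size extensions (nonzero maxsize[]), above maxsize[].
 */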
static VNET_DEFINE(int, ipsec_esp_keymin) = 256;
static VNET_DEFINE(int, ipsec_esp_auth) = 0;
static VNET_DEFINE(int, ipsec_ah_keymin) = 128;
#define V_ipsec_esp_keymin VNET(ipsec_esp_keymin)
#define V_ipsec_esp_auth VNET(ipsec_esp_auth)
#define V_ipsec_ah_keymin VNET(ipsec_ah_keymin)
#ifdef IPSEC_DEBUG
VNET_DEFINE(int, ipsec_debug) = 1;
#else
VNET_DEFINE(int, ipsec_debug) = 0;
#endif
#ifdef INET
SYSCTL_DECL(_net_inet_ipsec);
SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
"Enable IPsec debugging output when set.");
#endif
#ifdef INET6
SYSCTL_DECL(_net_inet6_ipsec6);
SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
"Enable IPsec debugging output when set.");
#endif
SYSCTL_DECL(_net_key);
SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_debug_level), 0, "");
/* max number of tries when choosing an SPI value */
SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_trycnt), 0, "");
/* minimum spi value to allocate automatically. */
SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_minval), 0, "");
/* maximum spi value to allocate automatically. */
SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_maxval), 0, "");
/* interval to initialize randseed */
SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_int_random), 0, "");
/* lifetime for larval SA */
SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_larval_lifetime), 0, "");
/* counter for blocking to send SADB_ACQUIRE to IKEd */
SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_count), 0, "");
/* lifetime for blocking to send SADB_ACQUIRE to IKEd */
SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, "");
/* ESP auth */
SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth), 0, "");
/* minimum ESP key length */
SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin), 0, "");
/* minimum AH key length */
SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin), 0, "");
/* prefer an old SA rather than a new SA */
SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa), 0, "");
#define __LIST_CHAINED(elm) \
(!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL))
MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association");
MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head");
MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy");
MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request");
MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous");
MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire");
MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire");
static VNET_DEFINE(uma_zone_t, key_lft_zone);
#define V_key_lft_zone VNET(key_lft_zone)
static LIST_HEAD(xforms_list, xformsw) xforms = LIST_HEAD_INITIALIZER();
static struct mtx xforms_lock;
#define XFORMS_LOCK_INIT() \
mtx_init(&xforms_lock, "xforms_list", "IPsec transforms list", MTX_DEF)
#define XFORMS_LOCK_DESTROY() mtx_destroy(&xforms_lock)
#define XFORMS_LOCK() mtx_lock(&xforms_lock)
#define XFORMS_UNLOCK() mtx_unlock(&xforms_lock)
/*
* Set parameters into a secpolicyindex buffer.
* The caller must allocate the secpolicyindex buffer passed to this macro.
*/
#define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \
do { \
bzero((idx), sizeof(struct secpolicyindex)); \
(idx)->dir = (_dir); \
(idx)->prefs = (ps); \
(idx)->prefd = (pd); \
(idx)->ul_proto = (ulp); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
} while (0)
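/*
* Example (see key_spdadd()): build a secpolicyindex on the stack from the
* SADB address extensions of a message:
* struct secpolicyindex spidx;
* KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1,
*     src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen,
*     src0->sadb_address_proto, &spidx);
*/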
/*
* Set parameters into a secasindex buffer.
* The caller must allocate the secasindex buffer before using this macro.
*/
#define KEY_SETSECASIDX(p, m, r, s, d, idx) \
do { \
bzero((idx), sizeof(struct secasindex)); \
(idx)->proto = (p); \
(idx)->mode = (m); \
(idx)->reqid = (r); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
key_porttosaddr(&(idx)->src.sa, 0); \
key_porttosaddr(&(idx)->dst.sa, 0); \
} while (0)
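/*
* Example (see key_allocsa_tunnel()): build a secasindex for a tunnel-mode
* lookup from a pair of addresses:
* struct secasindex saidx;
* KEY_SETSECASIDX(proto, IPSEC_MODE_TUNNEL, 0, &src->sa, &dst->sa, &saidx);
*/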
/* key statistics */
struct _keystat {
u_long getspi_count; /* the average number of attempts to get a new SPI */
} keystat;
struct sadb_msghdr {
struct sadb_msg *msg;
struct sadb_ext *ext[SADB_EXT_MAX + 1];
int extoff[SADB_EXT_MAX + 1];
int extlen[SADB_EXT_MAX + 1];
};
static struct supported_ealgs {
int sadb_alg;
const struct enc_xform *xform;
} supported_ealgs[] = {
{ SADB_EALG_DESCBC, &enc_xform_des },
{ SADB_EALG_3DESCBC, &enc_xform_3des },
{ SADB_X_EALG_AES, &enc_xform_rijndael128 },
{ SADB_X_EALG_BLOWFISHCBC, &enc_xform_blf },
{ SADB_X_EALG_CAST128CBC, &enc_xform_cast5 },
{ SADB_EALG_NULL, &enc_xform_null },
{ SADB_X_EALG_CAMELLIACBC, &enc_xform_camellia },
{ SADB_X_EALG_AESCTR, &enc_xform_aes_icm },
{ SADB_X_EALG_AESGCM16, &enc_xform_aes_nist_gcm },
{ SADB_X_EALG_AESGMAC, &enc_xform_aes_nist_gmac },
};
static struct supported_aalgs {
int sadb_alg;
const struct auth_hash *xform;
} supported_aalgs[] = {
{ SADB_X_AALG_NULL, &auth_hash_null },
{ SADB_AALG_MD5HMAC, &auth_hash_hmac_md5 },
{ SADB_AALG_SHA1HMAC, &auth_hash_hmac_sha1 },
{ SADB_X_AALG_RIPEMD160HMAC, &auth_hash_hmac_ripemd_160 },
{ SADB_X_AALG_MD5, &auth_hash_key_md5 },
{ SADB_X_AALG_SHA, &auth_hash_key_sha1 },
{ SADB_X_AALG_SHA2_256, &auth_hash_hmac_sha2_256 },
{ SADB_X_AALG_SHA2_384, &auth_hash_hmac_sha2_384 },
{ SADB_X_AALG_SHA2_512, &auth_hash_hmac_sha2_512 },
{ SADB_X_AALG_AES128GMAC, &auth_hash_nist_gmac_aes_128 },
{ SADB_X_AALG_AES192GMAC, &auth_hash_nist_gmac_aes_192 },
{ SADB_X_AALG_AES256GMAC, &auth_hash_nist_gmac_aes_256 },
};
static struct supported_calgs {
int sadb_alg;
const struct comp_algo *xform;
} supported_calgs[] = {
{ SADB_X_CALG_DEFLATE, &comp_algo_deflate },
};
#ifndef IPSEC_DEBUG2
static struct callout key_timer;
#endif
static void key_unlink(struct secpolicy *);
static struct secpolicy *key_getsp(struct secpolicyindex *);
static struct secpolicy *key_getspbyid(u_int32_t);
static struct mbuf *key_gather_mbuf(struct mbuf *,
const struct sadb_msghdr *, int, int, ...);
static int key_spdadd(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static uint32_t key_getnewspid(void);
static int key_spddelete(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spddelete2(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spdget(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spdflush(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_spddump(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static struct mbuf *key_setdumpsp(struct secpolicy *,
u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_sp2mbuf(struct secpolicy *);
static size_t key_getspreqmsglen(struct secpolicy *);
static int key_spdexpire(struct secpolicy *);
static struct secashead *key_newsah(struct secasindex *);
static void key_freesah(struct secashead **);
static void key_delsah(struct secashead *);
static struct secasvar *key_newsav(const struct sadb_msghdr *,
struct secasindex *, uint32_t, int *);
static void key_delsav(struct secasvar *);
static void key_unlinksav(struct secasvar *);
static struct secashead *key_getsah(struct secasindex *);
static int key_checkspidup(uint32_t);
static struct secasvar *key_getsavbyspi(uint32_t);
static int key_setnatt(struct secasvar *, const struct sadb_msghdr *);
static int key_setsaval(struct secasvar *, const struct sadb_msghdr *);
static int key_updatelifetimes(struct secasvar *, const struct sadb_msghdr *);
static int key_updateaddresses(struct socket *, struct mbuf *,
const struct sadb_msghdr *, struct secasvar *, struct secasindex *);
static struct mbuf *key_setdumpsa(struct secasvar *, u_int8_t,
u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_setsadbmsg(u_int8_t, u_int16_t, u_int8_t,
u_int32_t, pid_t, u_int16_t);
static struct mbuf *key_setsadbsa(struct secasvar *);
static struct mbuf *key_setsadbaddr(u_int16_t,
const struct sockaddr *, u_int8_t, u_int16_t);
static struct mbuf *key_setsadbxport(u_int16_t, u_int16_t);
static struct mbuf *key_setsadbxtype(u_int16_t);
static struct mbuf *key_setsadbxsa2(u_int8_t, u_int32_t, u_int32_t);
static struct mbuf *key_setsadbxsareplay(u_int32_t);
static struct mbuf *key_setsadbxpolicy(u_int16_t, u_int8_t,
u_int32_t, u_int32_t);
static struct seckey *key_dup_keymsg(const struct sadb_key *, size_t,
struct malloc_type *);
static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src,
struct malloc_type *);
/* flags for key_cmpsaidx() */
#define CMP_HEAD 1 /* protocol, addresses. */
#define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */
#define CMP_REQID 3 /* additionally HEAD, reqid. */
#define CMP_EXACTLY 4 /* all elements. */
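/*
* key_cmpsaidx() compares two SA indexes with the strictness selected by one
* of the flags above; e.g. key_allocsa_policy() matches SAHs with
* CMP_MODE_REQID, which takes protocol, addresses, mode and reqid into
* account.
*/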
static int key_cmpsaidx(const struct secasindex *,
const struct secasindex *, int);
static int key_cmpspidx_exactly(struct secpolicyindex *,
struct secpolicyindex *);
static int key_cmpspidx_withmask(struct secpolicyindex *,
struct secpolicyindex *);
static int key_bbcmp(const void *, const void *, u_int);
static uint8_t key_satype2proto(uint8_t);
static uint8_t key_proto2satype(uint8_t);
static int key_getspi(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static uint32_t key_do_getnewspi(struct sadb_spirange *, struct secasindex *);
static int key_update(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_add(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_setident(struct secashead *, const struct sadb_msghdr *);
static struct mbuf *key_getmsgbuf_x1(struct mbuf *,
const struct sadb_msghdr *);
static int key_delete(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_delete_all(struct socket *, struct mbuf *,
const struct sadb_msghdr *, struct secasindex *);
static void key_delete_xform(const struct xformsw *);
static int key_get(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static void key_getcomb_setlifetime(struct sadb_comb *);
static struct mbuf *key_getcomb_ealg(void);
static struct mbuf *key_getcomb_ah(void);
static struct mbuf *key_getcomb_ipcomp(void);
static struct mbuf *key_getprop(const struct secasindex *);
static int key_acquire(const struct secasindex *, struct secpolicy *);
static uint32_t key_newacq(const struct secasindex *, int *);
static uint32_t key_getacq(const struct secasindex *, int *);
static int key_acqdone(const struct secasindex *, uint32_t);
static int key_acqreset(uint32_t);
static struct secspacq *key_newspacq(struct secpolicyindex *);
static struct secspacq *key_getspacq(struct secpolicyindex *);
static int key_acquire2(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_register(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_expire(struct secasvar *, int);
static int key_flush(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_dump(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_promisc(struct socket *, struct mbuf *,
const struct sadb_msghdr *);
static int key_senderror(struct socket *, struct mbuf *, int);
static int key_validate_ext(const struct sadb_ext *, int);
static int key_align(struct mbuf *, struct sadb_msghdr *);
static struct mbuf *key_setlifetime(struct seclifetime *, uint16_t);
static struct mbuf *key_setkey(struct seckey *, uint16_t);
static int xform_init(struct secasvar *, u_short);
#define DBG_IPSEC_INITREF(t, p) do { \
refcount_init(&(p)->refcnt, 1); \
KEYDBG(KEY_STAMP, \
printf("%s: Initialize refcnt %s(%p) = %u\n", \
__func__, #t, (p), (p)->refcnt)); \
} while (0)
#define DBG_IPSEC_ADDREF(t, p) do { \
refcount_acquire(&(p)->refcnt); \
KEYDBG(KEY_STAMP, \
printf("%s: Acquire refcnt %s(%p) -> %u\n", \
__func__, #t, (p), (p)->refcnt)); \
} while (0)
#define DBG_IPSEC_DELREF(t, p) do { \
KEYDBG(KEY_STAMP, \
printf("%s: Release refcnt %s(%p) -> %u\n", \
__func__, #t, (p), (p)->refcnt - 1)); \
refcount_release(&(p)->refcnt); \
} while (0)
#define IPSEC_INITREF(t, p) refcount_init(&(p)->refcnt, 1)
#define IPSEC_ADDREF(t, p) refcount_acquire(&(p)->refcnt)
#define IPSEC_DELREF(t, p) refcount_release(&(p)->refcnt)
#define SP_INITREF(p) IPSEC_INITREF(SP, p)
#define SP_ADDREF(p) IPSEC_ADDREF(SP, p)
#define SP_DELREF(p) IPSEC_DELREF(SP, p)
#define SAH_INITREF(p) IPSEC_INITREF(SAH, p)
#define SAH_ADDREF(p) IPSEC_ADDREF(SAH, p)
#define SAH_DELREF(p) IPSEC_DELREF(SAH, p)
#define SAV_INITREF(p) IPSEC_INITREF(SAV, p)
#define SAV_ADDREF(p) IPSEC_ADDREF(SAV, p)
#define SAV_DELREF(p) IPSEC_DELREF(SAV, p)
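/*
* Reference counting protocol: *_ADDREF() acquires an additional reference,
* *_DELREF() drops one and returns non-zero only when the last reference is
* released. E.g. key_freesav() below:
* if (SAV_DELREF(sav) == 0)
*         return;
* key_delsav(sav);
*/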
/*
* Update the refcnt while holding the SPTREE lock.
*/
void
key_addref(struct secpolicy *sp)
{
SP_ADDREF(sp);
}
/*
* Return 0 when there are known to be no SPs for the specified
* direction. Otherwise return 1. This is used by IPsec code
* to optimize performance.
*/
int
key_havesp(u_int dir)
{
return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ?
TAILQ_FIRST(&V_sptree[dir]) != NULL : 1);
}
/* %%% IPsec policy management */
/*
* Return current SPDB generation.
*/
uint32_t
key_getspgen(void)
{
return (V_sp_genid);
}
void
key_bumpspgen(void)
{
V_sp_genid++;
}
static int
key_checksockaddrs(struct sockaddr *src, struct sockaddr *dst)
{
/* family match */
if (src->sa_family != dst->sa_family)
return (EINVAL);
/* sa_len match */
if (src->sa_len != dst->sa_len)
return (EINVAL);
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
#endif
#ifdef INET6
case AF_INET6:
if (src->sa_len != sizeof(struct sockaddr_in6))
return (EINVAL);
break;
#endif
default:
return (EAFNOSUPPORT);
}
return (0);
}
/*
* Allocate an SP for an OUTBOUND or INBOUND packet.
* Must call key_freesp() later.
* OUT: NULL: not found
* others: found and return the pointer.
*/
struct secpolicy *
key_allocsp(struct secpolicyindex *spidx, u_int dir)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
SPTREE_RLOCK();
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
if (key_cmpspidx_withmask(&sp->spidx, spidx)) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
if (sp != NULL) { /* found a SPD entry */
sp->lastused = time_second;
KEYDBG(IPSEC_STAMP,
printf("%s: return SP(%p)\n", __func__, sp));
KEYDBG(IPSEC_DATA, kdebug_secpolicy(sp));
} else {
KEYDBG(IPSEC_DATA,
printf("%s: lookup failed for ", __func__);
kdebug_secpolicyindex(spidx, NULL));
}
return (sp);
}
/*
* Allocate an SA entry for an *INBOUND* or *OUTBOUND* TCP packet that is,
* or should be, signed with the TCP-MD5 signature option.
* We don't use key_allocsa() for such lookups, because we don't know the SPI.
* Unlike the ESP and AH protocols, the SPI isn't transmitted in the TCP header
* of a signed packet. We use the SADB only as storage for the password.
* OUT: positive: corresponding SA for given saidx found.
* NULL: SA not found
*/
struct secasvar *
key_allocsa_tcpmd5(struct secasindex *saidx)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx->proto == IPPROTO_TCP,
("unexpected security protocol %u", saidx->proto));
IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TCPMD5,
("unexpected mode %u", saidx->mode));
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
KEYDBG(IPSEC_DUMP,
printf("%s: checking SAH\n", __func__);
kdebug_secash(sah, " "));
if (sah->saidx.proto != IPPROTO_TCP)
continue;
if (!key_sockaddrcmp(&saidx->dst.sa, &sah->saidx.dst.sa, 0) &&
!key_sockaddrcmp(&saidx->src.sa, &sah->saidx.src.sa, 0))
break;
}
if (sah != NULL) {
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL)
SAV_ADDREF(sav);
} else
sav = NULL;
SAHTREE_RUNLOCK();
if (sav != NULL) {
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
} else {
KEYDBG(IPSEC_STAMP,
printf("%s: SA not found\n", __func__));
KEYDBG(IPSEC_DATA, kdebug_secasindex(saidx, NULL));
}
return (sav);
}
/*
* Allocate an SA entry for an *OUTBOUND* packet.
* OUT: positive: corresponding SA for given saidx found.
* NULL: SA not found, but will be acquired, check *error
* for acquiring status.
*/
struct secasvar *
key_allocsa_policy(struct secpolicy *sp, const struct secasindex *saidx,
int *error)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT ||
saidx->mode == IPSEC_MODE_TUNNEL,
("unexpected policy %u", saidx->mode));
/*
* We check new SA in the IPsec request because a different
* SA may be involved each time this request is checked, either
* because new SAs are being configured, or this request is
* associated with an unconnected datagram socket, or this request
* is associated with a system default policy.
*/
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
KEYDBG(IPSEC_DUMP,
printf("%s: checking SAH\n", __func__);
kdebug_secash(sah, " "));
if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID))
break;
}
if (sah != NULL) {
/*
* Allocate the oldest SA available according to
* draft-jenkins-ipsec-rekeying-03.
*/
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL)
SAV_ADDREF(sav);
} else
sav = NULL;
SAHTREE_RUNLOCK();
if (sav != NULL) {
*error = 0;
KEYDBG(IPSEC_STAMP,
printf("%s: chosen SA(%p) for SP(%p)\n", __func__,
sav, sp));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
return (sav); /* return referenced SA */
}
/* there is no SA */
*error = key_acquire(saidx, sp);
if ((*error) != 0)
ipseclog((LOG_DEBUG,
"%s: error %d returned from key_acquire()\n",
__func__, *error));
KEYDBG(IPSEC_STAMP,
printf("%s: acquire SA for SP(%p), error %d\n",
__func__, sp, *error));
KEYDBG(IPSEC_DATA, kdebug_secasindex(saidx, NULL));
return (NULL);
}
/*
* Allocate a usable SA entry for an *INBOUND* packet.
* Must call key_freesav() later.
* OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state).
* NULL: not found, or error occurred.
*
* According to RFC 2401 an SA is uniquely identified by the triple of SPI,
* destination address, and security protocol. But according to RFC 4301,
* the SPI by itself suffices to specify an SA.
*
* Note, however, that we still need to keep the source address in the IPsec
* SA: both the IKE and PF_KEY specifications assume that we do, which puts
* us in a somewhat tricky situation.
*/
struct secasvar *
key_allocsa(union sockaddr_union *dst, uint8_t proto, uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
IPSEC_ASSERT(proto == IPPROTO_ESP || proto == IPPROTO_AH ||
proto == IPPROTO_IPCOMP, ("unexpected security protocol %u",
proto));
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi == spi)
break;
}
/*
* We use a single SPI namespace for all protocols, so it is
* impossible to have SPI duplicates in the SAVHASH.
*/
if (sav != NULL) {
if (sav->state != SADB_SASTATE_LARVAL &&
sav->sah->saidx.proto == proto &&
key_sockaddrcmp(&dst->sa,
&sav->sah->saidx.dst.sa, 0) == 0)
SAV_ADDREF(sav);
else
sav = NULL;
}
SAHTREE_RUNLOCK();
if (sav == NULL) {
KEYDBG(IPSEC_STAMP,
char buf[IPSEC_ADDRSTRLEN];
printf("%s: SA not found for spi %u proto %u dst %s\n",
__func__, ntohl(spi), proto, ipsec_address(dst, buf,
sizeof(buf))));
} else {
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
}
return (sav);
}
struct secasvar *
key_allocsa_tunnel(union sockaddr_union *src, union sockaddr_union *dst,
uint8_t proto)
{
SAHTREE_RLOCK_TRACKER;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(src != NULL, ("null src address"));
IPSEC_ASSERT(dst != NULL, ("null dst address"));
KEY_SETSECASIDX(proto, IPSEC_MODE_TUNNEL, 0, &src->sa,
&dst->sa, &saidx);
sav = NULL;
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(&saidx), addrhash) {
if (IPSEC_MODE_TUNNEL != sah->saidx.mode)
continue;
if (proto != sah->saidx.proto)
continue;
if (key_sockaddrcmp(&src->sa, &sah->saidx.src.sa, 0) != 0)
continue;
if (key_sockaddrcmp(&dst->sa, &sah->saidx.dst.sa, 0) != 0)
continue;
/* XXXAE: is key_preferred_oldsa reasonable here? */
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL) {
SAV_ADDREF(sav);
break;
}
}
SAHTREE_RUNLOCK();
KEYDBG(IPSEC_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
if (sav != NULL)
KEYDBG(IPSEC_DATA, kdebug_secasv(sav));
return (sav);
}
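/*
* Note: as with key_allocsa(), the SA returned by key_allocsa_tunnel() is
* referenced, and the caller is expected to release it with key_freesav().
*/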
/*
* Must be called after calling key_allocsp().
*/
void
key_freesp(struct secpolicy **spp)
{
struct secpolicy *sp = *spp;
IPSEC_ASSERT(sp != NULL, ("null sp"));
if (SP_DELREF(sp) == 0)
return;
KEYDBG(IPSEC_STAMP,
printf("%s: last reference to SP(%p)\n", __func__, sp));
KEYDBG(IPSEC_DATA, kdebug_secpolicy(sp));
*spp = NULL;
while (sp->tcount > 0)
ipsec_delisr(sp->req[--sp->tcount]);
free(sp, M_IPSEC_SP);
}
static void
key_unlink(struct secpolicy *sp)
{
IPSEC_ASSERT(sp->spidx.dir == IPSEC_DIR_INBOUND ||
sp->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", sp->spidx.dir));
SPTREE_UNLOCK_ASSERT();
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
SPTREE_WLOCK();
if (sp->state != IPSEC_SPSTATE_ALIVE) {
/* SP is already unlinked */
SPTREE_WUNLOCK();
return;
}
sp->state = IPSEC_SPSTATE_DEAD;
TAILQ_REMOVE(&V_sptree[sp->spidx.dir], sp, chain);
LIST_REMOVE(sp, idhash);
V_sp_genid++;
SPTREE_WUNLOCK();
key_freesp(&sp);
}
/*
* Insert a secpolicy into the SP database. Lower priority values come first.
*/
static void
key_insertsp(struct secpolicy *newsp)
{
struct secpolicy *sp;
SPTREE_WLOCK_ASSERT();
TAILQ_FOREACH(sp, &V_sptree[newsp->spidx.dir], chain) {
if (newsp->priority < sp->priority) {
TAILQ_INSERT_BEFORE(sp, newsp, chain);
goto done;
}
}
TAILQ_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, chain);
done:
LIST_INSERT_HEAD(SPHASH_HASH(newsp->id), newsp, idhash);
newsp->state = IPSEC_SPSTATE_ALIVE;
V_sp_genid++;
}
/*
* Insert a bunch of VTI secpolicies into the SPDB.
* We keep VTI policies in a separate list for the following reasons:
* 1) they should be immune to attempts by users or daemons to delete
* them; the only way to delete such policies is to destroy or
* unconfigure the corresponding virtual interface.
* 2) such policies have a traffic selector that matches all traffic per
* address family.
* Since all VTI policies have the same priority, we don't care about
* their order.
*/
int
key_register_ifnet(struct secpolicy **spp, u_int count)
{
struct mbuf *m;
u_int i;
SPTREE_WLOCK();
/*
* First of all, try to acquire an id for each SP.
*/
for (i = 0; i < count; i++) {
IPSEC_ASSERT(spp[i]->spidx.dir == IPSEC_DIR_INBOUND ||
spp[i]->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", spp[i]->spidx.dir));
if ((spp[i]->id = key_getnewspid()) == 0) {
SPTREE_WUNLOCK();
return (EAGAIN);
}
}
for (i = 0; i < count; i++) {
TAILQ_INSERT_TAIL(&V_sptree_ifnet[spp[i]->spidx.dir],
spp[i], chain);
/*
* NOTE: despite the fact that we keep VTI SP in the
* separate list, SPHASH contains policies from both
* sources. Thus SADB_X_SPDGET will correctly return
* SP by id, because it uses SPHASH for lookups.
*/
LIST_INSERT_HEAD(SPHASH_HASH(spp[i]->id), spp[i], idhash);
spp[i]->state = IPSEC_SPSTATE_IFNET;
}
SPTREE_WUNLOCK();
/*
* Notify user processes about new SP.
*/
for (i = 0; i < count; i++) {
m = key_setdumpsp(spp[i], SADB_X_SPDADD, 0, 0);
if (m != NULL)
key_sendup_mbuf(NULL, m, KEY_SENDUP_ALL);
}
return (0);
}
void
key_unregister_ifnet(struct secpolicy **spp, u_int count)
{
struct mbuf *m;
u_int i;
SPTREE_WLOCK();
for (i = 0; i < count; i++) {
IPSEC_ASSERT(spp[i]->spidx.dir == IPSEC_DIR_INBOUND ||
spp[i]->spidx.dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", spp[i]->spidx.dir));
if (spp[i]->state != IPSEC_SPSTATE_IFNET)
continue;
spp[i]->state = IPSEC_SPSTATE_DEAD;
TAILQ_REMOVE(&V_sptree_ifnet[spp[i]->spidx.dir],
spp[i], chain);
LIST_REMOVE(spp[i], idhash);
}
SPTREE_WUNLOCK();
for (i = 0; i < count; i++) {
m = key_setdumpsp(spp[i], SADB_X_SPDDELETE, 0, 0);
if (m != NULL)
key_sendup_mbuf(NULL, m, KEY_SENDUP_ALL);
}
}
/*
* Must be called after calling key_allocsa().
* This function is called by key_freesp() to free some SA allocated
* for a policy.
*/
void
key_freesav(struct secasvar **psav)
{
struct secasvar *sav = *psav;
IPSEC_ASSERT(sav != NULL, ("null sav"));
if (SAV_DELREF(sav) == 0)
return;
KEYDBG(IPSEC_STAMP,
printf("%s: last reference to SA(%p)\n", __func__, sav));
*psav = NULL;
key_delsav(sav);
}
/*
* Unlink SA from SAH and SPI hash under SAHTREE_WLOCK.
* Expect that SA has extra reference due to lookup.
* Release this reference, and also release the SAH reference after unlinking.
*/
static void
key_unlinksav(struct secasvar *sav)
{
struct secashead *sah;
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
SAHTREE_UNLOCK_ASSERT();
SAHTREE_WLOCK();
if (sav->state == SADB_SASTATE_DEAD) {
/* SA is already unlinked */
SAHTREE_WUNLOCK();
return;
}
/* Unlink from SAH */
if (sav->state == SADB_SASTATE_LARVAL)
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
else
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
/* Unlink from SPI hash */
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sah = sav->sah;
SAHTREE_WUNLOCK();
key_freesav(&sav);
/* Since we are unlinked, release reference to SAH */
key_freesah(&sah);
}
/* %%% SPD management */
/*
* search SPD
* OUT: NULL : not found
* others : found, pointer to a SP.
*/
static struct secpolicy *
key_getsp(struct secpolicyindex *spidx)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
SPTREE_RLOCK();
TAILQ_FOREACH(sp, &V_sptree[spidx->dir], chain) {
if (key_cmpspidx_exactly(spidx, &sp->spidx)) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
return sp;
}
/*
* get SP by id.
* OUT: NULL : not found
* others : found, pointer to referenced SP.
*/
static struct secpolicy *
key_getspbyid(uint32_t id)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
SPTREE_RLOCK();
LIST_FOREACH(sp, SPHASH_HASH(id), idhash) {
if (sp->id == id) {
SP_ADDREF(sp);
break;
}
}
SPTREE_RUNLOCK();
return (sp);
}
struct secpolicy *
key_newsp(void)
{
struct secpolicy *sp;
sp = malloc(sizeof(*sp), M_IPSEC_SP, M_NOWAIT | M_ZERO);
if (sp != NULL)
SP_INITREF(sp);
return (sp);
}
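/*
* The SP returned by key_newsp() carries a single reference; drop it with
* key_freesp() when it is no longer needed (see, for example, the error
* paths in key_msg2sp()).
*/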
struct ipsecrequest *
ipsec_newisr(void)
{
return (malloc(sizeof(struct ipsecrequest), M_IPSEC_SR,
M_NOWAIT | M_ZERO));
}
void
ipsec_delisr(struct ipsecrequest *p)
{
free(p, M_IPSEC_SR);
}
/*
* create secpolicy structure from sadb_x_policy structure.
* NOTE: `state', `secpolicyindex' and `id' in the secpolicy structure
* are not set here and must be set properly later.
*/
struct secpolicy *
key_msg2sp(struct sadb_x_policy *xpl0, size_t len, int *error)
{
struct secpolicy *newsp;
IPSEC_ASSERT(xpl0 != NULL, ("null xpl0"));
IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len));
if (len != PFKEY_EXTLEN(xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__));
*error = EINVAL;
return NULL;
}
if ((newsp = key_newsp()) == NULL) {
*error = ENOBUFS;
return NULL;
}
newsp->spidx.dir = xpl0->sadb_x_policy_dir;
newsp->policy = xpl0->sadb_x_policy_type;
newsp->priority = xpl0->sadb_x_policy_priority;
newsp->tcount = 0;
/* check policy */
switch (xpl0->sadb_x_policy_type) {
case IPSEC_POLICY_DISCARD:
case IPSEC_POLICY_NONE:
case IPSEC_POLICY_ENTRUST:
case IPSEC_POLICY_BYPASS:
break;
case IPSEC_POLICY_IPSEC:
{
struct sadb_x_ipsecrequest *xisr;
struct ipsecrequest *isr;
int tlen;
/* validity check */
if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0);
xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1);
while (tlen > 0) {
/* length check */
if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr) ||
xisr->sadb_x_ipsecrequest_len > tlen) {
ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest "
"length.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
if (newsp->tcount >= IPSEC_MAXREQ) {
ipseclog((LOG_DEBUG,
"%s: too many ipsecrequests.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
/* allocate request buffer */
/* NB: data structure is zero'd */
isr = ipsec_newisr();
if (isr == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
key_freesp(&newsp);
*error = ENOBUFS;
return NULL;
}
newsp->req[newsp->tcount++] = isr;
/* set values */
switch (xisr->sadb_x_ipsecrequest_proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
case IPPROTO_IPCOMP:
break;
default:
ipseclog((LOG_DEBUG,
"%s: invalid proto type=%u\n", __func__,
xisr->sadb_x_ipsecrequest_proto));
key_freesp(&newsp);
*error = EPROTONOSUPPORT;
return NULL;
}
isr->saidx.proto =
(uint8_t)xisr->sadb_x_ipsecrequest_proto;
switch (xisr->sadb_x_ipsecrequest_mode) {
case IPSEC_MODE_TRANSPORT:
case IPSEC_MODE_TUNNEL:
break;
case IPSEC_MODE_ANY:
default:
ipseclog((LOG_DEBUG,
"%s: invalid mode=%u\n", __func__,
xisr->sadb_x_ipsecrequest_mode));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
isr->saidx.mode = xisr->sadb_x_ipsecrequest_mode;
switch (xisr->sadb_x_ipsecrequest_level) {
case IPSEC_LEVEL_DEFAULT:
case IPSEC_LEVEL_USE:
case IPSEC_LEVEL_REQUIRE:
break;
case IPSEC_LEVEL_UNIQUE:
/* validity check */
/*
* If the reqid violates its allowed range, the kernel
* updates it rather than refusing the request.
*/
if (xisr->sadb_x_ipsecrequest_reqid
> IPSEC_MANUAL_REQID_MAX) {
ipseclog((LOG_DEBUG,
"%s: reqid=%d range "
"violation, updated by kernel.\n",
__func__,
xisr->sadb_x_ipsecrequest_reqid));
xisr->sadb_x_ipsecrequest_reqid = 0;
}
/* allocate a new reqid if reqid is zero. */
if (xisr->sadb_x_ipsecrequest_reqid == 0) {
u_int32_t reqid;
if ((reqid = key_newreqid()) == 0) {
key_freesp(&newsp);
*error = ENOBUFS;
return NULL;
}
isr->saidx.reqid = reqid;
xisr->sadb_x_ipsecrequest_reqid = reqid;
} else {
/* set it for manual keying. */
isr->saidx.reqid =
xisr->sadb_x_ipsecrequest_reqid;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid level=%u\n",
__func__,
xisr->sadb_x_ipsecrequest_level));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
isr->level = xisr->sadb_x_ipsecrequest_level;
/* set IP addresses if present */
if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) {
struct sockaddr *paddr;
len = tlen - sizeof(*xisr);
paddr = (struct sockaddr *)(xisr + 1);
/* validity check */
if (len < sizeof(struct sockaddr) ||
len < 2 * paddr->sa_len ||
paddr->sa_len > sizeof(isr->saidx.src)) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
/*
* The request length should be large enough to hold
* the source and destination addresses.
*/
if (xisr->sadb_x_ipsecrequest_len <
sizeof(*xisr) + 2 * paddr->sa_len) {
ipseclog((LOG_DEBUG, "%s: invalid "
"ipsecrequest length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
bcopy(paddr, &isr->saidx.src, paddr->sa_len);
paddr = (struct sockaddr *)((caddr_t)paddr +
paddr->sa_len);
/* validity check */
if (paddr->sa_len !=
isr->saidx.src.sa.sa_len) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
/* AF family should match */
if (paddr->sa_family !=
isr->saidx.src.sa.sa_family) {
ipseclog((LOG_DEBUG, "%s: address "
"family doesn't match.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
bcopy(paddr, &isr->saidx.dst, paddr->sa_len);
} else {
/*
* Addresses for TUNNEL mode requests are
* mandatory.
*/
if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
ipseclog((LOG_DEBUG, "%s: missing "
"request addresses.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
}
tlen -= xisr->sadb_x_ipsecrequest_len;
/* validity check */
if (tlen < 0) {
ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n",
__func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr
+ xisr->sadb_x_ipsecrequest_len);
}
/* XXXAE: LARVAL SP */
if (newsp->tcount < 1) {
ipseclog((LOG_DEBUG, "%s: valid IPSEC transforms "
"not found.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return (NULL);
}
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
key_freesp(&newsp);
*error = EINVAL;
return NULL;
}
*error = 0;
return (newsp);
}
uint32_t
key_newreqid(void)
{
static uint32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
if (auto_reqid == ~0)
auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
else
auto_reqid++;
/* XXX should check for uniqueness */
return (auto_reqid);
}
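/*
* Automatically assigned reqids live above IPSEC_MANUAL_REQID_MAX and wrap
* back to IPSEC_MANUAL_REQID_MAX + 1 on overflow; values up to
* IPSEC_MANUAL_REQID_MAX are reserved for manual keying (see the
* IPSEC_LEVEL_UNIQUE handling in key_msg2sp()).
*/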
/*
* Copy a secpolicy struct into the indicated sadb_x_policy structure.
*/
static struct mbuf *
key_sp2mbuf(struct secpolicy *sp)
{
struct mbuf *m;
size_t tlen;
tlen = key_getspreqmsglen(sp);
m = m_get2(tlen, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, tlen);
m->m_len = tlen;
if (key_sp2msg(sp, m->m_data, &tlen) != 0) {
m_freem(m);
return (NULL);
}
return (m);
}
int
key_sp2msg(struct secpolicy *sp, void *request, size_t *len)
{
struct sadb_x_ipsecrequest *xisr;
struct sadb_x_policy *xpl;
struct ipsecrequest *isr;
size_t xlen, ilen;
caddr_t p;
int error, i;
IPSEC_ASSERT(sp != NULL, ("null policy"));
xlen = sizeof(*xpl);
if (*len < xlen)
return (EINVAL);
error = 0;
bzero(request, *len);
xpl = (struct sadb_x_policy *)request;
xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
xpl->sadb_x_policy_type = sp->policy;
xpl->sadb_x_policy_dir = sp->spidx.dir;
xpl->sadb_x_policy_id = sp->id;
xpl->sadb_x_policy_priority = sp->priority;
switch (sp->state) {
case IPSEC_SPSTATE_IFNET:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_IFNET;
break;
case IPSEC_SPSTATE_PCB:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_PCB;
break;
default:
xpl->sadb_x_policy_scope = IPSEC_POLICYSCOPE_GLOBAL;
}
/* is this the policy for IPsec? */
if (sp->policy == IPSEC_POLICY_IPSEC) {
p = (caddr_t)xpl + sizeof(*xpl);
for (i = 0; i < sp->tcount; i++) {
isr = sp->req[i];
ilen = PFKEY_ALIGN8(sizeof(*xisr) +
isr->saidx.src.sa.sa_len +
isr->saidx.dst.sa.sa_len);
xlen += ilen;
if (xlen > *len) {
error = ENOBUFS;
/* Calculate needed size */
continue;
}
xisr = (struct sadb_x_ipsecrequest *)p;
xisr->sadb_x_ipsecrequest_len = ilen;
xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto;
xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode;
xisr->sadb_x_ipsecrequest_level = isr->level;
xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid;
p += sizeof(*xisr);
bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len);
p += isr->saidx.src.sa.sa_len;
bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len);
p += isr->saidx.dst.sa.sa_len;
}
}
xpl->sadb_x_policy_len = PFKEY_UNIT64(xlen);
if (error == 0)
*len = xlen;
else
*len = sizeof(*xpl);
return (error);
}
/* m will not be freed nor modified */
static struct mbuf *
key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp,
int ndeep, int nitem, ...)
{
va_list ap;
int idx;
int i;
struct mbuf *result = NULL, *n;
int len;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
va_start(ap, nitem);
for (i = 0; i < nitem; i++) {
idx = va_arg(ap, int);
if (idx < 0 || idx > SADB_EXT_MAX)
goto fail;
/* don't attempt to pull empty extension */
if (idx == SADB_EXT_RESERVED && mhp->msg == NULL)
continue;
if (idx != SADB_EXT_RESERVED &&
(mhp->ext[idx] == NULL || mhp->extlen[idx] == 0))
continue;
if (idx == SADB_EXT_RESERVED) {
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (!n)
goto fail;
n->m_len = len;
n->m_next = NULL;
m_copydata(m, 0, sizeof(struct sadb_msg),
mtod(n, caddr_t));
} else if (i < ndeep) {
len = mhp->extlen[idx];
n = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (n == NULL)
goto fail;
m_align(n, len);
n->m_len = len;
m_copydata(m, mhp->extoff[idx], mhp->extlen[idx],
mtod(n, caddr_t));
} else {
n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx],
M_NOWAIT);
}
if (n == NULL)
goto fail;
if (result)
m_cat(result, n);
else
result = n;
}
va_end(ap);
if ((result->m_flags & M_PKTHDR) != 0) {
result->m_pkthdr.len = 0;
for (n = result; n; n = n->m_next)
result->m_pkthdr.len += n->m_len;
}
return result;
fail:
m_freem(result);
va_end(ap);
return NULL;
}
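/*
* The variadic arguments to key_gather_mbuf() are SADB extension indexes;
* SADB_EXT_RESERVED stands for the base sadb_msg header. The first `ndeep'
* extensions are deep-copied into freshly allocated mbufs, the rest are
* taken with m_copym() and may share storage with the original chain.
* Example from key_spddelete():
* n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
*     SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
*/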
/*
* SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing
* Add an entry to the SP database when
* <base, address(SD), (lifetime(H),) policy>
* is received from the user(?), and send
* <base, address(SD), (lifetime(H),) policy>
* back to the socket which sent it.
*
* SPDADD sets a unique policy entry.
* SPDSETIDX is like SPDADD without the policy request part.
* SPDUPDATE replaces a unique policy entry.
*
* XXXAE: serialize this in PF_KEY to avoid races.
* m will always be freed.
*/
static int
key_spdadd(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicyindex spidx;
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0, *xpl;
struct sadb_lifetime *lft = NULL;
struct secpolicy *newsp;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_HARD)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* check the direction */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid SP direction.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* key_spdadd() accepts DISCARD, NONE and IPSEC. */
if (xpl0->sadb_x_policy_type != IPSEC_POLICY_DISCARD &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_NONE &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_IPSEC) {
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* policy requests are mandatory when action is ipsec. */
if (xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) {
ipseclog((LOG_DEBUG,
"%s: policy requests required.\n", __func__));
return key_senderror(so, m, EINVAL);
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0 ||
src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
/* make secindex */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* Checking there is SP already or not. */
newsp = key_getsp(&spidx);
if (newsp != NULL) {
if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
KEYDBG(KEY_STAMP,
printf("%s: unlink SP(%p) for SPDUPDATE\n",
__func__, newsp));
KEYDBG(KEY_DATA, kdebug_secpolicy(newsp));
key_unlink(newsp);
key_freesp(&newsp);
} else {
key_freesp(&newsp);
ipseclog((LOG_DEBUG, "%s: a SP entry exists already.",
__func__));
return (key_senderror(so, m, EEXIST));
}
}
/* allocate new SP entry */
if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) {
return key_senderror(so, m, error);
}
newsp->lastused = newsp->created = time_second;
newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0;
newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0;
bcopy(&spidx, &newsp->spidx, sizeof(spidx));
/* XXXAE: there is race between key_getsp() and key_insertsp() */
SPTREE_WLOCK();
if ((newsp->id = key_getnewspid()) == 0) {
SPTREE_WUNLOCK();
key_freesp(&newsp);
return key_senderror(so, m, ENOBUFS);
}
key_insertsp(newsp);
SPTREE_WUNLOCK();
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, newsp));
KEYDBG(KEY_DATA, kdebug_secpolicy(newsp));
{
struct mbuf *n, *mpolicy;
struct sadb_msg *newmsg;
int off;
/* create new sadb_msg to reply. */
if (lft) {
n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
} else {
n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
}
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(*newmsg)) {
n = m_pullup(n, sizeof(*newmsg));
if (!n)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
off = 0;
mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)),
sizeof(*xpl), &off);
if (mpolicy == NULL) {
/* n is already freed */
return key_senderror(so, m, ENOBUFS);
}
xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off);
if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) {
m_freem(n);
return key_senderror(so, m, EINVAL);
}
xpl->sadb_x_policy_id = newsp->id;
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* get new policy id.
* OUT:
* 0: failure.
* others: success.
*/
static uint32_t
key_getnewspid(void)
{
struct secpolicy *sp;
uint32_t newid = 0;
int count = V_key_spi_trycnt; /* XXX */
SPTREE_WLOCK_ASSERT();
while (count--) {
if (V_policy_id == ~0) /* overflowed */
newid = V_policy_id = 1;
else
newid = ++V_policy_id;
LIST_FOREACH(sp, SPHASH_HASH(newid), idhash) {
if (sp->id == newid)
break;
}
if (sp == NULL)
break;
}
if (count == 0 || newid == 0) {
ipseclog((LOG_DEBUG, "%s: failed to allocate policy id.\n",
__func__));
return (0);
}
return (newid);
}
/*
* SADB_SPDDELETE processing
* receive
* <base, address(SD), policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, address(SD), policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp)
{
struct secpolicyindex spidx;
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0;
struct secpolicy *sp;
IPSEC_ASSERT(so != NULL, ("null so"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* check the direction */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid SP direction.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* Only DISCARD, NONE and IPSEC are allowed */
if (xpl0->sadb_x_policy_type != IPSEC_POLICY_DISCARD &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_NONE &&
xpl0->sadb_x_policy_type != IPSEC_POLICY_IPSEC) {
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0 ||
src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* make secindex */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* Is there SP in SPD ? */
if ((sp = key_getsp(&spidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* save policy id to buffer to be returned. */
xpl0->sadb_x_policy_id = sp->id;
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
key_unlink(sp);
key_freesp(&sp);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_SPDDELETE2 processing
* receive
* <base, policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete2(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp)
{
struct secpolicy *sp;
uint32_t id;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)
mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found for id %u.\n",
__func__, id));
return key_senderror(so, m, EINVAL);
}
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
key_unlink(sp);
if (sp->state != IPSEC_SPSTATE_DEAD) {
ipseclog((LOG_DEBUG, "%s: failed to delete SP with id %u.\n",
__func__, id));
key_freesp(&sp);
return (key_senderror(so, m, EACCES));
}
key_freesp(&sp);
{
struct mbuf *n, *nn;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)",
off, len));
n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY],
mhp->extlen[SADB_X_EXT_POLICY], M_NOWAIT);
if (!n->m_next) {
m_freem(n);
return key_senderror(so, m, ENOBUFS);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_X_SPDGET processing
* receive
* <base, policy(*)>
* from the user(?),
* and send,
* <base, address(SD), policy>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spdget(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicy *sp;
struct mbuf *n;
uint32_t id;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_X_EXT_POLICY) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_POLICY)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)
mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found for id %u.\n",
__func__, id));
return key_senderror(so, m, ENOENT);
}
n = key_setdumpsp(sp, SADB_X_SPDGET, mhp->msg->sadb_msg_seq,
mhp->msg->sadb_msg_pid);
key_freesp(&sp);
if (n != NULL) {
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
} else
return key_senderror(so, m, ENOBUFS);
}
/*
* SADB_X_SPDACQUIRE processing.
* Acquire policy and SA(s) for an *OUTBOUND* packet.
* send
* <base, policy(*)>
* to KMD, and expect to receive
* <base> with SADB_X_SPDACQUIRE if error occurred,
* or
* <base, policy>
* with SADB_X_SPDUPDATE from KMD by PF_KEY.
* policy(*) is without policy requests.
*
* 0 : succeed
* others: error number
*/
int
key_spdacquire(struct secpolicy *sp)
{
struct mbuf *result = NULL, *m;
struct secspacq *newspacq;
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
IPSEC_ASSERT(sp->req == NULL, ("policy exists"));
IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
("policy not IPSEC %u", sp->policy));
/* Get an entry to check whether we have already sent a message. */
newspacq = key_getspacq(&sp->spidx);
if (newspacq != NULL) {
if (V_key_blockacq_count < newspacq->count) {
/* reset counter and do send message. */
newspacq->count = 0;
} else {
/* increment counter and do nothing. */
newspacq->count++;
SPACQ_UNLOCK();
return (0);
}
SPACQ_UNLOCK();
} else {
/* make new entry for blocking to send SADB_ACQUIRE. */
newspacq = key_newspacq(&sp->spidx);
if (newspacq == NULL)
return ENOBUFS;
}
/* create new sadb_msg to reply. */
m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0);
if (!m)
return ENOBUFS;
result = m;
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
}
/*
* SADB_SPDFLUSH processing
* receive
* <base>
* from the user, and free all entries in secpctree.
* and send,
* <base>
* to the user.
* NOTE: all this does is mark the entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_spdflush(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secpolicy_queue drainq;
struct sadb_msg *newmsg;
struct secpolicy *sp, *nextsp;
u_int dir;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg)))
return key_senderror(so, m, EINVAL);
TAILQ_INIT(&drainq);
SPTREE_WLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
TAILQ_CONCAT(&drainq, &V_sptree[dir], chain);
}
/*
* We need to set the state to DEAD for each policy to be sure
* that another thread won't try to unlink it.
* Also remove SP from sphash.
*/
TAILQ_FOREACH(sp, &drainq, chain) {
sp->state = IPSEC_SPSTATE_DEAD;
LIST_REMOVE(sp, idhash);
}
V_sp_genid++;
SPTREE_WUNLOCK();
sp = TAILQ_FIRST(&drainq);
while (sp != NULL) {
nextsp = TAILQ_NEXT(sp, chain);
key_freesp(&sp);
sp = nextsp;
}
if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
static uint8_t
key_satype2scopemask(uint8_t satype)
{
if (satype == IPSEC_POLICYSCOPE_ANY)
return (0xff);
return (satype);
}
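/*
* For SADB_SPDDUMP the sadb_msg_satype field is treated as a mask of policy
* scopes; IPSEC_POLICYSCOPE_ANY selects every scope, so key_spddump() walks
* both the global SPD and the ifnet (VTI) policies in that case.
*/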
/*
* SADB_SPDDUMP processing
* receive
* <base>
* from the user, and dump all SP leaves and send,
* <base> .....
* to the ikmpd.
*
* NOTE:
* sadb_msg_satype is treated as a mask of policy scopes.
* m will always be freed.
*/
static int
key_spddump(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy *sp;
struct mbuf *n;
int cnt;
u_int dir, scope;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* search SPD entry and get buffer size. */
cnt = 0;
scope = key_satype2scopemask(mhp->msg->sadb_msg_satype);
SPTREE_RLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
if (scope & IPSEC_POLICYSCOPE_GLOBAL) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain)
cnt++;
}
if (scope & IPSEC_POLICYSCOPE_IFNET) {
TAILQ_FOREACH(sp, &V_sptree_ifnet[dir], chain)
cnt++;
}
}
if (cnt == 0) {
SPTREE_RUNLOCK();
return key_senderror(so, m, ENOENT);
}
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
if (scope & IPSEC_POLICYSCOPE_GLOBAL) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
--cnt;
n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
mhp->msg->sadb_msg_pid);
if (n != NULL)
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
if (scope & IPSEC_POLICYSCOPE_IFNET) {
TAILQ_FOREACH(sp, &V_sptree_ifnet[dir], chain) {
--cnt;
n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
mhp->msg->sadb_msg_pid);
if (n != NULL)
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
}
SPTREE_RUNLOCK();
m_freem(m);
return (0);
}
static struct mbuf *
key_setdumpsp(struct secpolicy *sp, u_int8_t type, u_int32_t seq,
u_int32_t pid)
{
struct mbuf *result = NULL, *m;
struct seclifetime lt;
m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt);
if (!m)
goto fail;
result = m;
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa, sp->spidx.prefs,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa, sp->spidx.prefd,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_sp2mbuf(sp);
if (!m)
goto fail;
m_cat(result, m);
if (sp->lifetime) {
lt.addtime = sp->created;
lt.usetime = sp->lastused;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
m_cat(result, m);
lt.addtime = sp->lifetime;
lt.usetime = sp->validtime;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
m_cat(result, m);
}
if ((result->m_flags & M_PKTHDR) == 0)
goto fail;
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
return NULL;
}
/*
* get PFKEY message length for security policy and request.
*/
static size_t
key_getspreqmsglen(struct secpolicy *sp)
{
size_t tlen, len;
int i;
tlen = sizeof(struct sadb_x_policy);
/* is this the policy for IPsec? */
if (sp->policy != IPSEC_POLICY_IPSEC)
return (tlen);
/* get length of ipsec requests */
for (i = 0; i < sp->tcount; i++) {
len = sizeof(struct sadb_x_ipsecrequest)
+ sp->req[i]->saidx.src.sa.sa_len
+ sp->req[i]->saidx.dst.sa.sa_len;
tlen += PFKEY_ALIGN8(len);
}
return (tlen);
}
/*
* SADB_SPDEXPIRE processing
* send
* <base, address(SD), lifetime(CH), policy>
* to KMD by PF_KEY.
*
* OUT: 0 : succeed
* others : error number
*/
static int
key_spdexpire(struct secpolicy *sp)
{
struct sadb_lifetime *lt;
struct mbuf *result = NULL, *m;
int len, error = -1;
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secpolicy(sp));
/* set msg header */
m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create lifetime extension (current and hard) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL) {
error = ENOBUFS;
goto fail;
}
m_align(m, len);
m->m_len = len;
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->created;
lt->sadb_lifetime_usetime = sp->lastused;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->lifetime;
lt->sadb_lifetime_usetime = sp->validtime;
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa,
sp->spidx.prefs, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa,
sp->spidx.prefd, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set secpolicy */
m = key_sp2mbuf(sp);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
/* %%% SAD management */
/*
* Allocate and initialize a new SA head.
* OUT: NULL : failure due to lack of memory.
* others : pointer to new SA head.
*/
static struct secashead *
key_newsah(struct secasindex *saidx)
{
struct secashead *sah;
sah = malloc(sizeof(struct secashead), M_IPSEC_SAH,
M_NOWAIT | M_ZERO);
if (sah == NULL) {
PFKEYSTAT_INC(in_nomem);
return (NULL);
}
TAILQ_INIT(&sah->savtree_larval);
TAILQ_INIT(&sah->savtree_alive);
sah->saidx = *saidx;
sah->state = SADB_SASTATE_DEAD;
SAH_INITREF(sah);
KEYDBG(KEY_STAMP,
printf("%s: SAH(%p)\n", __func__, sah));
KEYDBG(KEY_DATA, kdebug_secash(sah, NULL));
return (sah);
}
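/*
* A freshly created SAH starts in the DEAD state; key_newsav() switches it
* to MATURE once the head has been linked into the SAD and the address hash
* under the SAHTREE write lock.
*/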
static void
key_freesah(struct secashead **psah)
{
struct secashead *sah = *psah;
if (SAH_DELREF(sah) == 0)
return;
KEYDBG(KEY_STAMP,
printf("%s: last reference to SAH(%p)\n", __func__, sah));
KEYDBG(KEY_DATA, kdebug_secash(sah, NULL));
*psah = NULL;
key_delsah(sah);
}
static void
key_delsah(struct secashead *sah)
{
IPSEC_ASSERT(sah != NULL, ("NULL sah"));
IPSEC_ASSERT(sah->state == SADB_SASTATE_DEAD,
("Attempt to free non DEAD SAH %p", sah));
IPSEC_ASSERT(TAILQ_EMPTY(&sah->savtree_larval),
("Attempt to free SAH %p with LARVAL SA", sah));
IPSEC_ASSERT(TAILQ_EMPTY(&sah->savtree_alive),
("Attempt to free SAH %p with ALIVE SA", sah));
free(sah, M_IPSEC_SAH);
}
/*
* Allocate a new SA for a key_add() or key_getspi() call,
* and copy the values of mhp into the new buffer.
* When the SADB message type is SADB_GETSPI, set the SA state to LARVAL.
* For SADB_ADD, create and initialize the SA in the MATURE state.
* OUT: NULL : fail
* others : pointer to new secasvar.
*/
static struct secasvar *
key_newsav(const struct sadb_msghdr *mhp, struct secasindex *saidx,
uint32_t spi, int *errp)
{
struct secashead *sah;
struct secasvar *sav;
int isnew;
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
IPSEC_ASSERT(mhp->msg->sadb_msg_type == SADB_GETSPI ||
mhp->msg->sadb_msg_type == SADB_ADD, ("wrong message type"));
sav = NULL;
sah = NULL;
/* check SPI value */
switch (saidx->proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
/*
* RFC 4302, 2.4. Security Parameters Index (SPI): SPI values
* 1-255 are reserved by IANA for future use and the value 0 is
* reserved for implementation-specific, local use.
*/
if (ntohl(spi) <= 255) {
ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n",
__func__, ntohl(spi)));
*errp = EINVAL;
goto done;
}
break;
}
sav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT | M_ZERO);
if (sav == NULL) {
*errp = ENOBUFS;
goto done;
}
sav->lock = malloc(sizeof(struct mtx), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->lock == NULL) {
*errp = ENOBUFS;
goto done;
}
mtx_init(sav->lock, "ipsec association", NULL, MTX_DEF);
sav->lft_c = uma_zalloc(V_key_lft_zone, M_NOWAIT);
if (sav->lft_c == NULL) {
*errp = ENOBUFS;
goto done;
}
counter_u64_zero(sav->lft_c_allocations);
counter_u64_zero(sav->lft_c_bytes);
sav->spi = spi;
sav->seq = mhp->msg->sadb_msg_seq;
sav->state = SADB_SASTATE_LARVAL;
sav->pid = (pid_t)mhp->msg->sadb_msg_pid;
SAV_INITREF(sav);
again:
sah = key_getsah(saidx);
if (sah == NULL) {
/* create a new SA index */
sah = key_newsah(saidx);
if (sah == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
*errp = ENOBUFS;
goto done;
}
isnew = 1;
} else
isnew = 0;
sav->sah = sah;
if (mhp->msg->sadb_msg_type == SADB_GETSPI) {
sav->created = time_second;
} else if (sav->state == SADB_SASTATE_LARVAL) {
/*
* Do not call key_setsaval() second time in case
* of `goto again`. We will have MATURE state.
*/
*errp = key_setsaval(sav, mhp);
if (*errp != 0)
goto done;
sav->state = SADB_SASTATE_MATURE;
}
SAHTREE_WLOCK();
/*
* Check that existing SAH wasn't unlinked.
* Since we didn't hold the SAHTREE lock, it is possible,
* that callout handler or key_flush() or key_delete() could
* unlink this SAH.
*/
if (isnew == 0 && sah->state == SADB_SASTATE_DEAD) {
SAHTREE_WUNLOCK();
key_freesah(&sah); /* reference from key_getsah() */
goto again;
}
if (isnew != 0) {
/*
* Add new SAH into SADB.
*
* XXXAE: we can serialize key_add and key_getspi calls, so
* several threads will not race here.
* Otherwise we should check under the SAHTREE lock that this
* SAH is not added twice.
*/
TAILQ_INSERT_HEAD(&V_sahtree, sah, chain);
/* Add new SAH into hash by addresses */
LIST_INSERT_HEAD(SAHADDRHASH_HASH(saidx), sah, addrhash);
/* Now we are linked in the chain */
sah->state = SADB_SASTATE_MATURE;
/*
* SAV references this new SAH.
* In case of existing SAH we reuse reference
* from key_getsah().
*/
SAH_ADDREF(sah);
}
/* Link SAV with SAH */
if (sav->state == SADB_SASTATE_MATURE)
TAILQ_INSERT_HEAD(&sah->savtree_alive, sav, chain);
else
TAILQ_INSERT_HEAD(&sah->savtree_larval, sav, chain);
/* Add SAV into SPI hash */
LIST_INSERT_HEAD(SAVHASH_HASH(sav->spi), sav, spihash);
SAHTREE_WUNLOCK();
*errp = 0; /* success */
done:
if (*errp != 0) {
if (sav != NULL) {
if (sav->lock != NULL) {
mtx_destroy(sav->lock);
free(sav->lock, M_IPSEC_MISC);
}
if (sav->lft_c != NULL)
uma_zfree(V_key_lft_zone, sav->lft_c);
free(sav, M_IPSEC_SA);
sav = NULL;
}
if (sah != NULL)
key_freesah(&sah);
if (*errp == ENOBUFS) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
}
}
return (sav);
}
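/*
* Error handling above: on any failure key_newsav() releases the
* partially constructed SA (its mutex and lifetime counters) and the
* SAH reference it may hold, stores the reason in *errp and returns
* NULL.  On success the new SA has already been linked into the SAH
* queues and the SPI hash under the SAHTREE write lock.
*/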
/*
* Release the resources held by an SA entry (NAT-T config, keys,
* replay state and lifetimes); the secasvar itself is not freed here.
*/
static void
key_cleansav(struct secasvar *sav)
{
if (sav->natt != NULL) {
free(sav->natt, M_IPSEC_MISC);
sav->natt = NULL;
}
if (sav->flags & SADB_X_EXT_F_CLONED)
return;
/*
* Clean up xform state. Note that zeroizing causes the
* keys to be cleared; otherwise we must do it ourselves.
*/
if (sav->tdb_xform != NULL) {
sav->tdb_xform->xf_zeroize(sav);
sav->tdb_xform = NULL;
} else {
if (sav->key_auth != NULL)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
if (sav->key_enc != NULL)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
}
if (sav->key_auth != NULL) {
if (sav->key_auth->key_data != NULL)
free(sav->key_auth->key_data, M_IPSEC_MISC);
free(sav->key_auth, M_IPSEC_MISC);
sav->key_auth = NULL;
}
if (sav->key_enc != NULL) {
if (sav->key_enc->key_data != NULL)
free(sav->key_enc->key_data, M_IPSEC_MISC);
free(sav->key_enc, M_IPSEC_MISC);
sav->key_enc = NULL;
}
if (sav->replay != NULL) {
if (sav->replay->bitmap != NULL)
free(sav->replay->bitmap, M_IPSEC_MISC);
free(sav->replay, M_IPSEC_MISC);
sav->replay = NULL;
}
if (sav->lft_h != NULL) {
free(sav->lft_h, M_IPSEC_MISC);
sav->lft_h = NULL;
}
if (sav->lft_s != NULL) {
free(sav->lft_s, M_IPSEC_MISC);
sav->lft_s = NULL;
}
}
/*
* free() an SA entry; it must be DEAD and unreferenced.
*/
static void
key_delsav(struct secasvar *sav)
{
IPSEC_ASSERT(sav != NULL, ("null sav"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_DEAD,
("attempt to free non DEAD SA %p", sav));
IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0",
sav->refcnt));
/*
* SA must be unlinked from the chain and hashtbl.
* If SA was cloned, we leave all fields untouched,
* except NAT-T config.
*/
key_cleansav(sav);
if ((sav->flags & SADB_X_EXT_F_CLONED) == 0) {
mtx_destroy(sav->lock);
free(sav->lock, M_IPSEC_MISC);
uma_zfree(V_key_lft_zone, sav->lft_c);
}
free(sav, M_IPSEC_SA);
}
/*
* Search for an SAH that matches the given saidx.
* OUT:
* NULL : not found
* others : found, referenced pointer to a SAH.
*/
static struct secashead *
key_getsah(struct secasindex *saidx)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID) != 0) {
SAH_ADDREF(sah);
break;
}
}
SAHTREE_RUNLOCK();
return (sah);
}
/*
* Check whether the given SPI is already in use.
* OUT:
* 0 : not found
* 1 : found SA with given SPI.
*/
static int
key_checkspidup(uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
/* Assume SPI is in network byte order */
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi == spi)
break;
}
SAHTREE_RUNLOCK();
return (sav != NULL);
}
/*
* Search SA by SPI.
* OUT:
* NULL : not found
* others : found, referenced pointer to a SA.
*/
static struct secasvar *
key_getsavbyspi(uint32_t spi)
{
SAHTREE_RLOCK_TRACKER;
struct secasvar *sav;
/* Assume SPI is in network byte order */
SAHTREE_RLOCK();
LIST_FOREACH(sav, SAVHASH_HASH(spi), spihash) {
if (sav->spi != spi)
continue;
SAV_ADDREF(sav);
break;
}
SAHTREE_RUNLOCK();
return (sav);
}
static int
key_updatelifetimes(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
struct seclifetime *lft_h, *lft_s, *tmp;
/* Lifetime extension is optional, check that it is present. */
if (SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) {
/*
* In case of SADB_UPDATE we may need to change
* existing lifetimes.
*/
if (sav->state == SADB_SASTATE_MATURE) {
lft_h = lft_s = NULL;
goto reset;
}
return (0);
}
/* Both HARD and SOFT extensions must be present */
if ((SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return (EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_HARD) ||
SADB_CHECKLEN(mhp, SADB_EXT_LIFETIME_SOFT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return (EINVAL);
}
lft_h = key_dup_lifemsg((const struct sadb_lifetime *)
mhp->ext[SADB_EXT_LIFETIME_HARD], M_IPSEC_MISC);
if (lft_h == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
lft_s = key_dup_lifemsg((const struct sadb_lifetime *)
mhp->ext[SADB_EXT_LIFETIME_SOFT], M_IPSEC_MISC);
if (lft_s == NULL) {
PFKEYSTAT_INC(in_nomem);
free(lft_h, M_IPSEC_MISC);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
reset:
if (sav->state != SADB_SASTATE_LARVAL) {
/*
* key_update() holds reference to this SA,
* so it won't be deleted in meanwhile.
*/
SECASVAR_LOCK(sav);
tmp = sav->lft_h;
sav->lft_h = lft_h;
lft_h = tmp;
tmp = sav->lft_s;
sav->lft_s = lft_s;
lft_s = tmp;
SECASVAR_UNLOCK(sav);
if (lft_h != NULL)
free(lft_h, M_IPSEC_MISC);
if (lft_s != NULL)
free(lft_s, M_IPSEC_MISC);
return (0);
}
/* We can update lifetime without holding a lock */
IPSEC_ASSERT(sav->lft_h == NULL, ("lft_h is already initialized\n"));
IPSEC_ASSERT(sav->lft_s == NULL, ("lft_s is already initialized\n"));
sav->lft_h = lft_h;
sav->lft_s = lft_s;
return (0);
}
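/*
* Note on the update path above: for MATURE/DYING SAs the new HARD
* and SOFT lifetimes are swapped in under SECASVAR_LOCK and the old
* structures are freed only after the lock is dropped; for LARVAL
* SAs the lifetime pointers are still expected to be NULL, so they
* are set without locking.
*/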
/*
* Copy SA values from a PF_KEY message, except *SPI, SEQ, PID and TYPE*.
* The caller must update these if needed. Expects only LARVAL SAs.
* OUT: 0: success.
* !0: failure.
*/
static int
key_setsaval(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
const struct sadb_sa *sa0;
const struct sadb_key *key0;
uint32_t replay;
size_t len;
int error;
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_LARVAL,
("Attempt to update non LARVAL SA"));
/* XXX rewrite */
error = key_setident(sav->sah, mhp);
if (error != 0)
goto fail;
/* SA */
if (!SADB_CHECKHDR(mhp, SADB_EXT_SA)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_SA)) {
error = EINVAL;
goto fail;
}
sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
sav->alg_auth = sa0->sadb_sa_auth;
sav->alg_enc = sa0->sadb_sa_encrypt;
sav->flags = sa0->sadb_sa_flags;
if ((sav->flags & SADB_KEY_FLAGS_MAX) != sav->flags) {
ipseclog((LOG_DEBUG,
"%s: invalid sa_flags 0x%08x.\n", __func__,
sav->flags));
error = EINVAL;
goto fail;
}
/* Optional replay window */
replay = 0;
if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0)
replay = sa0->sadb_sa_replay;
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_SA_REPLAY)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA_REPLAY)) {
error = EINVAL;
goto fail;
}
replay = ((const struct sadb_x_sa_replay *)
mhp->ext[SADB_X_EXT_SA_REPLAY])->sadb_x_sa_replay_replay;
if (replay > UINT32_MAX - 32) {
ipseclog((LOG_DEBUG,
"%s: replay window too big.\n", __func__));
error = EINVAL;
goto fail;
}
replay = (replay + 7) >> 3;
}
sav->replay = malloc(sizeof(struct secreplay), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->replay == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
if (replay != 0) {
/* number of 32b blocks to be allocated */
uint32_t bitmap_size;
/* RFC 6479:
* - the allocated replay window size must be
* a power of two.
* - use an extra 32b block as a redundant window.
*/
bitmap_size = 1;
while (replay + 4 > bitmap_size)
bitmap_size <<= 1;
bitmap_size = bitmap_size / 4;
sav->replay->bitmap = malloc(
bitmap_size * sizeof(uint32_t), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->replay->bitmap == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
sav->replay->bitmap_size = bitmap_size;
sav->replay->wsize = replay;
}
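/*
* Worked example (illustrative only, numbers not taken from the
* code above): a peer requesting a 128-bit replay window sends
* sadb_x_sa_replay_replay = 128, so replay = (128 + 7) >> 3 = 16
* bytes.  The loop grows bitmap_size from 1 until it reaches at
* least replay + 4 = 20, i.e. to 32, and the division by 4 yields
* 8 32-bit words.  The bitmap therefore covers 256 bits, a power
* of two plus the extra redundant block required by RFC 6479,
* while sav->replay->wsize records the window size in bytes (16).
*/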
}
/* Authentication keys */
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_KEY_AUTH)) {
error = EINVAL;
goto fail;
}
error = 0;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH];
len = mhp->extlen[SADB_EXT_KEY_AUTH];
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_TCPSIGNATURE:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_auth != SADB_X_AALG_NULL)
error = EINVAL;
break;
case SADB_X_SATYPE_IPCOMP:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n",
__func__));
goto fail;
}
sav->key_auth = key_dup_keymsg(key0, len, M_IPSEC_MISC);
if (sav->key_auth == NULL ) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
error = ENOBUFS;
goto fail;
}
}
/* Encryption key */
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT)) {
if (SADB_CHECKLEN(mhp, SADB_EXT_KEY_ENCRYPT)) {
error = EINVAL;
goto fail;
}
error = 0;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT];
len = mhp->extlen[SADB_EXT_KEY_ENCRYPT];
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_ESP:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_enc != SADB_EALG_NULL) {
error = EINVAL;
break;
}
sav->key_enc = key_dup_keymsg(key0, len, M_IPSEC_MISC);
if (sav->key_enc == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
PFKEYSTAT_INC(in_nomem);
error = ENOBUFS;
goto fail;
}
break;
case SADB_X_SATYPE_IPCOMP:
if (len != PFKEY_ALIGN8(sizeof(struct sadb_key)))
error = EINVAL;
sav->key_enc = NULL; /*just in case*/
break;
case SADB_SATYPE_AH:
case SADB_X_SATYPE_TCPSIGNATURE:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n",
__func__));
goto fail;
}
}
/* set iv */
sav->ivlen = 0;
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
if (sav->flags & SADB_X_EXT_DERIV) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to AH SA.\n", __func__));
error = EINVAL;
goto fail;
}
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_AH);
break;
case SADB_SATYPE_ESP:
if ((sav->flags & (SADB_X_EXT_OLD | SADB_X_EXT_DERIV)) ==
(SADB_X_EXT_OLD | SADB_X_EXT_DERIV)) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to old-esp.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_ESP);
break;
case SADB_X_SATYPE_IPCOMP:
if (sav->alg_auth != SADB_AALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 &&
ntohl(sav->spi) >= 0x10000) {
ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n",
__func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_IPCOMP);
break;
case SADB_X_SATYPE_TCPSIGNATURE:
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
error = EINVAL;
goto fail;
}
error = xform_init(sav, XF_TCPSIGNATURE);
break;
default:
ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__));
error = EPROTONOSUPPORT;
goto fail;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n",
__func__, mhp->msg->sadb_msg_satype));
goto fail;
}
/* Handle NAT-T headers */
error = key_setnatt(sav, mhp);
if (error != 0)
goto fail;
/* Initialize lifetime for CURRENT */
sav->firstused = 0;
sav->created = time_second;
/* lifetimes for HARD and SOFT */
error = key_updatelifetimes(sav, mhp);
if (error == 0)
return (0);
fail:
key_cleansav(sav);
return (error);
}
/*
* subroutine for SADB_GET and SADB_DUMP.
*/
static struct mbuf *
key_setdumpsa(struct secasvar *sav, uint8_t type, uint8_t satype,
uint32_t seq, uint32_t pid)
{
struct seclifetime lft_c;
struct mbuf *result = NULL, *tres = NULL, *m;
int i, dumporder[] = {
SADB_EXT_SA, SADB_X_EXT_SA2, SADB_X_EXT_SA_REPLAY,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY,
SADB_EXT_KEY_AUTH, SADB_EXT_KEY_ENCRYPT,
SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST,
SADB_EXT_SENSITIVITY,
SADB_X_EXT_NAT_T_TYPE,
SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT,
SADB_X_EXT_NAT_T_OAI, SADB_X_EXT_NAT_T_OAR,
SADB_X_EXT_NAT_T_FRAG,
};
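/*
* The loop below walks dumporder[] from the last entry to the first
* and prepends each newly built extension in front of the ones
* accumulated so far (m_cat(m, tres)), so the finished chain ends
* up in dumporder[] order right after the base sadb_msg.
*/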
uint32_t replay_count;
m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt);
if (m == NULL)
goto fail;
result = m;
for (i = nitems(dumporder) - 1; i >= 0; i--) {
m = NULL;
switch (dumporder[i]) {
case SADB_EXT_SA:
m = key_setsadbsa(sav);
if (!m)
goto fail;
break;
case SADB_X_EXT_SA2:
SECASVAR_LOCK(sav);
replay_count = sav->replay ? sav->replay->count : 0;
SECASVAR_UNLOCK(sav);
m = key_setsadbxsa2(sav->sah->saidx.mode, replay_count,
sav->sah->saidx.reqid);
if (!m)
goto fail;
break;
case SADB_X_EXT_SA_REPLAY:
if (sav->replay == NULL ||
sav->replay->wsize <= UINT8_MAX)
continue;
m = key_setsadbxsareplay(sav->replay->wsize);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_SRC:
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_DST:
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_AUTH:
if (!sav->key_auth)
continue;
m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_ENCRYPT:
if (!sav->key_enc)
continue;
m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_CURRENT:
lft_c.addtime = sav->created;
lft_c.allocations = (uint32_t)counter_u64_fetch(
sav->lft_c_allocations);
lft_c.bytes = counter_u64_fetch(sav->lft_c_bytes);
lft_c.usetime = sav->firstused;
m = key_setlifetime(&lft_c, SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_HARD:
if (!sav->lft_h)
continue;
m = key_setlifetime(sav->lft_h,
SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_SOFT:
if (!sav->lft_s)
continue;
m = key_setlifetime(sav->lft_s,
SADB_EXT_LIFETIME_SOFT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_TYPE:
if (sav->natt == NULL)
continue;
m = key_setsadbxtype(UDP_ENCAP_ESPINUDP);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_DPORT:
if (sav->natt == NULL)
continue;
m = key_setsadbxport(sav->natt->dport,
SADB_X_EXT_NAT_T_DPORT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_SPORT:
if (sav->natt == NULL)
continue;
m = key_setsadbxport(sav->natt->sport,
SADB_X_EXT_NAT_T_SPORT);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_OAI:
if (sav->natt == NULL ||
(sav->natt->flags & IPSEC_NATT_F_OAI) == 0)
continue;
m = key_setsadbaddr(SADB_X_EXT_NAT_T_OAI,
&sav->natt->oai.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_OAR:
if (sav->natt == NULL ||
(sav->natt->flags & IPSEC_NATT_F_OAR) == 0)
continue;
m = key_setsadbaddr(SADB_X_EXT_NAT_T_OAR,
&sav->natt->oar.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_X_EXT_NAT_T_FRAG:
/* We do not (yet) support those. */
continue;
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
/* XXX: should these be brought from the SPD? */
case SADB_EXT_SENSITIVITY:
default:
continue;
}
if (!m)
goto fail;
if (tres)
m_cat(m, tres);
tres = m;
}
m_cat(result, tres);
tres = NULL;
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
m_freem(tres);
return NULL;
}
/*
* set data into sadb_msg.
*/
static struct mbuf *
key_setsadbmsg(u_int8_t type, u_int16_t tlen, u_int8_t satype, u_int32_t seq,
pid_t pid, u_int16_t reserved)
{
struct mbuf *m;
struct sadb_msg *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
if (len > MCLBYTES)
return NULL;
MGETHDR(m, M_NOWAIT, MT_DATA);
if (m && len > MHLEN) {
if (!(MCLGET(m, M_NOWAIT))) {
m_freem(m);
m = NULL;
}
}
if (!m)
return NULL;
m->m_pkthdr.len = m->m_len = len;
m->m_next = NULL;
p = mtod(m, struct sadb_msg *);
bzero(p, len);
p->sadb_msg_version = PF_KEY_V2;
p->sadb_msg_type = type;
p->sadb_msg_errno = 0;
p->sadb_msg_satype = satype;
p->sadb_msg_len = PFKEY_UNIT64(tlen);
p->sadb_msg_reserved = reserved;
p->sadb_msg_seq = seq;
p->sadb_msg_pid = (u_int32_t)pid;
return m;
}
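/*
* Length-unit note (illustrative): PF_KEY expresses lengths in
* 8-byte units, so these helpers pair PFKEY_ALIGN8() (round a byte
* count up to a multiple of 8) with PFKEY_UNIT64() (convert bytes
* to 8-byte units).  For example, struct sadb_msg is 16 bytes, so
* above len = PFKEY_ALIGN8(16) = 16, and a complete message of,
* say, 136 bytes is advertised as sadb_msg_len =
* PFKEY_UNIT64(136) = 17.
*/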
/*
* copy secasvar data into sadb_sa.
*/
static struct mbuf *
key_setsadbsa(struct secasvar *sav)
{
struct mbuf *m;
struct sadb_sa *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_sa));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_sa *);
bzero(p, len);
p->sadb_sa_len = PFKEY_UNIT64(len);
p->sadb_sa_exttype = SADB_EXT_SA;
p->sadb_sa_spi = sav->spi;
p->sadb_sa_replay = sav->replay ?
(sav->replay->wsize > UINT8_MAX ? UINT8_MAX :
sav->replay->wsize): 0;
p->sadb_sa_state = sav->state;
p->sadb_sa_auth = sav->alg_auth;
p->sadb_sa_encrypt = sav->alg_enc;
p->sadb_sa_flags = sav->flags & SADB_KEY_FLAGS_MAX;
return (m);
}
/*
* set data into sadb_address.
*/
static struct mbuf *
key_setsadbaddr(u_int16_t exttype, const struct sockaddr *saddr,
u_int8_t prefixlen, u_int16_t ul_proto)
{
struct mbuf *m;
struct sadb_address *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_address)) +
PFKEY_ALIGN8(saddr->sa_len);
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_address *);
bzero(p, len);
p->sadb_address_len = PFKEY_UNIT64(len);
p->sadb_address_exttype = exttype;
p->sadb_address_proto = ul_proto;
if (prefixlen == FULLMASK) {
switch (saddr->sa_family) {
case AF_INET:
prefixlen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
prefixlen = sizeof(struct in6_addr) << 3;
break;
default:
; /*XXX*/
}
}
p->sadb_address_prefixlen = prefixlen;
p->sadb_address_reserved = 0;
bcopy(saddr,
mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)),
saddr->sa_len);
return m;
}
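/*
* Usage note (illustrative): passing prefixlen == FULLMASK requests
* a host mask, which the code above expands to the full address
* length in bits, so for example
* key_setsadbaddr(SADB_EXT_ADDRESS_SRC, sa, FULLMASK, IPSEC_ULPROTO_ANY)
* yields sadb_address_prefixlen == 32 for an AF_INET sockaddr and
* 128 for AF_INET6.
*/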
/*
* set data into sadb_x_sa2.
*/
static struct mbuf *
key_setsadbxsa2(u_int8_t mode, u_int32_t seq, u_int32_t reqid)
{
struct mbuf *m;
struct sadb_x_sa2 *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_sa2 *);
bzero(p, len);
p->sadb_x_sa2_len = PFKEY_UNIT64(len);
p->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
p->sadb_x_sa2_mode = mode;
p->sadb_x_sa2_reserved1 = 0;
p->sadb_x_sa2_reserved2 = 0;
p->sadb_x_sa2_sequence = seq;
p->sadb_x_sa2_reqid = reqid;
return m;
}
/*
* Set data into sadb_x_sa_replay.
*/
static struct mbuf *
key_setsadbxsareplay(u_int32_t replay)
{
struct mbuf *m;
struct sadb_x_sa_replay *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa_replay));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_sa_replay *);
bzero(p, len);
p->sadb_x_sa_replay_len = PFKEY_UNIT64(len);
p->sadb_x_sa_replay_exttype = SADB_X_EXT_SA_REPLAY;
p->sadb_x_sa_replay_replay = (replay << 3);
return m;
}
/*
* Set a type in sadb_x_nat_t_type.
*/
static struct mbuf *
key_setsadbxtype(u_int16_t type)
{
struct mbuf *m;
size_t len;
struct sadb_x_nat_t_type *p;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_type));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_nat_t_type *);
bzero(p, len);
p->sadb_x_nat_t_type_len = PFKEY_UNIT64(len);
p->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
p->sadb_x_nat_t_type_type = type;
return (m);
}
/*
* Set a port in sadb_x_nat_t_port.
* In contrast to default RFC 2367 behaviour, port is in network byte order.
*/
static struct mbuf *
key_setsadbxport(u_int16_t port, u_int16_t type)
{
struct mbuf *m;
size_t len;
struct sadb_x_nat_t_port *p;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_port));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_nat_t_port *);
bzero(p, len);
p->sadb_x_nat_t_port_len = PFKEY_UNIT64(len);
p->sadb_x_nat_t_port_exttype = type;
p->sadb_x_nat_t_port_port = port;
return (m);
}
/*
* Get port from sockaddr. Port is in network byte order.
*/
uint16_t
key_portfromsaddr(struct sockaddr *sa)
{
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
return ((struct sockaddr_in *)sa)->sin_port;
#endif
#ifdef INET6
case AF_INET6:
return ((struct sockaddr_in6 *)sa)->sin6_port;
#endif
}
return (0);
}
/*
* Set port in struct sockaddr. Port is in network byte order.
*/
void
key_porttosaddr(struct sockaddr *sa, uint16_t port)
{
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
((struct sockaddr_in *)sa)->sin_port = port;
break;
#endif
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)sa)->sin6_port = port;
break;
#endif
default:
ipseclog((LOG_DEBUG, "%s: unexpected address family %d.\n",
__func__, sa->sa_family));
break;
}
}
/*
* set data into sadb_x_policy
*/
static struct mbuf *
key_setsadbxpolicy(u_int16_t type, u_int8_t dir, u_int32_t id, u_int32_t priority)
{
struct mbuf *m;
struct sadb_x_policy *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return (NULL);
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_x_policy *);
bzero(p, len);
p->sadb_x_policy_len = PFKEY_UNIT64(len);
p->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
p->sadb_x_policy_type = type;
p->sadb_x_policy_dir = dir;
p->sadb_x_policy_id = id;
p->sadb_x_policy_priority = priority;
return m;
}
/* %%% utilities */
/* Take a key message (sadb_key) from the socket and turn it into one
* of the kernel's key structures (seckey).
*
* IN: pointer to the source sadb_key, the key data length and malloc type
* OUT: NULL no more memory
*/
struct seckey *
key_dup_keymsg(const struct sadb_key *src, size_t len,
struct malloc_type *type)
{
struct seckey *dst;
dst = malloc(sizeof(*dst), type, M_NOWAIT);
if (dst != NULL) {
dst->bits = src->sadb_key_bits;
dst->key_data = malloc(len, type, M_NOWAIT);
if (dst->key_data != NULL) {
bcopy((const char *)(src + 1), dst->key_data, len);
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
free(dst, type);
dst = NULL;
}
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
}
return (dst);
}
/* Take a lifetime message (sadb_lifetime) passed in on a socket and
* turn it into one of the kernel's lifetime structures (seclifetime).
*
* IN: pointer to the source sadb_lifetime and the malloc type
* OUT: NULL, no more memory
*/
static struct seclifetime *
key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type)
{
struct seclifetime *dst;
dst = malloc(sizeof(*dst), type, M_NOWAIT);
if (dst == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (NULL);
}
dst->allocations = src->sadb_lifetime_allocations;
dst->bytes = src->sadb_lifetime_bytes;
dst->addtime = src->sadb_lifetime_addtime;
dst->usetime = src->sadb_lifetime_usetime;
return (dst);
}
/*
* Compare two secasindex structures.
* The flag selects how strict the comparison is: CMP_EXACTLY compares
* everything, including ports, while the other modes skip mode and/or
* reqid and never compare ports.
* IN:
* saidx0: source, it can be in SAD.
* saidx1: object.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpsaidx(const struct secasindex *saidx0, const struct secasindex *saidx1,
int flag)
{
/* sanity */
if (saidx0 == NULL && saidx1 == NULL)
return 1;
if (saidx0 == NULL || saidx1 == NULL)
return 0;
if (saidx0->proto != saidx1->proto)
return 0;
if (flag == CMP_EXACTLY) {
if (saidx0->mode != saidx1->mode)
return 0;
if (saidx0->reqid != saidx1->reqid)
return 0;
if (bcmp(&saidx0->src, &saidx1->src,
saidx0->src.sa.sa_len) != 0 ||
bcmp(&saidx0->dst, &saidx1->dst,
saidx0->dst.sa.sa_len) != 0)
return 0;
} else {
/* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */
if (flag == CMP_MODE_REQID || flag == CMP_REQID) {
/*
* If reqid of SPD is non-zero, unique SA is required.
* The result must be of same reqid in this case.
*/
if (saidx1->reqid != 0 &&
saidx0->reqid != saidx1->reqid)
return 0;
}
if (flag == CMP_MODE_REQID) {
if (saidx0->mode != IPSEC_MODE_ANY
&& saidx0->mode != saidx1->mode)
return 0;
}
if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, 0) != 0)
return 0;
if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, 0) != 0)
return 0;
}
return 1;
}
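/*
* Comparison modes implemented above (summary derived from the code
* rather than from a formal spec):
*
*   CMP_EXACTLY     proto, mode, reqid and a full binary compare of
*                   both sockaddrs (ports included) must all match.
*   CMP_MODE_REQID  proto and addresses (ports ignored); reqid must
*                   match only when saidx1->reqid != 0; mode must
*                   match unless saidx0->mode is IPSEC_MODE_ANY.
*   CMP_REQID       like CMP_MODE_REQID, but the mode is ignored.
*   CMP_HEAD        only proto and addresses are compared.
*/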
/*
* Compare two secpolicyindex structures exactly.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from PFKEY message.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_exactly(struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->prefs != spidx1->prefs
|| spidx0->prefd != spidx1->prefd
|| spidx0->ul_proto != spidx1->ul_proto)
return 0;
return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 &&
key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0;
}
/*
* Compare two secpolicyindex structures, applying the prefix masks.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from IP header.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_withmask(struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family ||
spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family ||
spidx0->src.sa.sa_len != spidx1->src.sa.sa_len ||
spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len)
return 0;
/* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */
if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY
&& spidx0->ul_proto != spidx1->ul_proto)
return 0;
switch (spidx0->src.sa.sa_family) {
case AF_INET:
if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->src.sin.sin_port != spidx1->src.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->src.sin.sin_addr,
&spidx1->src.sin.sin_addr, spidx0->prefs))
return 0;
break;
case AF_INET6:
if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->src.sin6.sin6_scope_id &&
spidx1->src.sin6.sin6_scope_id &&
spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->src.sin6.sin6_addr,
&spidx1->src.sin6.sin6_addr, spidx0->prefs))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0)
return 0;
break;
}
switch (spidx0->dst.sa.sa_family) {
case AF_INET:
if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->dst.sin.sin_addr,
&spidx1->dst.sin.sin_addr, spidx0->prefd))
return 0;
break;
case AF_INET6:
if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->dst.sin6.sin6_scope_id &&
spidx1->dst.sin6.sin6_scope_id &&
spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr,
&spidx1->dst.sin6.sin6_addr, spidx0->prefd))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0)
return 0;
break;
}
/* XXX Do we check other fields, e.g. flowinfo? */
return 1;
}
#ifdef satosin
#undef satosin
#endif
#define satosin(s) ((const struct sockaddr_in *)s)
#ifdef satosin6
#undef satosin6
#endif
#define satosin6(s) ((const struct sockaddr_in6 *)s)
/* returns 0 on match */
int
key_sockaddrcmp(const struct sockaddr *sa1, const struct sockaddr *sa2,
int port)
{
if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
return 1;
switch (sa1->sa_family) {
#ifdef INET
case AF_INET:
if (sa1->sa_len != sizeof(struct sockaddr_in))
return 1;
if (satosin(sa1)->sin_addr.s_addr !=
satosin(sa2)->sin_addr.s_addr) {
return 1;
}
if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port)
return 1;
break;
#endif
#ifdef INET6
case AF_INET6:
if (sa1->sa_len != sizeof(struct sockaddr_in6))
return 1; /*EINVAL*/
if (satosin6(sa1)->sin6_scope_id !=
satosin6(sa2)->sin6_scope_id) {
return 1;
}
if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr,
&satosin6(sa2)->sin6_addr)) {
return 1;
}
if (port &&
satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) {
return 1;
}
break;
#endif
default:
if (bcmp(sa1, sa2, sa1->sa_len) != 0)
return 1;
break;
}
return 0;
}
/* returns 0 on match */
int
key_sockaddrcmp_withmask(const struct sockaddr *sa1,
const struct sockaddr *sa2, size_t mask)
{
if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
return (1);
switch (sa1->sa_family) {
#ifdef INET
case AF_INET:
return (!key_bbcmp(&satosin(sa1)->sin_addr,
&satosin(sa2)->sin_addr, mask));
#endif
#ifdef INET6
case AF_INET6:
if (satosin6(sa1)->sin6_scope_id !=
satosin6(sa2)->sin6_scope_id)
return (1);
return (!key_bbcmp(&satosin6(sa1)->sin6_addr,
&satosin6(sa2)->sin6_addr, mask));
#endif
}
return (1);
}
#undef satosin
#undef satosin6
/*
* compare two buffers with mask.
* IN:
* addr1: source
* addr2: object
* bits: Number of bits to compare
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_bbcmp(const void *a1, const void *a2, u_int bits)
{
const unsigned char *p1 = a1;
const unsigned char *p2 = a2;
/* XXX: This could be considerably faster if we compared a word
* at a time, but that is complicated on little-endian machines. */
/* Handle null pointers */
if (p1 == NULL || p2 == NULL)
return (p1 == p2);
while (bits >= 8) {
if (*p1++ != *p2++)
return 0;
bits -= 8;
}
if (bits > 0) {
u_int8_t mask = ~((1<<(8-bits))-1);
if ((*p1 & mask) != (*p2 & mask))
return 0;
}
return 1; /* Match! */
}
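/*
* Worked example (illustrative only): key_bbcmp(a1, a2, 20) compares
* the first two bytes in full and then masks the third byte with
* ~((1 << (8 - 4)) - 1) == 0xf0, so only its top four bits must
* match.  The IPv4 addresses 10.1.16.5 and 10.1.31.200 therefore
* compare equal under a 20-bit prefix (both lie in 10.1.16.0/20),
* while 10.1.32.1 would not.
*/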
static void
key_flush_spd(time_t now)
{
SPTREE_RLOCK_TRACKER;
struct secpolicy_list drainq;
struct secpolicy *sp, *nextsp;
u_int dir;
LIST_INIT(&drainq);
SPTREE_RLOCK();
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
TAILQ_FOREACH(sp, &V_sptree[dir], chain) {
if (sp->lifetime == 0 && sp->validtime == 0)
continue;
if ((sp->lifetime &&
now - sp->created > sp->lifetime) ||
(sp->validtime &&
now - sp->lastused > sp->validtime)) {
/* Hold extra reference to send SPDEXPIRE */
SP_ADDREF(sp);
LIST_INSERT_HEAD(&drainq, sp, drainq);
}
}
}
SPTREE_RUNLOCK();
if (LIST_EMPTY(&drainq))
return;
SPTREE_WLOCK();
sp = LIST_FIRST(&drainq);
while (sp != NULL) {
nextsp = LIST_NEXT(sp, drainq);
/* Check that SP is still linked */
if (sp->state != IPSEC_SPSTATE_ALIVE) {
LIST_REMOVE(sp, drainq);
key_freesp(&sp); /* release extra reference */
sp = nextsp;
continue;
}
TAILQ_REMOVE(&V_sptree[sp->spidx.dir], sp, chain);
LIST_REMOVE(sp, idhash);
sp->state = IPSEC_SPSTATE_DEAD;
sp = nextsp;
}
V_sp_genid++;
SPTREE_WUNLOCK();
sp = LIST_FIRST(&drainq);
while (sp != NULL) {
nextsp = LIST_NEXT(sp, drainq);
key_spdexpire(sp);
key_freesp(&sp); /* release extra reference */
key_freesp(&sp); /* release last reference */
sp = nextsp;
}
}
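/*
* Both key_flush_spd() above and key_flush_sad() below follow the
* same three-phase pattern: scan the tree under the read lock and
* collect candidates on a local drain queue while taking an extra
* reference; re-check and unlink each candidate under the write
* lock (entries may have been changed by another thread in the
* meantime); and finally send the EXPIRE/SPDEXPIRE notifications
* and drop the references with no tree lock held.
*/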
static void
key_flush_sad(time_t now)
{
SAHTREE_RLOCK_TRACKER;
struct secashead_list emptyq;
struct secasvar_list drainq, hexpireq, sexpireq, freeq;
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
LIST_INIT(&drainq);
LIST_INIT(&hexpireq);
LIST_INIT(&sexpireq);
LIST_INIT(&emptyq);
SAHTREE_RLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
/* Check for empty SAH */
if (TAILQ_EMPTY(&sah->savtree_larval) &&
TAILQ_EMPTY(&sah->savtree_alive)) {
SAH_ADDREF(sah);
LIST_INSERT_HEAD(&emptyq, sah, drainq);
continue;
}
/* Add all stale LARVAL SAs into drainq */
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
if (now - sav->created < V_key_larval_lifetime)
continue;
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&drainq, sav, drainq);
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
/* lifetimes aren't specified */
if (sav->lft_h == NULL)
continue;
SECASVAR_LOCK(sav);
/*
* Check again with lock held, because it may
* be updated by SADB_UPDATE.
*/
if (sav->lft_h == NULL) {
SECASVAR_UNLOCK(sav);
continue;
}
/*
* RFC 2367:
* HARD lifetimes MUST take precedence over SOFT
* lifetimes, meaning if the HARD and SOFT lifetimes
* are the same, the HARD lifetime will appear on the
* EXPIRE message.
*/
/* check HARD lifetime */
if ((sav->lft_h->addtime != 0 &&
now - sav->created > sav->lft_h->addtime) ||
(sav->lft_h->usetime != 0 && sav->firstused &&
now - sav->firstused > sav->lft_h->usetime) ||
(sav->lft_h->bytes != 0 && counter_u64_fetch(
sav->lft_c_bytes) > sav->lft_h->bytes)) {
SECASVAR_UNLOCK(sav);
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&hexpireq, sav, drainq);
continue;
}
/* check SOFT lifetime (only for MATURE SAs) */
if (sav->state == SADB_SASTATE_MATURE && (
(sav->lft_s->addtime != 0 &&
now - sav->created > sav->lft_s->addtime) ||
(sav->lft_s->usetime != 0 && sav->firstused &&
now - sav->firstused > sav->lft_s->usetime) ||
(sav->lft_s->bytes != 0 && counter_u64_fetch(
sav->lft_c_bytes) > sav->lft_s->bytes))) {
SECASVAR_UNLOCK(sav);
SAV_ADDREF(sav);
LIST_INSERT_HEAD(&sexpireq, sav, drainq);
continue;
}
SECASVAR_UNLOCK(sav);
}
}
SAHTREE_RUNLOCK();
if (LIST_EMPTY(&emptyq) && LIST_EMPTY(&drainq) &&
LIST_EMPTY(&hexpireq) && LIST_EMPTY(&sexpireq))
return;
LIST_INIT(&freeq);
SAHTREE_WLOCK();
/* Unlink stale LARVAL SAs */
sav = LIST_FIRST(&drainq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is still LARVAL */
if (sav->state != SADB_SASTATE_LARVAL) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sav = nextsav;
}
/* Unlink all SAs with expired HARD lifetime */
sav = LIST_FIRST(&hexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is not unlinked */
if (sav->state == SADB_SASTATE_DEAD) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
sav = nextsav;
}
/* Mark all SAs with expired SOFT lifetime as DYING */
sav = LIST_FIRST(&sexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
/* Check that SA is not unlinked */
if (sav->state == SADB_SASTATE_DEAD) {
LIST_REMOVE(sav, drainq);
LIST_INSERT_HEAD(&freeq, sav, drainq);
sav = nextsav;
continue;
}
/*
* NOTE: this doesn't change SA order in the chain.
*/
sav->state = SADB_SASTATE_DYING;
sav = nextsav;
}
/* Unlink empty SAHs */
sah = LIST_FIRST(&emptyq);
while (sah != NULL) {
nextsah = LIST_NEXT(sah, drainq);
/* Check that SAH is still empty and not unlinked */
if (sah->state == SADB_SASTATE_DEAD ||
!TAILQ_EMPTY(&sah->savtree_larval) ||
!TAILQ_EMPTY(&sah->savtree_alive)) {
LIST_REMOVE(sah, drainq);
key_freesah(&sah); /* release extra reference */
sah = nextsah;
continue;
}
TAILQ_REMOVE(&V_sahtree, sah, chain);
LIST_REMOVE(sah, addrhash);
sah->state = SADB_SASTATE_DEAD;
sah = nextsah;
}
SAHTREE_WUNLOCK();
/* Send EXPIRE messages */
sav = LIST_FIRST(&hexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_expire(sav, 1);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release extra reference */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
sav = LIST_FIRST(&sexpireq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_expire(sav, 0);
key_freesav(&sav); /* release extra reference */
sav = nextsav;
}
/* Free stale LARVAL SAs */
sav = LIST_FIRST(&drainq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release extra reference */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
/* Free SAs that were unlinked/changed by someone else */
sav = LIST_FIRST(&freeq);
while (sav != NULL) {
nextsav = LIST_NEXT(sav, drainq);
key_freesav(&sav); /* release extra reference */
sav = nextsav;
}
/* Free empty SAH */
sah = LIST_FIRST(&emptyq);
while (sah != NULL) {
nextsah = LIST_NEXT(sah, drainq);
key_freesah(&sah); /* release extra reference */
key_freesah(&sah); /* release last reference */
sah = nextsah;
}
}
static void
key_flush_acq(time_t now)
{
struct secacq *acq, *nextacq;
/* ACQ tree */
ACQ_LOCK();
acq = LIST_FIRST(&V_acqtree);
while (acq != NULL) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime) {
LIST_REMOVE(acq, chain);
LIST_REMOVE(acq, addrhash);
LIST_REMOVE(acq, seqhash);
free(acq, M_IPSEC_SAQ);
}
acq = nextacq;
}
ACQ_UNLOCK();
}
static void
key_flush_spacq(time_t now)
{
struct secspacq *acq, *nextacq;
/* SP ACQ tree */
SPACQ_LOCK();
for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime
&& __LIST_CHAINED(acq)) {
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
}
}
SPACQ_UNLOCK();
}
/*
* Timer handler.
* Scan the SPD and SAD, check the status of each entry, and
* remove or expire entries as needed.
* XXX: the year 2038 problem may remain.
*/
static void
key_timehandler(void *arg)
{
VNET_ITERATOR_DECL(vnet_iter);
time_t now = time_second;
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
key_flush_spd(now);
key_flush_sad(now);
key_flush_acq(now);
key_flush_spacq(now);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
#ifndef IPSEC_DEBUG2
/* reschedule ourselves for the next tick */
callout_schedule(&key_timer, hz);
#endif /* IPSEC_DEBUG2 */
}
u_long
key_random()
{
u_long value;
key_randomfill(&value, sizeof(value));
return value;
}
void
key_randomfill(void *p, size_t l)
{
size_t n;
u_long v;
static int warn = 1;
n = 0;
n = (size_t)read_random(p, (u_int)l);
/* last resort */
while (n < l) {
v = random();
bcopy(&v, (u_int8_t *)p + n,
l - n < sizeof(v) ? l - n : sizeof(v));
n += sizeof(v);
if (warn) {
printf("WARNING: pseudo-random number generator "
"used for IPsec processing\n");
warn = 0;
}
}
}
/*
* map SADB_SATYPE_* to IPPROTO_*.
* SADB_SATYPE_UNSPEC is mapped to IPSEC_PROTO_ANY.
* OUT:
* 0: invalid satype.
*/
static uint8_t
key_satype2proto(uint8_t satype)
{
switch (satype) {
case SADB_SATYPE_UNSPEC:
return IPSEC_PROTO_ANY;
case SADB_SATYPE_AH:
return IPPROTO_AH;
case SADB_SATYPE_ESP:
return IPPROTO_ESP;
case SADB_X_SATYPE_IPCOMP:
return IPPROTO_IPCOMP;
case SADB_X_SATYPE_TCPSIGNATURE:
return IPPROTO_TCP;
default:
return 0;
}
/* NOTREACHED */
}
/*
* map IPPROTO_* to SADB_SATYPE_*
* OUT:
* 0: invalid protocol type.
*/
static uint8_t
key_proto2satype(uint8_t proto)
{
switch (proto) {
case IPPROTO_AH:
return SADB_SATYPE_AH;
case IPPROTO_ESP:
return SADB_SATYPE_ESP;
case IPPROTO_IPCOMP:
return SADB_X_SATYPE_IPCOMP;
case IPPROTO_TCP:
return SADB_X_SATYPE_TCPSIGNATURE;
default:
return 0;
}
/* NOTREACHED */
}
/* %%% PF_KEY */
/*
* SADB_GETSPI processing: receive
* <base, (SA2), src address, dst address, (SPI range)>
* from the IKMPd, assign a unique SPI value, hang the new SA on the
* tree in the LARVAL state, and send
* <base, SA(*), address(SD)>
* back to the IKMPd.
*
* IN: mhp: pointer to the pointer to each header.
* OUT: 0 on success, otherwise an error code; the reply (or an error
* message) is sent back to the IKMPd.
*/
static int
key_getspi(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct secasvar *sav;
uint32_t reqid, spi;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST)
#ifdef PFKEY_STRICT_CHECKS
|| SADB_CHECKHDR(mhp, SADB_EXT_SPIRANGE)
#endif
) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
error = EINVAL;
goto fail;
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)
#ifdef PFKEY_STRICT_CHECKS
|| SADB_CHECKLEN(mhp, SADB_EXT_SPIRANGE)
#endif
) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
error = EINVAL;
goto fail;
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
error = EINVAL;
goto fail;
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
error = EINVAL;
goto fail;
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
error = EINVAL;
goto fail;
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* SPI allocation */
spi = key_do_getnewspi(
(struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], &saidx);
if (spi == 0) {
/*
* Requested SPI or SPI range is not available or
* already used.
*/
error = EEXIST;
goto fail;
}
sav = key_newsav(mhp, &saidx, spi, &error);
if (sav == NULL)
goto fail;
if (sav->seq != 0) {
/*
* RFC2367:
* If the SADB_GETSPI message is in response to a
* kernel-generated SADB_ACQUIRE, the sadb_msg_seq
* MUST be the same as the SADB_ACQUIRE message.
*
* XXXAE: However it doesn't define how to check this, what to do
* if it doesn't match, or what to do if it does match.
*
* We could compare the saidx used in SADB_ACQUIRE with the saidx
* used in SADB_GETSPI, but that could break existing software.
* For now just warn if it doesn't match.
*
* XXXAE: anyway, it looks useless.
*/
key_acqdone(&saidx, sav->seq);
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
{
struct mbuf *n, *nn;
struct sadb_sa *m_sa;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) +
PFKEY_ALIGN8(sizeof(struct sadb_sa));
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n) {
error = ENOBUFS;
goto fail;
}
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off);
m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa));
m_sa->sadb_sa_exttype = SADB_EXT_SA;
m_sa->sadb_sa_spi = spi; /* SPI is already in network byte order */
off += PFKEY_ALIGN8(sizeof(struct sadb_sa));
IPSEC_ASSERT(off == len,
("length inconsistency (off %u len %u)", off, len));
n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST);
if (!n->m_next) {
m_freem(n);
error = ENOBUFS;
goto fail;
}
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_seq = sav->seq;
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
fail:
return (key_senderror(so, m, error));
}
/*
* Allocate a new SPI; called by key_getspi().
* OUT:
* 0: failure.
* others: success, SPI in network byte order.
*/
static uint32_t
key_do_getnewspi(struct sadb_spirange *spirange, struct secasindex *saidx)
{
uint32_t min, max, newspi, t;
int count = V_key_spi_trycnt;
/* set spi range to allocate */
if (spirange != NULL) {
min = spirange->sadb_spirange_min;
max = spirange->sadb_spirange_max;
} else {
min = V_key_spi_minval;
max = V_key_spi_maxval;
}
/* IPCOMP needs 2-byte SPI */
if (saidx->proto == IPPROTO_IPCOMP) {
if (min >= 0x10000)
min = 0xffff;
if (max >= 0x10000)
max = 0xffff;
if (min > max) {
t = min; min = max; max = t;
}
}
if (min == max) {
if (!key_checkspidup(htonl(min))) {
ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n",
__func__, min));
return 0;
}
count--; /* taking one cost. */
newspi = min;
} else {
/* init SPI */
newspi = 0;
/* an SPI range was requested; pick a random value within it */
while (count--) {
/* generate a pseudo-random SPI value within the range. */
newspi = min + (key_random() % (max - min + 1));
if (!key_checkspidup(htonl(newspi)))
break;
}
if (count == 0 || newspi == 0) {
ipseclog((LOG_DEBUG,
"%s: failed to allocate SPI.\n", __func__));
return 0;
}
}
/* statistics */
keystat.getspi_count =
(keystat.getspi_count + V_key_spi_trycnt - count) / 2;
return (htonl(newspi));
}
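/*
* Example (illustrative only): with no SADB_EXT_SPIRANGE supplied,
* the allocator draws up to V_key_spi_trycnt random values from
* [V_key_spi_minval, V_key_spi_maxval] and keeps the first one for
* which key_checkspidup(htonl(newspi)) finds no existing SA.  The
* result is returned already converted to network byte order, so
* key_getspi() can store it directly into sav->spi and into the
* sadb_sa_spi field of its reply.
*/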
/*
* Find TCP-MD5 SA with corresponding secasindex.
* If not found, return NULL and fill SPI with usable value if needed.
*/
static struct secasvar *
key_getsav_tcpmd5(struct secasindex *saidx, uint32_t *spi)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
IPSEC_ASSERT(saidx->proto == IPPROTO_TCP, ("wrong proto"));
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (sah->saidx.proto != IPPROTO_TCP)
continue;
if (!key_sockaddrcmp(&saidx->dst.sa, &sah->saidx.dst.sa, 0) &&
!key_sockaddrcmp(&saidx->src.sa, &sah->saidx.src.sa, 0))
break;
}
if (sah != NULL) {
if (V_key_preferred_oldsa)
sav = TAILQ_LAST(&sah->savtree_alive, secasvar_queue);
else
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav != NULL) {
SAV_ADDREF(sav);
SAHTREE_RUNLOCK();
return (sav);
}
}
if (spi == NULL) {
/* No SPI required */
SAHTREE_RUNLOCK();
return (NULL);
}
/* Check that SPI is unique */
LIST_FOREACH(sav, SAVHASH_HASH(*spi), spihash) {
if (sav->spi == *spi)
break;
}
if (sav == NULL) {
SAHTREE_RUNLOCK();
/* SPI is already unique */
return (NULL);
}
SAHTREE_RUNLOCK();
/* XXX: not optimal */
*spi = key_do_getnewspi(NULL, saidx);
return (NULL);
}
static int
key_updateaddresses(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp, struct secasvar *sav,
struct secasindex *saidx)
{
struct sockaddr *newaddr;
struct secashead *sah;
struct secasvar *newsav, *tmp;
struct mbuf *n;
int error, isnew;
/* Check that we need to change SAH */
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC)) {
newaddr = (struct sockaddr *)(
((struct sadb_address *)
mhp->ext[SADB_X_EXT_NEW_ADDRESS_SRC]) + 1);
bcopy(newaddr, &saidx->src, newaddr->sa_len);
key_porttosaddr(&saidx->src.sa, 0);
}
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST)) {
newaddr = (struct sockaddr *)(
((struct sadb_address *)
mhp->ext[SADB_X_EXT_NEW_ADDRESS_DST]) + 1);
bcopy(newaddr, &saidx->dst, newaddr->sa_len);
key_porttosaddr(&saidx->dst.sa, 0);
}
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST)) {
error = key_checksockaddrs(&saidx->src.sa, &saidx->dst.sa);
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid new sockaddr.\n",
__func__));
return (error);
}
sah = key_getsah(saidx);
if (sah == NULL) {
/* create a new SA index */
sah = key_newsah(saidx);
if (sah == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
return (ENOBUFS);
}
isnew = 2; /* SAH is new */
} else
isnew = 1; /* existing SAH is referenced */
} else {
/*
* src and dst addresses are still the same.
* Do we want to change NAT-T config?
*/
if (sav->sah->saidx.proto != IPPROTO_ESP ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_SPORT) ||
SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_DPORT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return (EINVAL);
}
/* We hold reference to SA, thus SAH will be referenced too. */
sah = sav->sah;
isnew = 0;
}
newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA,
M_NOWAIT | M_ZERO);
if (newsav == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
error = ENOBUFS;
goto fail;
}
/* Clone SA's content into newsav */
SAV_INITREF(newsav);
bcopy(sav, newsav, offsetof(struct secasvar, chain));
/*
* We create a new NAT-T config if it is needed.
* The old NAT-T config will be freed by key_cleansav() when
* the last reference to the SA is released.
*/
newsav->natt = NULL;
newsav->sah = sah;
newsav->state = SADB_SASTATE_MATURE;
error = key_setnatt(newsav, mhp);
if (error != 0)
goto fail;
SAHTREE_WLOCK();
/* Check that SA is still alive */
if (sav->state == SADB_SASTATE_DEAD) {
/* SA was unlinked */
SAHTREE_WUNLOCK();
error = ESRCH;
goto fail;
}
/* Unlink SA from SAH and SPI hash */
IPSEC_ASSERT((sav->flags & SADB_X_EXT_F_CLONED) == 0,
("SA is already cloned"));
IPSEC_ASSERT(sav->state == SADB_SASTATE_MATURE ||
sav->state == SADB_SASTATE_DYING,
("Wrong SA state %u\n", sav->state));
TAILQ_REMOVE(&sav->sah->savtree_alive, sav, chain);
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
/*
* Link the new SA with the SAH. Keep SAs ordered by
* creation time (newest first).
*/
TAILQ_FOREACH(tmp, &sah->savtree_alive, chain) {
if (newsav->created > tmp->created) {
TAILQ_INSERT_BEFORE(tmp, newsav, chain);
break;
}
}
if (tmp == NULL)
TAILQ_INSERT_TAIL(&sah->savtree_alive, newsav, chain);
/* Add new SA into SPI hash. */
LIST_INSERT_HEAD(SAVHASH_HASH(newsav->spi), newsav, spihash);
/* Add new SAH into SADB. */
if (isnew == 2) {
TAILQ_INSERT_HEAD(&V_sahtree, sah, chain);
LIST_INSERT_HEAD(SAHADDRHASH_HASH(saidx), sah, addrhash);
sah->state = SADB_SASTATE_MATURE;
SAH_ADDREF(sah); /* newsav references new SAH */
}
/*
* isnew == 1 -> @sah was referenced by key_getsah().
* isnew == 0 -> we reuse the same @sah that was used by @sav,
* and transfer its reference to @newsav.
*/
SECASVAR_LOCK(sav);
/* XXX: replace cntr with pointer? */
newsav->cntr = sav->cntr;
sav->flags |= SADB_X_EXT_F_CLONED;
SECASVAR_UNLOCK(sav);
SAHTREE_WUNLOCK();
KEYDBG(KEY_STAMP,
printf("%s: SA(%p) cloned into SA(%p)\n",
__func__, sav, newsav));
KEYDBG(KEY_DATA, kdebug_secasv(newsav));
key_freesav(&sav); /* release last reference */
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
m_freem(m);
key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
return (0);
fail:
if (isnew != 0)
key_freesah(&sah);
if (newsav != NULL) {
if (newsav->natt != NULL)
free(newsav->natt, M_IPSEC_MISC);
free(newsav, M_IPSEC_SA);
}
return (error);
}
/*
* SADB_UPDATE processing
* receive
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd, update a secasvar entry whose status is
* SADB_SASTATE_LARVAL, and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_update(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint32_t reqid;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/*
* Only SADB_SASTATE_MATURE SAs may be submitted in an
* SADB_UPDATE message.
*/
if (sa0->sadb_sa_state != SADB_SASTATE_MATURE) {
ipseclog((LOG_DEBUG, "%s: invalid state.\n", __func__));
#ifdef PFKEY_STRICT_CHECKS
return key_senderror(so, m, EINVAL);
#endif
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found for SPI %u\n",
__func__, ntohl(sa0->sadb_sa_spi)));
return key_senderror(so, m, EINVAL);
}
/*
* Check that SADB_UPDATE was issued by the same process that did
* SADB_GETSPI or SADB_ADD.
*/
if (sav->pid != mhp->msg->sadb_msg_pid) {
ipseclog((LOG_DEBUG,
"%s: pid mismatched (SPI %u, pid %u vs. %u)\n", __func__,
ntohl(sav->spi), sav->pid, mhp->msg->sadb_msg_pid));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/* saidx should match with SA. */
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_MODE_REQID) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u",
__func__, ntohl(sav->spi)));
key_freesav(&sav);
return key_senderror(so, m, ESRCH);
}
if (sav->state == SADB_SASTATE_LARVAL) {
if ((mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT)) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/*
* We can set any values except src, dst and SPI.
*/
error = key_setsaval(sav, mhp);
if (error != 0) {
key_freesav(&sav);
return (key_senderror(so, m, error));
}
/* Change SA state to MATURE */
SAHTREE_WLOCK();
if (sav->state != SADB_SASTATE_LARVAL) {
/* SA was deleted or another thread made it MATURE. */
SAHTREE_WUNLOCK();
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
/*
* NOTE: we keep SAs in savtree_alive ordered by creation
* time. When an SA's state changes from LARVAL to MATURE,
* we update its creation time in key_setsaval() and move
* it to the head of savtree_alive.
*/
TAILQ_REMOVE(&sav->sah->savtree_larval, sav, chain);
TAILQ_INSERT_HEAD(&sav->sah->savtree_alive, sav, chain);
sav->state = SADB_SASTATE_MATURE;
SAHTREE_WUNLOCK();
} else {
/*
* For DYING and MATURE SAs we can change only the state
* and lifetimes. Report EINVAL if anything else is being
* changed.
*/
if (!SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT) ||
!SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH)) {
key_freesav(&sav);
return (key_senderror(so, m, EINVAL));
}
error = key_updatelifetimes(sav, mhp);
if (error != 0) {
key_freesav(&sav);
return (key_senderror(so, m, error));
}
/*
* This is a FreeBSD extension to RFC 2367.
* IKEd can specify SADB_X_EXT_NEW_ADDRESS_SRC and/or
* SADB_X_EXT_NEW_ADDRESS_DST when it wants to change the
* SA addresses (for example, to implement the MOBIKE protocol
* described in RFC 4555). We also allow the NAT-T
* configuration to be changed.
*/
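/*
* Illustrative example (not defined by RFC 2367): an IKE daemon moving
* an SA to a new address pair would send an SADB_UPDATE along the lines
* of
* <base, SA, address(SD), SADB_X_EXT_NEW_ADDRESS_SRC,
* SADB_X_EXT_NEW_ADDRESS_DST(, SADB_X_EXT_NAT_T_TYPE,
* SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT)>
* which is handled by key_updateaddresses() below.
*/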
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_SRC) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NEW_ADDRESS_DST) ||
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) ||
sav->natt != NULL) {
error = key_updateaddresses(so, m, mhp, sav, &saidx);
key_freesav(&sav);
if (error != 0)
return (key_senderror(so, m, error));
return (0);
}
/* Check that SA is still alive */
SAHTREE_WLOCK();
if (sav->state == SADB_SASTATE_DEAD) {
/* SA was unlinked */
SAHTREE_WUNLOCK();
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
/*
* NOTE: there is possible state moving from DYING to MATURE,
* but this doesn't change created time, so we won't reorder
* this SA.
*/
sav->state = SADB_SASTATE_MATURE;
SAHTREE_WUNLOCK();
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
key_freesav(&sav);
{
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_ADD processing
* add an entry to the SA database when we receive
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd,
* and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* IGNORE identity and sensitivity messages.
*
* m will always be freed.
*/
static int
key_add(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint32_t reqid, spi;
uint8_t mode, proto;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && (
SADB_CHECKHDR(mhp, SADB_EXT_KEY_ENCRYPT) ||
SADB_CHECKLEN(mhp, SADB_EXT_KEY_ENCRYPT))) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && (
SADB_CHECKHDR(mhp, SADB_EXT_KEY_AUTH) ||
SADB_CHECKLEN(mhp, SADB_EXT_KEY_AUTH))) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT)) ||
(SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_SOFT) &&
!SADB_CHECKHDR(mhp, SADB_EXT_LIFETIME_HARD))) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
/*
* Only SADB_SASTATE_MATURE SAs may be submitted in an
* SADB_ADD message.
*/
if (sa0->sadb_sa_state != SADB_SASTATE_MATURE) {
ipseclog((LOG_DEBUG, "%s: invalid state.\n", __func__));
#ifdef PFKEY_STRICT_CHECKS
return key_senderror(so, m, EINVAL);
#endif
}
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, error);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
spi = sa0->sadb_sa_spi;
/*
* For TCP-MD5 SAs we don't use the SPI; check uniqueness using
* the secasindex.
* XXXAE: IPComp also seems not to use the SPI.
*/
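/*
* Sketch of the TCP-MD5 case: key_getsav_tcpmd5() looks the SA up by
* secasindex and, when no SA exists, may hand back a kernel-chosen SPI
* through *spi; a returned spi of 0 means no SPI could be allocated.
*/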
if (proto == IPPROTO_TCP) {
sav = key_getsav_tcpmd5(&saidx, &spi);
if (sav == NULL && spi == 0) {
/* Failed to allocate SPI */
ipseclog((LOG_DEBUG, "%s: SA already exists.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
/* XXX: the SPI that we report back may have a different value */
} else {
/* We can create new SA only if SPI is different. */
sav = key_getsavbyspi(spi);
}
if (sav != NULL) {
key_freesav(&sav);
ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__));
return key_senderror(so, m, EEXIST);
}
sav = key_newsav(mhp, &saidx, spi, &error);
if (sav == NULL)
return key_senderror(so, m, error);
KEYDBG(KEY_STAMP,
printf("%s: return SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
/*
* If SADB_ADD was in response to SADB_ACQUIRE, we need to schedule
* ACQ for deletion.
*/
if (sav->seq != 0)
key_acqdone(&saidx, sav->seq);
{
/*
* Don't call key_freesav() on error here, as we would like to
* keep the SA in the database.
*/
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* NAT-T support.
* IKEd may request the use of ESP in UDP encapsulation when it detects the
* presence of NAT. It uses NAT-T extension headers for such SAs to specify
* parameters needed for encapsulation and decapsulation. These PF_KEY
* extension headers are not standardized, so this comment addresses our
* implementation.
* SADB_X_EXT_NAT_T_TYPE specifies the type of encapsulation; we support only
* UDP_ENCAP_ESPINUDP as described in RFC3948.
* SADB_X_EXT_NAT_T_SPORT/DPORT specify the source and destination ports for
* the UDP header. We use these ports in the UDP encapsulation procedure and
* can also check them in the UDP decapsulation procedure.
* SADB_X_EXT_NAT_T_OA[IR] specify the original address of the initiator or
* responder. These addresses can be used in transport mode to adjust
* checksums after decapsulation and decryption. Since the IP addresses
* originally used by the peer usually differ from what we see (NAT is
* present), the TCP/UDP pseudo-header checksum and the IP header checksum
* were calculated using the original addresses. After decapsulation and
* decryption we need to adjust the checksum to obtain a correct datagram.
*
* We expect presence of NAT-T extension headers only in SADB_ADD and
* SADB_UPDATE messages. We report NAT-T extension headers in replies
* to SADB_ADD, SADB_UPDATE, SADB_GET, and SADB_DUMP messages.
*/
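/*
* Checksum adjustment sketch (informative): for each translated address,
* key_setnatt() below folds, with in_addword(),
*
*	cksum += ~(address from the SA's saidx) + (original address)
*
* into sav->natt->cksum. This is the per-address delta needed to
* substitute the peer's original address for the address in the saidx
* when fixing up TCP/UDP checksums after decapsulation, in the style of
* RFC 1624 incremental checksum updates.
*/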
static int
key_setnatt(struct secasvar *sav, const struct sadb_msghdr *mhp)
{
struct sadb_x_nat_t_port *port;
struct sadb_x_nat_t_type *type;
struct sadb_address *oai, *oar;
struct sockaddr *sa;
uint32_t addr;
uint16_t cksum;
IPSEC_ASSERT(sav->natt == NULL, ("natt is already initialized"));
/*
* Ignore NAT-T headers if sproto isn't ESP.
*/
if (sav->sah->saidx.proto != IPPROTO_ESP)
return (0);
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_TYPE) &&
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_SPORT) &&
!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_DPORT)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_TYPE) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_SPORT) ||
SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_DPORT)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
} else
return (0);
type = (struct sadb_x_nat_t_type *)mhp->ext[SADB_X_EXT_NAT_T_TYPE];
if (type->sadb_x_nat_t_type_type != UDP_ENCAP_ESPINUDP) {
ipseclog((LOG_DEBUG, "%s: unsupported NAT-T type %u.\n",
__func__, type->sadb_x_nat_t_type_type));
return (EINVAL);
}
/*
* Allocate storage for NAT-T config.
* On error it will be released by key_cleansav().
*/
sav->natt = malloc(sizeof(struct secnatt), M_IPSEC_MISC,
M_NOWAIT | M_ZERO);
if (sav->natt == NULL) {
PFKEYSTAT_INC(in_nomem);
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return (ENOBUFS);
}
port = (struct sadb_x_nat_t_port *)mhp->ext[SADB_X_EXT_NAT_T_SPORT];
if (port->sadb_x_nat_t_port_port == 0) {
ipseclog((LOG_DEBUG, "%s: invalid NAT-T sport specified.\n",
__func__));
return (EINVAL);
}
sav->natt->sport = port->sadb_x_nat_t_port_port;
port = (struct sadb_x_nat_t_port *)mhp->ext[SADB_X_EXT_NAT_T_DPORT];
if (port->sadb_x_nat_t_port_port == 0) {
ipseclog((LOG_DEBUG, "%s: invalid NAT-T dport specified.\n",
__func__));
return (EINVAL);
}
sav->natt->dport = port->sadb_x_nat_t_port_port;
/*
* SADB_X_EXT_NAT_T_OAI and SADB_X_EXT_NAT_T_OAR are optional
* and needed only for transport mode IPsec.
* Usually NAT translates only one address, but it is possible
* that both addresses are translated.
* NOTE: the value of SADB_X_EXT_NAT_T_OAI is equal to SADB_X_EXT_NAT_T_OA.
*/
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_OAI)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_OAI)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
oai = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI];
} else
oai = NULL;
if (!SADB_CHECKHDR(mhp, SADB_X_EXT_NAT_T_OAR)) {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_NAT_T_OAR)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return (EINVAL);
}
oar = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR];
} else
oar = NULL;
/* Initialize addresses only for transport mode */
if (sav->sah->saidx.mode != IPSEC_MODE_TUNNEL) {
cksum = 0;
if (oai != NULL) {
/* Currently we support only AF_INET */
sa = (struct sockaddr *)(oai + 1);
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in)) {
ipseclog((LOG_DEBUG,
"%s: wrong NAT-OAi header.\n",
__func__));
return (EINVAL);
}
/* Ignore the address if it is unchanged */
if (((struct sockaddr_in *)sa)->sin_addr.s_addr !=
sav->sah->saidx.src.sin.sin_addr.s_addr) {
bcopy(sa, &sav->natt->oai.sa, sa->sa_len);
sav->natt->flags |= IPSEC_NATT_F_OAI;
/* Calculate checksum delta */
addr = sav->sah->saidx.src.sin.sin_addr.s_addr;
cksum = in_addword(cksum, ~addr >> 16);
cksum = in_addword(cksum, ~addr & 0xffff);
addr = sav->natt->oai.sin.sin_addr.s_addr;
cksum = in_addword(cksum, addr >> 16);
cksum = in_addword(cksum, addr & 0xffff);
}
}
if (oar != NULL) {
/* Currently we support only AF_INET */
sa = (struct sockaddr *)(oar + 1);
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in)) {
ipseclog((LOG_DEBUG,
"%s: wrong NAT-OAr header.\n",
__func__));
return (EINVAL);
}
/* Ignore the address if it is unchanged */
if (((struct sockaddr_in *)sa)->sin_addr.s_addr !=
sav->sah->saidx.dst.sin.sin_addr.s_addr) {
bcopy(sa, &sav->natt->oar.sa, sa->sa_len);
sav->natt->flags |= IPSEC_NATT_F_OAR;
/* Calculate checksum delta */
addr = sav->sah->saidx.dst.sin.sin_addr.s_addr;
cksum = in_addword(cksum, ~addr >> 16);
cksum = in_addword(cksum, ~addr & 0xffff);
addr = sav->natt->oar.sin.sin_addr.s_addr;
cksum = in_addword(cksum, addr >> 16);
cksum = in_addword(cksum, addr & 0xffff);
}
}
sav->natt->cksum = cksum;
}
return (0);
}
static int
key_setident(struct secashead *sah, const struct sadb_msghdr *mhp)
{
const struct sadb_ident *idsrc, *iddst;
- int idsrclen, iddstlen;
IPSEC_ASSERT(sah != NULL, ("null secashead"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* don't make buffer if not there */
if (SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_SRC) &&
SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_DST)) {
sah->idents = NULL;
sah->identd = NULL;
return (0);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_IDENTITY_DST)) {
ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__));
return (EINVAL);
}
idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC];
iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST];
- idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC];
- iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST];
/* validity check */
if (idsrc->sadb_ident_type != iddst->sadb_ident_type) {
ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__));
return EINVAL;
}
switch (idsrc->sadb_ident_type) {
case SADB_IDENTTYPE_PREFIX:
case SADB_IDENTTYPE_FQDN:
case SADB_IDENTTYPE_USERFQDN:
default:
/* XXX do nothing */
sah->idents = NULL;
sah->identd = NULL;
return 0;
}
/* make structure */
sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->idents == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->identd == NULL) {
free(sah->idents, M_IPSEC_MISC);
sah->idents = NULL;
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->idents->type = idsrc->sadb_ident_type;
sah->idents->id = idsrc->sadb_ident_id;
sah->identd->type = iddst->sadb_ident_type;
sah->identd->id = iddst->sadb_ident_id;
return 0;
}
/*
* m will not be freed on return.
* It is the caller's responsibility to free the result.
*
* Called from SADB_ADD and SADB_UPDATE. Reply will contain headers
* from the request in defined order.
*/
static struct mbuf *
key_getmsgbuf_x1(struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct mbuf *n;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 16, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_X_EXT_SA2,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST,
SADB_X_EXT_NAT_T_TYPE, SADB_X_EXT_NAT_T_SPORT,
SADB_X_EXT_NAT_T_DPORT, SADB_X_EXT_NAT_T_OAI,
SADB_X_EXT_NAT_T_OAR, SADB_X_EXT_NEW_ADDRESS_SRC,
SADB_X_EXT_NEW_ADDRESS_DST);
if (!n)
return NULL;
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return NULL;
}
mtod(n, struct sadb_msg *)->sadb_msg_errno = 0;
mtod(n, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(n->m_pkthdr.len);
return n;
}
/*
* SADB_DELETE processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and set SADB_SASTATE_DEAD,
* and send,
* <base, SA(*), address(SD)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_delete(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct secasvar *sav;
struct sadb_sa *sa0;
uint8_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return (key_senderror(so, m, EINVAL));
}
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
if (SADB_CHECKHDR(mhp, SADB_EXT_SA)) {
/*
* Caller wants us to delete all non-LARVAL SAs
* that match the src/dst. This is used during
* IKE INITIAL-CONTACT.
* XXXAE: this looks like an extension to RFC 2367.
*/
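/*
* For illustration, such a request carries only
* <base, address(SD)>
* with no SADB_EXT_SA header, so every matching non-LARVAL SA for the
* address pair is flushed by key_delete_all().
*/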
ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__));
return (key_delete_all(so, m, mhp, &saidx));
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return (key_senderror(so, m, EINVAL));
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
if (proto == IPPROTO_TCP)
sav = key_getsav_tcpmd5(&saidx, NULL);
else
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found for SPI %u.\n",
__func__, ntohl(sa0->sadb_sa_spi)));
return (key_senderror(so, m, ESRCH));
}
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_HEAD) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u.\n",
__func__, ntohl(sav->spi)));
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
key_unlinksav(sav);
key_freesav(&sav);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* delete all SAs for src/dst. Called from key_delete().
*/
static int
key_delete_all(struct socket *so, struct mbuf *m,
const struct sadb_msghdr *mhp, struct secasindex *saidx)
{
struct secasvar_queue drainq;
struct secashead *sah;
struct secasvar *sav, *nextsav;
TAILQ_INIT(&drainq);
SAHTREE_WLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, saidx, CMP_HEAD) == 0)
continue;
/* Move all ALIVE SAs into drainq */
TAILQ_CONCAT(&drainq, &sah->savtree_alive, chain);
}
/* Unlink all queued SAs from SPI hash */
TAILQ_FOREACH(sav, &drainq, chain) {
sav->state = SADB_SASTATE_DEAD;
LIST_REMOVE(sav, spihash);
}
SAHTREE_WUNLOCK();
/* Now we can release reference for all SAs in drainq */
sav = TAILQ_FIRST(&drainq);
while (sav != NULL) {
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
nextsav = TAILQ_NEXT(sav, chain);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* Delete all alive SAs for the corresponding xform.
* Larval SAs have not yet initialized tdb_xform, so it is safe to leave
* them in place when the xform disappears.
*/
static void
key_delete_xform(const struct xformsw *xsp)
{
struct secasvar_queue drainq;
struct secashead *sah;
struct secasvar *sav, *nextsav;
TAILQ_INIT(&drainq);
SAHTREE_WLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
sav = TAILQ_FIRST(&sah->savtree_alive);
if (sav == NULL)
continue;
if (sav->tdb_xform != xsp)
continue;
/*
* All SAs in the chain are assumed to be related to a single
* xform.
*/
TAILQ_CONCAT(&drainq, &sah->savtree_alive, chain);
}
/* Unlink all queued SAs from SPI hash */
TAILQ_FOREACH(sav, &drainq, chain) {
sav->state = SADB_SASTATE_DEAD;
LIST_REMOVE(sav, spihash);
}
SAHTREE_WUNLOCK();
/* Now we can release reference for all SAs in drainq */
sav = TAILQ_FIRST(&drainq);
while (sav != NULL) {
KEYDBG(KEY_STAMP,
printf("%s: SA(%p)\n", __func__, sav));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
nextsav = TAILQ_NEXT(sav, chain);
key_freesah(&sav->sah); /* release reference from SAV */
key_freesav(&sav); /* release last reference */
sav = nextsav;
}
}
/*
* SADB_GET processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and get an SA to respond with,
* and send,
* <base, SA, (lifetime(HSC),) address(SD), (address(P),) key(AE),
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_get(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secasindex saidx;
struct sadb_address *src0, *dst0;
struct sadb_sa *sa0;
struct secasvar *sav;
uint8_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_SA) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_SA) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
if (key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1)) != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
if (proto == IPPROTO_TCP)
sav = key_getsav_tcpmd5(&saidx, NULL);
else
sav = key_getsavbyspi(sa0->sadb_sa_spi);
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
return key_senderror(so, m, ESRCH);
}
if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_HEAD) == 0) {
ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u.\n",
__func__, ntohl(sa0->sadb_sa_spi)));
key_freesav(&sav);
return (key_senderror(so, m, ESRCH));
}
{
struct mbuf *n;
uint8_t satype;
/* map proto to satype */
if ((satype = key_proto2satype(sav->sah->saidx.proto)) == 0) {
ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n",
__func__));
key_freesav(&sav);
return key_senderror(so, m, EINVAL);
}
/* create new sadb_msg to reply. */
n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq,
mhp->msg->sadb_msg_pid);
key_freesav(&sav);
if (!n)
return key_senderror(so, m, ENOBUFS);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
/* XXX make it sysctl-configurable? */
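/*
* Resulting combination lifetimes (as called from key_getcomb_*(),
* which zero the sadb_comb first): hard addtime 86400s, soft usetime
* 28800s, one allocation each, no byte limits. Note that soft_addtime
* and hard_usetime below scale the previous (zeroed) contents and thus
* end up as 0.
*/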
static void
key_getcomb_setlifetime(struct sadb_comb *comb)
{
comb->sadb_comb_soft_allocations = 1;
comb->sadb_comb_hard_allocations = 1;
comb->sadb_comb_soft_bytes = 0;
comb->sadb_comb_hard_bytes = 0;
comb->sadb_comb_hard_addtime = 86400; /* 1 day */
comb->sadb_comb_soft_addtime = comb->sadb_comb_soft_addtime * 80 / 100;
comb->sadb_comb_soft_usetime = 28800; /* 8 hours */
comb->sadb_comb_hard_usetime = comb->sadb_comb_hard_usetime * 80 / 100;
}
/*
* XXX reorder combinations by preference
* XXX no idea if the user wants ESP authentication or not
*/
static struct mbuf *
key_getcomb_ealg(void)
{
struct sadb_comb *comb;
const struct enc_xform *algo;
struct mbuf *result = NULL, *m, *n;
int encmin;
int i, off, o;
int totlen;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_EALG_MAX; i++) {
algo = enc_algorithm_lookup(i);
if (algo == NULL)
continue;
/* discard algorithms with key size smaller than system min */
if (_BITS(algo->maxkey) < V_ipsec_esp_keymin)
continue;
if (_BITS(algo->minkey) < V_ipsec_esp_keymin)
encmin = V_ipsec_esp_keymin;
else
encmin = _BITS(algo->minkey);
if (V_ipsec_esp_auth)
m = key_getcomb_ah();
else {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
bzero(mtod(m, caddr_t), m->m_len);
}
}
if (!m)
goto fail;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l));
for (off = 0; off < totlen; off += l) {
n = m_pulldown(m, off, l, &o);
if (!n) {
/* m is already freed */
goto fail;
}
comb = (struct sadb_comb *)(mtod(n, caddr_t) + o);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
comb->sadb_comb_encrypt_minbits = encmin;
comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey);
}
if (!result)
result = m;
else
m_cat(result, m);
}
return result;
fail:
if (result)
m_freem(result);
return NULL;
}
static void
key_getsizes_ah(const struct auth_hash *ah, int alg, u_int16_t* min,
u_int16_t* max)
{
*min = *max = ah->hashsize;
if (ah->keysize == 0) {
/*
* Transform takes arbitrary key size but algorithm
* key size is restricted. Enforce this here.
*/
switch (alg) {
case SADB_X_AALG_MD5: *min = *max = 16; break;
case SADB_X_AALG_SHA: *min = *max = 20; break;
case SADB_X_AALG_NULL: *min = 1; *max = 256; break;
case SADB_X_AALG_SHA2_256: *min = *max = 32; break;
case SADB_X_AALG_SHA2_384: *min = *max = 48; break;
case SADB_X_AALG_SHA2_512: *min = *max = 64; break;
default:
DPRINTF(("%s: unknown AH algorithm %u\n",
__func__, alg));
break;
}
}
}
/*
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ah()
{
const struct auth_hash *algo;
struct sadb_comb *comb;
struct mbuf *m;
u_int16_t minkeysize, maxkeysize;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_AALG_MAX; i++) {
#if 1
/* we prefer HMAC algorithms, not old algorithms */
if (i != SADB_AALG_SHA1HMAC &&
i != SADB_AALG_MD5HMAC &&
i != SADB_X_AALG_SHA2_256 &&
i != SADB_X_AALG_SHA2_384 &&
i != SADB_X_AALG_SHA2_512)
continue;
#endif
algo = auth_algorithm_lookup(i);
if (!algo)
continue;
key_getsizes_ah(algo, i, &minkeysize, &maxkeysize);
/* discard algorithms with key size smaller than system min */
if (_BITS(minkeysize) < V_ipsec_ah_keymin)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_auth = i;
comb->sadb_comb_auth_minbits = _BITS(minkeysize);
comb->sadb_comb_auth_maxbits = _BITS(maxkeysize);
}
return m;
}
/*
* Not really official behavior; discussed on pf_key@inner.net in Sep 2000.
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ipcomp()
{
const struct comp_algo *algo;
struct sadb_comb *comb;
struct mbuf *m;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_X_CALG_MAX; i++) {
algo = comp_algorithm_lookup(i);
if (!algo)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_NOWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
/* what should we set into sadb_comb_*_{min,max}bits? */
}
return m;
}
/*
* XXX no way to pass mode (transport/tunnel) to userland
* XXX replay checking?
* XXX sysctl interface to ipsec_{ah,esp}_keymin
*/
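/*
* Layout sketch of the extension built here:
*
*	struct sadb_prop	(sadb_prop_exttype = SADB_EXT_PROPOSAL)
*	struct sadb_comb	(one per acceptable algorithm, produced by
*	struct sadb_comb	 key_getcomb_ealg()/_ah()/_ipcomp())
*	...
*
* sadb_prop_len covers the whole chain and each sadb_comb carries the
* default lifetimes from key_getcomb_setlifetime().
*/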
static struct mbuf *
key_getprop(const struct secasindex *saidx)
{
struct sadb_prop *prop;
struct mbuf *m, *n;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop));
int totlen;
switch (saidx->proto) {
case IPPROTO_ESP:
m = key_getcomb_ealg();
break;
case IPPROTO_AH:
m = key_getcomb_ah();
break;
case IPPROTO_IPCOMP:
m = key_getcomb_ipcomp();
break;
default:
return NULL;
}
if (!m)
return NULL;
M_PREPEND(m, l, M_NOWAIT);
if (!m)
return NULL;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
prop = mtod(m, struct sadb_prop *);
bzero(prop, sizeof(*prop));
prop->sadb_prop_len = PFKEY_UNIT64(totlen);
prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
prop->sadb_prop_replay = 32; /* XXX */
return m;
}
/*
* SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2().
* send
* <base, SA, address(SD), (address(P)), x_policy,
* (identity(SD),) (sensitivity,) proposal>
* to KMD, and expect to receive
* <base> with SADB_ACQUIRE if error occurred,
* or
* <base, src address, dst address, (SPI range)> with SADB_GETSPI
* from KMD by PF_KEY.
*
* XXX x_policy is outside of RFC2367 (KAME extension).
* XXX sensitivity is not supported.
* XXX for ipcomp, RFC2367 does not define how to fill in proposal.
* see comment for key_getcomb_ipcomp().
*
* OUT:
* 0 : succeed
* others: error number
*/
static int
key_acquire(const struct secasindex *saidx, struct secpolicy *sp)
{
union sockaddr_union addr;
struct mbuf *result, *m;
uint32_t seq;
int error;
uint16_t ul_proto;
uint8_t mask, satype;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
satype = key_proto2satype(saidx->proto);
IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto));
error = -1;
result = NULL;
ul_proto = IPSEC_ULPROTO_ANY;
/* Get the seq number to decide whether or not to send a message. */
seq = key_getacq(saidx, &error);
if (seq == 0)
return (error);
m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/*
* Set sadb_address extensions for the saidx addresses.
*
* Note that if sp is supplied, then we're being called from
* key_allocsa_policy() and should supply port and protocol
* information.
* XXXAE: why only TCP and UDP? ICMP and SCTP look applicable too.
* XXXAE: probably we can handle this in the ipsec[46]_allocsa().
* XXXAE: it looks like we should save this info in the ACQ entry.
*/
if (sp != NULL && (sp->spidx.ul_proto == IPPROTO_TCP ||
sp->spidx.ul_proto == IPPROTO_UDP))
ul_proto = sp->spidx.ul_proto;
addr = saidx->src;
mask = FULLMASK;
if (ul_proto != IPSEC_ULPROTO_ANY) {
switch (sp->spidx.src.sa.sa_family) {
case AF_INET:
if (sp->spidx.src.sin.sin_port != IPSEC_PORT_ANY) {
addr.sin.sin_port = sp->spidx.src.sin.sin_port;
mask = sp->spidx.prefs;
}
break;
case AF_INET6:
if (sp->spidx.src.sin6.sin6_port != IPSEC_PORT_ANY) {
addr.sin6.sin6_port =
sp->spidx.src.sin6.sin6_port;
mask = sp->spidx.prefs;
}
break;
default:
break;
}
}
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &addr.sa, mask, ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
addr = saidx->dst;
mask = FULLMASK;
if (ul_proto != IPSEC_ULPROTO_ANY) {
switch (sp->spidx.dst.sa.sa_family) {
case AF_INET:
if (sp->spidx.dst.sin.sin_port != IPSEC_PORT_ANY) {
addr.sin.sin_port = sp->spidx.dst.sin.sin_port;
mask = sp->spidx.prefd;
}
break;
case AF_INET6:
if (sp->spidx.dst.sin6.sin6_port != IPSEC_PORT_ANY) {
addr.sin6.sin6_port =
sp->spidx.dst.sin6.sin6_port;
mask = sp->spidx.prefd;
}
break;
default:
break;
}
}
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &addr.sa, mask, ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* XXX proxy address (optional) */
/* set sadb_x_policy */
if (sp != NULL) {
m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id,
sp->priority);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
}
/* XXX identity (optional) */
#if 0
if (idexttype && fqdn) {
/* create identity extension (FQDN) */
struct sadb_ident *id;
int fqdnlen;
fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_FQDN;
bcopy(fqdn, id + 1, fqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen);
}
if (idexttype) {
/* create identity extension (USERFQDN) */
struct sadb_ident *id;
int userfqdnlen;
if (userfqdn) {
/* +1 for terminating-NUL */
userfqdnlen = strlen(userfqdn) + 1;
} else
userfqdnlen = 0;
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN;
/* XXX is it correct? */
if (curproc && curproc->p_cred)
id->sadb_ident_id = curproc->p_cred->p_ruid;
if (userfqdn && userfqdnlen)
bcopy(userfqdn, id + 1, userfqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen);
}
#endif
/* XXX sensitivity (optional) */
/* create proposal/combination extension */
m = key_getprop(saidx);
#if 0
/*
* Spec conformant: always attach the proposal/combination extension.
* The problem is that we have no way to attach it for IPComp,
* due to the way sadb_comb is declared in RFC 2367.
*/
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
#else
/*
* outside of spec; make proposal/combination extension optional.
*/
if (m)
m_cat(result, m);
#endif
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
KEYDBG(KEY_STAMP,
printf("%s: SP(%p)\n", __func__, sp));
KEYDBG(KEY_DATA, kdebug_secasindex(saidx, NULL));
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
static uint32_t
key_newacq(const struct secasindex *saidx, int *perror)
{
struct secacq *acq;
uint32_t seq;
acq = malloc(sizeof(*acq), M_IPSEC_SAQ, M_NOWAIT | M_ZERO);
if (acq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
*perror = ENOBUFS;
return (0);
}
/* copy secindex */
bcopy(saidx, &acq->saidx, sizeof(acq->saidx));
acq->created = time_second;
acq->count = 0;
/* add to acqtree */
ACQ_LOCK();
seq = acq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq);
LIST_INSERT_HEAD(&V_acqtree, acq, chain);
LIST_INSERT_HEAD(ACQADDRHASH_HASH(saidx), acq, addrhash);
LIST_INSERT_HEAD(ACQSEQHASH_HASH(seq), acq, seqhash);
ACQ_UNLOCK();
*perror = 0;
return (seq);
}
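/*
* Return the sequence number to use in an SADB_ACQUIRE for this saidx,
* or 0 if the message should be suppressed. An existing ACQ entry
* absorbs up to V_key_blockacq_count requests before another message is
* sent; when no entry matches, a new one is allocated via key_newacq().
*/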
static uint32_t
key_getacq(const struct secasindex *saidx, int *perror)
{
struct secacq *acq;
uint32_t seq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQADDRHASH_HASH(saidx), addrhash) {
if (key_cmpsaidx(&acq->saidx, saidx, CMP_EXACTLY)) {
if (acq->count > V_key_blockacq_count) {
/*
* Reset counter and send message.
* Also reset created time to keep ACQ for
* this saidx.
*/
acq->created = time_second;
acq->count = 0;
seq = acq->seq;
} else {
/*
* Increment the counter and do nothing. We send
* an SADB_ACQUIRE message only once per
* V_key_blockacq_count packets.
*/
acq->count++;
seq = 0;
}
break;
}
}
ACQ_UNLOCK();
if (acq != NULL) {
*perror = 0;
return (seq);
}
/* allocate new entry */
return (key_newacq(saidx, perror));
}
static int
key_acqreset(uint32_t seq)
{
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQSEQHASH_HASH(seq), seqhash) {
if (acq->seq == seq) {
acq->count = 0;
acq->created = time_second;
break;
}
}
ACQ_UNLOCK();
if (acq == NULL)
return (ESRCH);
return (0);
}
/*
* Mark ACQ entry as stale to remove it in key_flush_acq().
* Called after successful SADB_GETSPI message.
*/
static int
key_acqdone(const struct secasindex *saidx, uint32_t seq)
{
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, ACQSEQHASH_HASH(seq), seqhash) {
if (acq->seq == seq)
break;
}
if (acq != NULL) {
if (key_cmpsaidx(&acq->saidx, saidx, CMP_EXACTLY) == 0) {
ipseclog((LOG_DEBUG,
"%s: Mismatched saidx for ACQ %u", __func__, seq));
acq = NULL;
} else {
acq->created = 0;
}
} else {
ipseclog((LOG_DEBUG,
"%s: ACQ %u is not found.", __func__, seq));
}
ACQ_UNLOCK();
if (acq == NULL)
return (ESRCH);
return (0);
}
static struct secspacq *
key_newspacq(struct secpolicyindex *spidx)
{
struct secspacq *acq;
/* get new entry */
acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
if (acq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return NULL;
}
/* copy secindex */
bcopy(spidx, &acq->spidx, sizeof(acq->spidx));
acq->created = time_second;
acq->count = 0;
/* add to spacqtree */
SPACQ_LOCK();
LIST_INSERT_HEAD(&V_spacqtree, acq, chain);
SPACQ_UNLOCK();
return acq;
}
static struct secspacq *
key_getspacq(struct secpolicyindex *spidx)
{
struct secspacq *acq;
SPACQ_LOCK();
LIST_FOREACH(acq, &V_spacqtree, chain) {
if (key_cmpspidx_exactly(spidx, &acq->spidx)) {
/* NB: return holding spacq_lock */
return acq;
}
}
SPACQ_UNLOCK();
return NULL;
}
/*
* SADB_ACQUIRE processing.
* In the first situation, we receive
* <base>
* from the ikmpd and clear the sequence number of its secasvar entry.
*
* In the second situation, we receive
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* from a userland process and return
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* to the socket.
*
* m will always be freed.
*/
static int
key_acquire2(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SAHTREE_RLOCK_TRACKER;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
uint32_t reqid;
int error;
uint8_t mode, proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/*
* Error message from the KMd.
* We assume that if an error occurred in IKEd, the length of the PFKEY
* message is equal to the size of the sadb_msg structure.
* We do not raise an error even if one occurred in this function.
*/
if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) {
/* check sequence number */
if (mhp->msg->sadb_msg_seq == 0 ||
mhp->msg->sadb_msg_errno == 0) {
ipseclog((LOG_DEBUG, "%s: must specify sequence "
"number and errno.\n", __func__));
} else {
/*
* IKEd reported that an error occurred.
* XXXAE: what does it expect from the kernel?
* Probably we should send SADB_ACQUIRE again?
* If so, reset the ACQ's state.
* XXXAE: it looks useless.
*/
key_acqreset(mhp->msg->sadb_msg_seq);
}
m_freem(m);
return (0);
}
/*
* This message is from user land.
*/
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKHDR(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKHDR(mhp, SADB_EXT_PROPOSAL)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: missing required header.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_SRC) ||
SADB_CHECKLEN(mhp, SADB_EXT_ADDRESS_DST) ||
SADB_CHECKLEN(mhp, SADB_EXT_PROPOSAL)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n", __func__));
return key_senderror(so, m, EINVAL);
}
if (SADB_CHECKHDR(mhp, SADB_X_EXT_SA2)) {
mode = IPSEC_MODE_ANY;
reqid = 0;
} else {
if (SADB_CHECKLEN(mhp, SADB_X_EXT_SA2)) {
ipseclog((LOG_DEBUG,
"%s: invalid message: wrong header size.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
mode = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)
mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
error = key_checksockaddrs((struct sockaddr *)(src0 + 1),
(struct sockaddr *)(dst0 + 1));
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: invalid sockaddr.\n", __func__));
return key_senderror(so, m, EINVAL);
}
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* get a SA index */
SAHTREE_RLOCK();
LIST_FOREACH(sah, SAHADDRHASH_HASH(&saidx), addrhash) {
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID))
break;
}
SAHTREE_RUNLOCK();
if (sah != NULL) {
ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__));
return key_senderror(so, m, EEXIST);
}
error = key_acquire(&saidx, NULL);
if (error != 0) {
ipseclog((LOG_DEBUG,
"%s: error %d returned from key_acquire()\n",
__func__, error));
return key_senderror(so, m, error);
}
m_freem(m);
return (0);
}
/*
* SADB_REGISTER processing.
* If SATYPE_UNSPEC has been passed as satype, only return sadb_supported.
* receive
* <base>
* from the ikmpd, and register a socket to send PF_KEY messages,
* and send
* <base, supported>
* to KMD by PF_KEY.
* When the socket is detached, its regnode must be freed.
*
* m will always be freed.
*/
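/*
* Reply layout sketch (built in the setmsg block below):
*
*	struct sadb_msg
*	struct sadb_supported	(SADB_EXT_SUPPORTED_AUTH)
*	struct sadb_alg ...	(one per auth_algorithm_lookup() hit)
*	struct sadb_supported	(SADB_EXT_SUPPORTED_ENCRYPT)
*	struct sadb_alg ...	(one per enc_algorithm_lookup() hit)
*
* Either supported section is omitted when no algorithms of that kind
* are available.
*/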
static int
key_register(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secreg *reg, *newreg = NULL;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* check for invalid register message */
if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0]))
return key_senderror(so, m, EINVAL);
/* When SATYPE_UNSPEC is specified, only return sadb_supported. */
if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC)
goto setmsg;
/* check whether existing or not */
REGTREE_LOCK();
LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) {
if (reg->so == so) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: socket exists already.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
}
/* create regnode */
newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO);
if (newreg == NULL) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
newreg->so = so;
((struct keycb *)sotorawcb(so))->kp_registered++;
/* add regnode to regtree. */
LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain);
REGTREE_UNLOCK();
setmsg:
{
struct mbuf *n;
struct sadb_msg *newmsg;
struct sadb_supported *sup;
u_int len, alen, elen;
int off;
int i;
struct sadb_alg *alg;
/* create new sadb_msg to reply. */
alen = 0;
for (i = 1; i <= SADB_AALG_MAX; i++) {
if (auth_algorithm_lookup(i))
alen += sizeof(struct sadb_alg);
}
if (alen)
alen += sizeof(struct sadb_supported);
elen = 0;
for (i = 1; i <= SADB_EALG_MAX; i++) {
if (enc_algorithm_lookup(i))
elen += sizeof(struct sadb_alg);
}
if (elen)
elen += sizeof(struct sadb_supported);
len = sizeof(struct sadb_msg) + alen + elen;
if (len > MCLBYTES)
return key_senderror(so, m, ENOBUFS);
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n != NULL && len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_pkthdr.len = n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(len);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
/* for authentication algorithm */
if (alen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(alen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_AALG_MAX; i++) {
const struct auth_hash *aalgo;
u_int16_t minkeysize, maxkeysize;
aalgo = auth_algorithm_lookup(i);
if (!aalgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = 0;
key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize);
alg->sadb_alg_minbits = _BITS(minkeysize);
alg->sadb_alg_maxbits = _BITS(maxkeysize);
off += PFKEY_ALIGN8(sizeof(*alg));
}
}
/* for encryption algorithm */
if (elen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(elen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_EALG_MAX; i++) {
const struct enc_xform *ealgo;
ealgo = enc_algorithm_lookup(i);
if (!ealgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = ealgo->ivsize;
alg->sadb_alg_minbits = _BITS(ealgo->minkey);
alg->sadb_alg_maxbits = _BITS(ealgo->maxkey);
off += PFKEY_ALIGN8(sizeof(struct sadb_alg));
}
}
IPSEC_ASSERT(off == len,
("length assumption failed (off %u len %u)", off, len));
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED);
}
}
/*
* Free the secreg entries registered for a socket.
* XXX: the regnode of a socket that issued SADB_REGISTER must be freed
* when the socket is detached.
*/
void
key_freereg(struct socket *so)
{
struct secreg *reg;
int i;
IPSEC_ASSERT(so != NULL, ("NULL so"));
/*
* Check whether the registration exists or not.
* Check all SA types, because one socket may be registered
* for multiple SA types.
*/
REGTREE_LOCK();
for (i = 0; i <= SADB_SATYPE_MAX; i++) {
LIST_FOREACH(reg, &V_regtree[i], chain) {
if (reg->so == so && __LIST_CHAINED(reg)) {
LIST_REMOVE(reg, chain);
free(reg, M_IPSEC_SAR);
break;
}
}
}
REGTREE_UNLOCK();
}
/*
* SADB_EXPIRE processing
* send
* <base, SA, SA2, lifetime(C and one of HS), address(SD)>
* to KMD by PF_KEY.
* NOTE: either the soft or the hard lifetime extension is sent,
* depending on which limit expired; see the layout sketch below.
*
* OUT: 0 : succeed
* others : error number
*/
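/*
* Message layout sketch for the chain built below:
*	<base, SA, SA2, (X_SA_REPLAY,) lifetime(CURRENT),
*	 lifetime(SOFT or HARD), address(S), address(D)>
* The X_SA_REPLAY extension is added only when the replay window does
* not fit into the 8-bit sadb_sa_replay field.
*/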
static int
key_expire(struct secasvar *sav, int hard)
{
struct mbuf *result = NULL, *m;
struct sadb_lifetime *lt;
uint32_t replay_count;
int error, len;
uint8_t satype;
IPSEC_ASSERT (sav != NULL, ("null sav"));
IPSEC_ASSERT (sav->sah != NULL, ("null sa header"));
KEYDBG(KEY_STAMP,
printf("%s: SA(%p) expired %s lifetime\n", __func__,
sav, hard ? "hard": "soft"));
KEYDBG(KEY_DATA, kdebug_secasv(sav));
/* set msg header */
satype = key_proto2satype(sav->sah->saidx.proto);
IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype));
m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create SA extension */
m = key_setsadbsa(sav);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* create SA2 extension */
SECASVAR_LOCK(sav);
replay_count = sav->replay ? sav->replay->count : 0;
SECASVAR_UNLOCK(sav);
m = key_setsadbxsa2(sav->sah->saidx.mode, replay_count,
sav->sah->saidx.reqid);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if (sav->replay && sav->replay->wsize > UINT8_MAX) {
m = key_setsadbxsareplay(sav->replay->wsize);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
}
/* create lifetime extension (current and soft) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL) {
error = ENOBUFS;
goto fail;
}
m_align(m, len);
m->m_len = len;
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations =
(uint32_t)counter_u64_fetch(sav->lft_c_allocations);
lt->sadb_lifetime_bytes =
counter_u64_fetch(sav->lft_c_bytes);
lt->sadb_lifetime_addtime = sav->created;
lt->sadb_lifetime_usetime = sav->firstused;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
if (hard) {
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
lt->sadb_lifetime_allocations = sav->lft_h->allocations;
lt->sadb_lifetime_bytes = sav->lft_h->bytes;
lt->sadb_lifetime_addtime = sav->lft_h->addtime;
lt->sadb_lifetime_usetime = sav->lft_h->usetime;
} else {
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
lt->sadb_lifetime_allocations = sav->lft_s->allocations;
lt->sadb_lifetime_bytes = sav->lft_s->bytes;
lt->sadb_lifetime_addtime = sav->lft_s->addtime;
lt->sadb_lifetime_usetime = sav->lft_s->usetime;
}
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/*
* XXX-BZ Handle NAT-T extensions here.
* XXXAE: it doesn't seem quite useful. IKEs should not depend on
* this information, we report only significant SA fields.
*/
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
static void
key_freesah_flushed(struct secashead_queue *flushq)
{
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
sah = TAILQ_FIRST(flushq);
while (sah != NULL) {
sav = TAILQ_FIRST(&sah->savtree_larval);
while (sav != NULL) {
nextsav = TAILQ_NEXT(sav, chain);
TAILQ_REMOVE(&sah->savtree_larval, sav, chain);
key_freesav(&sav); /* release last reference */
key_freesah(&sah); /* release reference from SAV */
sav = nextsav;
}
sav = TAILQ_FIRST(&sah->savtree_alive);
while (sav != NULL) {
nextsav = TAILQ_NEXT(sav, chain);
TAILQ_REMOVE(&sah->savtree_alive, sav, chain);
key_freesav(&sav); /* release last reference */
key_freesah(&sah); /* release reference from SAV */
sav = nextsav;
}
nextsah = TAILQ_NEXT(sah, chain);
key_freesah(&sah); /* release last reference */
sah = nextsah;
}
}
/*
* SADB_FLUSH processing
* receive
* <base>
* from the ikmpd, and free all entries in secastree.
* and send,
* <base>
* to the ikmpd.
* NOTE: all we do here is mark entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_flush(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
struct secashead_queue flushq;
struct sadb_msg *newmsg;
struct secashead *sah, *nextsah;
struct secasvar *sav;
uint8_t proto;
int i;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
KEYDBG(KEY_STAMP,
printf("%s: proto %u\n", __func__, proto));
TAILQ_INIT(&flushq);
if (proto == IPSEC_PROTO_ANY) {
/* no SATYPE specified, i.e. flushing all SA. */
SAHTREE_WLOCK();
/* Move all SAHs into flushq */
TAILQ_CONCAT(&flushq, &V_sahtree, chain);
/* Flush all buckets in SPI hash */
for (i = 0; i < V_savhash_mask + 1; i++)
LIST_INIT(&V_savhashtbl[i]);
/* Flush all buckets in SAHADDRHASH */
for (i = 0; i < V_sahaddrhash_mask + 1; i++)
LIST_INIT(&V_sahaddrhashtbl[i]);
/* Mark all SAHs as unlinked */
TAILQ_FOREACH(sah, &flushq, chain) {
sah->state = SADB_SASTATE_DEAD;
/*
* The callout handler does its job using an RLOCK and
* drain queues. In case this function is called just
* before it acquires the WLOCK, we need to mark the SAs
* as unlinked to prevent a second unlink.
*/
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
sav->state = SADB_SASTATE_DEAD;
}
}
SAHTREE_WUNLOCK();
} else {
SAHTREE_WLOCK();
sah = TAILQ_FIRST(&V_sahtree);
while (sah != NULL) {
IPSEC_ASSERT(sah->state != SADB_SASTATE_DEAD,
("DEAD SAH %p in SADB_FLUSH", sah));
nextsah = TAILQ_NEXT(sah, chain);
if (sah->saidx.proto != proto) {
sah = nextsah;
continue;
}
sah->state = SADB_SASTATE_DEAD;
TAILQ_REMOVE(&V_sahtree, sah, chain);
LIST_REMOVE(sah, addrhash);
/* Unlink all SAs from SPI hash */
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
LIST_REMOVE(sav, spihash);
sav->state = SADB_SASTATE_DEAD;
}
/* Add SAH into flushq */
TAILQ_INSERT_HEAD(&flushq, sah, chain);
sah = nextsah;
}
SAHTREE_WUNLOCK();
}
key_freesah_flushed(&flushq);
/* Free all queued SAs and SAHs */
if (m->m_len < sizeof(struct sadb_msg) ||
sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg);
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
/*
* SADB_DUMP processing
* dump all entries in the SAD, including those in DEAD state.
* receive
* <base>
* from the ikmpd, and dump all secasvar leaves
* and send,
* <base> .....
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_dump(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
SAHTREE_RLOCK_TRACKER;
struct secashead *sah;
struct secasvar *sav;
- struct sadb_msg *newmsg;
struct mbuf *n;
uint32_t cnt;
uint8_t proto, satype;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* count sav entries to be sent to the userland. */
cnt = 0;
SAHTREE_RLOCK();
TAILQ_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC &&
proto != sah->saidx.proto)
continue;
TAILQ_FOREACH(sav, &sah->savtree_larval, chain)
cnt++;
TAILQ_FOREACH(sav, &sah->savtree_alive, chain)
cnt++;
}
if (cnt == 0) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOENT);
}
/* send this to the userland, one at a time. */
- newmsg = NULL;
TAILQ_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC &&
proto != sah->saidx.proto)
continue;
/* map proto to satype */
if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
SAHTREE_RUNLOCK();
ipseclog((LOG_DEBUG, "%s: there was invalid proto in "
"SAD.\n", __func__));
return key_senderror(so, m, EINVAL);
}
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
n = key_setdumpsa(sav, SADB_DUMP, satype,
--cnt, mhp->msg->sadb_msg_pid);
if (n == NULL) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOBUFS);
}
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
n = key_setdumpsa(sav, SADB_DUMP, satype,
--cnt, mhp->msg->sadb_msg_pid);
if (n == NULL) {
SAHTREE_RUNLOCK();
return key_senderror(so, m, ENOBUFS);
}
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
SAHTREE_RUNLOCK();
m_freem(m);
return (0);
}
/*
* SADB_X_PROMISC processing
*
* m will always be freed.
*/
static int
key_promisc(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp)
{
int olen;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
if (olen < sizeof(struct sadb_msg)) {
#if 1
return key_senderror(so, m, EINVAL);
#else
m_freem(m);
return 0;
#endif
} else if (olen == sizeof(struct sadb_msg)) {
/* enable/disable promisc mode */
struct keycb *kp;
if ((kp = (struct keycb *)sotorawcb(so)) == NULL)
return key_senderror(so, m, EINVAL);
mhp->msg->sadb_msg_errno = 0;
switch (mhp->msg->sadb_msg_satype) {
case 0:
case 1:
kp->kp_promisc = mhp->msg->sadb_msg_satype;
break;
default:
return key_senderror(so, m, EINVAL);
}
/* send the original message back to everyone */
mhp->msg->sadb_msg_errno = 0;
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
} else {
/* send packet as is */
m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg)));
/* TODO: if sadb_msg_seq is specified, send to specific pid */
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
}
static int (*key_typesw[])(struct socket *, struct mbuf *,
const struct sadb_msghdr *) = {
NULL, /* SADB_RESERVED */
key_getspi, /* SADB_GETSPI */
key_update, /* SADB_UPDATE */
key_add, /* SADB_ADD */
key_delete, /* SADB_DELETE */
key_get, /* SADB_GET */
key_acquire2, /* SADB_ACQUIRE */
key_register, /* SADB_REGISTER */
NULL, /* SADB_EXPIRE */
key_flush, /* SADB_FLUSH */
key_dump, /* SADB_DUMP */
key_promisc, /* SADB_X_PROMISC */
NULL, /* SADB_X_PCHANGE */
key_spdadd, /* SADB_X_SPDUPDATE */
key_spdadd, /* SADB_X_SPDADD */
key_spddelete, /* SADB_X_SPDDELETE */
key_spdget, /* SADB_X_SPDGET */
NULL, /* SADB_X_SPDACQUIRE */
key_spddump, /* SADB_X_SPDDUMP */
key_spdflush, /* SADB_X_SPDFLUSH */
key_spdadd, /* SADB_X_SPDSETIDX */
NULL, /* SADB_X_SPDEXPIRE */
key_spddelete2, /* SADB_X_SPDDELETE2 */
};
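The key_typesw table above maps each SADB message type to its handler; key_parse() later indexes it only after checking that the type is in range and that the slot is non-NULL. A minimal standalone sketch of that dispatch pattern (userland C, illustrative names only, not part of this file or of the diff):

/* Standalone sketch of the dispatch-table pattern used by key_parse(). */
#include <stddef.h>
#include <stdio.h>
#include <errno.h>

#define NITEMS(x)   (sizeof(x) / sizeof((x)[0]))

typedef int (*msg_handler_t)(const void *payload);

static int
handle_add(const void *payload)
{
    (void)payload;
    puts("ADD");
    return (0);
}

static int
handle_flush(const void *payload)
{
    (void)payload;
    puts("FLUSH");
    return (0);
}

/* Index is the message type; NULL means "type known but unhandled". */
static msg_handler_t handlers[] = {
    NULL,           /* 0: reserved */
    handle_add,     /* 1 */
    NULL,           /* 2: not implemented */
    handle_flush,   /* 3 */
};

static int
dispatch(unsigned int type, const void *payload)
{
    /* Same guard as key_parse(): reject unknown or unhandled types. */
    if (type >= NITEMS(handlers) || handlers[type] == NULL)
        return (EINVAL);
    return (handlers[type](payload));
}

int
main(void)
{
    printf("type 1 -> %d\n", dispatch(1, NULL));    /* 0 */
    printf("type 2 -> %d\n", dispatch(2, NULL));    /* EINVAL */
    printf("type 9 -> %d\n", dispatch(9, NULL));    /* EINVAL */
    return (0);
}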
/*
* Parse an sadb_msg buffer and process the PF_KEYv2 message,
* building response data when needed.  The message is handled
* directly as an mbuf.
* IN:
* m : received mbuf, pulled up to at least the base header.
* It is rewritten in place to form the response.
* so : pointer to socket.
* OUT:
* error code; any response is sent to the user process.
*/
int
key_parse(struct mbuf *m, struct socket *so)
{
struct sadb_msg *msg;
struct sadb_msghdr mh;
u_int orglen;
int error;
int target;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
if (m->m_len < sizeof(struct sadb_msg)) {
m = m_pullup(m, sizeof(struct sadb_msg));
if (!m)
return ENOBUFS;
}
msg = mtod(m, struct sadb_msg *);
orglen = PFKEY_UNUNIT64(msg->sadb_msg_len);
target = KEY_SENDUP_ONE;
if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != orglen) {
ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__));
PFKEYSTAT_INC(out_invlen);
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_version != PF_KEY_V2) {
ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n",
__func__, msg->sadb_msg_version));
PFKEYSTAT_INC(out_invver);
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_type > SADB_MAX) {
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invmsgtype);
error = EINVAL;
goto senderror;
}
/* for old-fashioned code - should be nuked */
if (m->m_pkthdr.len > MCLBYTES) {
m_freem(m);
return ENOBUFS;
}
if (m->m_next) {
struct mbuf *n;
MGETHDR(n, M_NOWAIT, MT_DATA);
if (n && m->m_pkthdr.len > MHLEN) {
if (!(MCLGET(n, M_NOWAIT))) {
m_free(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t));
n->m_pkthdr.len = n->m_len = m->m_pkthdr.len;
n->m_next = NULL;
m_freem(m);
m = n;
}
/* align the mbuf chain so that extensions are in contiguous region. */
error = key_align(m, &mh);
if (error)
return error;
msg = mh.msg;
/* We use satype as scope mask for spddump */
if (msg->sadb_msg_type == SADB_X_SPDDUMP) {
switch (msg->sadb_msg_satype) {
case IPSEC_POLICYSCOPE_ANY:
case IPSEC_POLICYSCOPE_GLOBAL:
case IPSEC_POLICYSCOPE_IFNET:
case IPSEC_POLICYSCOPE_PCB:
break;
default:
ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
} else {
switch (msg->sadb_msg_satype) { /* check SA type */
case SADB_SATYPE_UNSPEC:
switch (msg->sadb_msg_type) {
case SADB_GETSPI:
case SADB_UPDATE:
case SADB_ADD:
case SADB_DELETE:
case SADB_GET:
case SADB_ACQUIRE:
case SADB_EXPIRE:
ipseclog((LOG_DEBUG, "%s: must specify satype "
"when msg type=%u.\n", __func__,
msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_IPCOMP:
case SADB_X_SATYPE_TCPSIGNATURE:
switch (msg->sadb_msg_type) {
case SADB_X_SPDADD:
case SADB_X_SPDDELETE:
case SADB_X_SPDGET:
case SADB_X_SPDFLUSH:
case SADB_X_SPDSETIDX:
case SADB_X_SPDUPDATE:
case SADB_X_SPDDELETE2:
ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
__func__, msg->sadb_msg_type));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_RSVP:
case SADB_SATYPE_OSPFV2:
case SADB_SATYPE_RIPV2:
case SADB_SATYPE_MIP:
ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n",
__func__, msg->sadb_msg_satype));
PFKEYSTAT_INC(out_invsatype);
error = EOPNOTSUPP;
goto senderror;
case 1: /* XXX: What does it do? */
if (msg->sadb_msg_type == SADB_X_PROMISC)
break;
/*FALLTHROUGH*/
default:
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_satype));
PFKEYSTAT_INC(out_invsatype);
error = EINVAL;
goto senderror;
}
}
/* check field of upper layer protocol and address family */
if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL
&& mh.ext[SADB_EXT_ADDRESS_DST] != NULL) {
struct sadb_address *src0, *dst0;
u_int plen;
src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]);
/* check upper layer protocol */
if (src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: upper layer protocol "
"mismatched.\n", __func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
/* check family */
if (PFKEY_ADDR_SADDR(src0)->sa_family !=
PFKEY_ADDR_SADDR(dst0)->sa_family) {
ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
PFKEY_ADDR_SADDR(dst0)->sa_len) {
ipseclog((LOG_DEBUG, "%s: address struct size "
"mismatched.\n", __func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in)) {
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
break;
case AF_INET6:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in6)) {
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: unsupported address family\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EAFNOSUPPORT;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
plen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
plen = sizeof(struct in6_addr) << 3;
break;
default:
plen = 0; /*fool gcc*/
break;
}
/* check max prefix length */
if (src0->sadb_address_prefixlen > plen ||
dst0->sadb_address_prefixlen > plen) {
ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n",
__func__));
PFKEYSTAT_INC(out_invaddr);
error = EINVAL;
goto senderror;
}
/*
* prefixlen == 0 is valid: it simply matches all addresses.
*/
}
if (msg->sadb_msg_type >= nitems(key_typesw) ||
key_typesw[msg->sadb_msg_type] == NULL) {
PFKEYSTAT_INC(out_invmsgtype);
error = EINVAL;
goto senderror;
}
return (*key_typesw[msg->sadb_msg_type])(so, m, &mh);
senderror:
msg->sadb_msg_errno = error;
return key_sendup_mbuf(so, m, target);
}
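One of the checks above caps sadb_address_prefixlen at the address width in bits, derived as sizeof(struct in_addr) << 3 or sizeof(struct in6_addr) << 3. A standalone sketch of the same bound, compilable on its own (the helper name is made up for illustration):

/* Standalone sketch of the per-family prefix-length bound in key_parse(). */
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdio.h>

static int
check_prefixlen(int family, unsigned int prefixlen)
{
    unsigned int plen;

    switch (family) {
    case AF_INET:
        plen = sizeof(struct in_addr) << 3;     /* 32 bits */
        break;
    case AF_INET6:
        plen = sizeof(struct in6_addr) << 3;    /* 128 bits */
        break;
    default:
        return (EAFNOSUPPORT);
    }
    /* prefixlen == 0 is allowed: it matches every address. */
    if (prefixlen > plen)
        return (EINVAL);
    return (0);
}

int
main(void)
{
    printf("%d %d %d\n",
        check_prefixlen(AF_INET, 24),       /* 0 */
        check_prefixlen(AF_INET, 33),       /* EINVAL */
        check_prefixlen(AF_INET6, 128));    /* 0 */
    return (0);
}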
static int
key_senderror(struct socket *so, struct mbuf *m, int code)
{
struct sadb_msg *msg;
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
msg = mtod(m, struct sadb_msg *);
msg->sadb_msg_errno = code;
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
/*
* set a pointer to each extension header found in the message buffer.
* m will be freed on error.
* XXX larger-than-MCLBYTES extension?
*/
static int
key_align(struct mbuf *m, struct sadb_msghdr *mhp)
{
struct mbuf *n;
struct sadb_ext *ext;
size_t off, end;
int extlen;
int toff;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
/* initialize */
bzero(mhp, sizeof(*mhp));
mhp->msg = mtod(m, struct sadb_msg *);
mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */
end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
extlen = end; /*just in case extlen is not updated*/
for (off = sizeof(struct sadb_msg); off < end; off += extlen) {
n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
/* set pointer */
switch (ext->sadb_ext_type) {
case SADB_EXT_SA:
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_LIFETIME_CURRENT:
case SADB_EXT_LIFETIME_HARD:
case SADB_EXT_LIFETIME_SOFT:
case SADB_EXT_KEY_AUTH:
case SADB_EXT_KEY_ENCRYPT:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
case SADB_EXT_SENSITIVITY:
case SADB_EXT_PROPOSAL:
case SADB_EXT_SUPPORTED_AUTH:
case SADB_EXT_SUPPORTED_ENCRYPT:
case SADB_EXT_SPIRANGE:
case SADB_X_EXT_POLICY:
case SADB_X_EXT_SA2:
case SADB_X_EXT_NAT_T_TYPE:
case SADB_X_EXT_NAT_T_SPORT:
case SADB_X_EXT_NAT_T_DPORT:
case SADB_X_EXT_NAT_T_OAI:
case SADB_X_EXT_NAT_T_OAR:
case SADB_X_EXT_NAT_T_FRAG:
case SADB_X_EXT_SA_REPLAY:
case SADB_X_EXT_NEW_ADDRESS_SRC:
case SADB_X_EXT_NEW_ADDRESS_DST:
/* duplicate check */
/*
* XXX Can there be duplicate payloads of either
* KEY_AUTH or KEY_ENCRYPT?
*/
if (mhp->ext[ext->sadb_ext_type] != NULL) {
ipseclog((LOG_DEBUG, "%s: duplicate ext_type "
"%u\n", __func__, ext->sadb_ext_type));
m_freem(m);
PFKEYSTAT_INC(out_dupext);
return EINVAL;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n",
__func__, ext->sadb_ext_type));
m_freem(m);
PFKEYSTAT_INC(out_invexttype);
return EINVAL;
}
extlen = PFKEY_UNUNIT64(ext->sadb_ext_len);
if (key_validate_ext(ext, extlen)) {
m_freem(m);
PFKEYSTAT_INC(out_invlen);
return EINVAL;
}
n = m_pulldown(m, off, extlen, &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
mhp->ext[ext->sadb_ext_type] = ext;
mhp->extoff[ext->sadb_ext_type] = off;
mhp->extlen[ext->sadb_ext_type] = extlen;
}
if (off != end) {
m_freem(m);
PFKEYSTAT_INC(out_invlen);
return EINVAL;
}
return 0;
}
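key_align() walks the extensions that follow the base header: each extension carries its length in 8-byte units, an extension type seen twice is rejected, and the walk must land exactly on the advertised end. A standalone sketch of the same walk over a flat buffer, with a simplified two-field header standing in for struct sadb_ext (illustrative names only):

/* Standalone sketch of the extension walk performed by key_align(). */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#define UNUNIT64(x) ((size_t)(x) << 3)  /* 8-byte units -> bytes */
#define MAX_EXT     16

struct ext_hdr {        /* simplified stand-in for struct sadb_ext */
    uint16_t len;       /* length in 8-byte units, header included */
    uint16_t type;
};

static int
walk_exts(const uint8_t *buf, size_t buflen, const uint8_t *ext[MAX_EXT])
{
    struct ext_hdr eh;
    size_t off, extlen;

    memset(ext, 0, sizeof(const uint8_t *) * MAX_EXT);
    for (off = 0; off < buflen; off += extlen) {
        if (buflen - off < sizeof(eh))
            return (EINVAL);
        memcpy(&eh, buf + off, sizeof(eh));
        extlen = UNUNIT64(eh.len);
        if (extlen < sizeof(eh) || extlen > buflen - off)
            return (EINVAL);    /* truncated or overrunning extension */
        if (eh.type >= MAX_EXT || ext[eh.type] != NULL)
            return (EINVAL);    /* unknown or duplicate type */
        ext[eh.type] = buf + off;
    }
    /* Mirrors the "off != end" check in key_align(). */
    return (off == buflen ? 0 : EINVAL);
}

int
main(void)
{
    uint8_t buf[16] = { 0 };
    const uint8_t *ext[MAX_EXT];
    struct ext_hdr e1 = { 1, 1 }, e2 = { 1, 2 };    /* two 8-byte extensions */

    memcpy(buf, &e1, sizeof(e1));
    memcpy(buf + 8, &e2, sizeof(e2));
    printf("walk -> %d\n", walk_exts(buf, sizeof(buf), ext));   /* 0 */
    return (0);
}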
static int
key_validate_ext(const struct sadb_ext *ext, int len)
{
const struct sockaddr *sa;
enum { NONE, ADDR } checktype = NONE;
int baselen = 0;
const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len);
if (len != PFKEY_UNUNIT64(ext->sadb_ext_len))
return EINVAL;
/* if it does not match minimum/maximum length, bail */
if (ext->sadb_ext_type >= nitems(minsize) ||
ext->sadb_ext_type >= nitems(maxsize))
return EINVAL;
if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type])
return EINVAL;
if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type])
return EINVAL;
/* more checks based on sadb_ext_type XXX need more */
switch (ext->sadb_ext_type) {
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
case SADB_X_EXT_NAT_T_OAI:
case SADB_X_EXT_NAT_T_OAR:
case SADB_X_EXT_NEW_ADDRESS_SRC:
case SADB_X_EXT_NEW_ADDRESS_DST:
baselen = PFKEY_ALIGN8(sizeof(struct sadb_address));
checktype = ADDR;
break;
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
if (((const struct sadb_ident *)ext)->sadb_ident_type ==
SADB_X_IDENTTYPE_ADDR) {
baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident));
checktype = ADDR;
} else
checktype = NONE;
break;
default:
checktype = NONE;
break;
}
switch (checktype) {
case NONE:
break;
case ADDR:
sa = (const struct sockaddr *)(((const u_int8_t*)ext)+baselen);
if (len < baselen + sal)
return EINVAL;
if (baselen + PFKEY_ALIGN8(sa->sa_len) != len)
return EINVAL;
break;
}
return 0;
}
void
key_init(void)
{
int i;
for (i = 0; i < IPSEC_DIR_MAX; i++) {
TAILQ_INIT(&V_sptree[i]);
TAILQ_INIT(&V_sptree_ifnet[i]);
}
V_key_lft_zone = uma_zcreate("IPsec SA lft_c",
sizeof(uint64_t) * 2, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZONE_PCPU);
TAILQ_INIT(&V_sahtree);
V_sphashtbl = hashinit(SPHASH_NHASH, M_IPSEC_SP, &V_sphash_mask);
V_savhashtbl = hashinit(SAVHASH_NHASH, M_IPSEC_SA, &V_savhash_mask);
V_sahaddrhashtbl = hashinit(SAHHASH_NHASH, M_IPSEC_SAH,
&V_sahaddrhash_mask);
V_acqaddrhashtbl = hashinit(ACQHASH_NHASH, M_IPSEC_SAQ,
&V_acqaddrhash_mask);
V_acqseqhashtbl = hashinit(ACQHASH_NHASH, M_IPSEC_SAQ,
&V_acqseqhash_mask);
for (i = 0; i <= SADB_SATYPE_MAX; i++)
LIST_INIT(&V_regtree[i]);
LIST_INIT(&V_acqtree);
LIST_INIT(&V_spacqtree);
if (!IS_DEFAULT_VNET(curvnet))
return;
XFORMS_LOCK_INIT();
SPTREE_LOCK_INIT();
REGTREE_LOCK_INIT();
SAHTREE_LOCK_INIT();
ACQ_LOCK_INIT();
SPACQ_LOCK_INIT();
#ifndef IPSEC_DEBUG2
callout_init(&key_timer, 1);
callout_reset(&key_timer, hz, key_timehandler, NULL);
#endif /*IPSEC_DEBUG2*/
/* initialize key statistics */
keystat.getspi_count = 1;
if (bootverbose)
printf("IPsec: Initialized Security Association Processing.\n");
}
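key_init() sizes its lookup tables with hashinit(), which hands back a power-of-two bucket array and reports size - 1 through the mask pointer, so bucket selection is a single bitwise AND. A standalone sketch of that pattern with a toy table (not the kernel API, and hashinit's own rounding rules are not reproduced here):

/* Standalone sketch of the power-of-two hash table + mask pattern. */
#include <sys/types.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

struct toy_hashtbl {
    void    **buckets;
    u_long  mask;       /* nbuckets - 1, nbuckets a power of two */
};

static int
toy_hashinit(struct toy_hashtbl *t, u_long nelements)
{
    u_long n;

    /* Pick a power-of-two bucket count (hashinit(9) does its own rounding). */
    for (n = 1; n < nelements; n <<= 1)
        ;
    t->buckets = calloc(n, sizeof(*t->buckets));
    if (t->buckets == NULL)
        return (-1);
    t->mask = n - 1;
    return (0);
}

static u_long
toy_bucket(const struct toy_hashtbl *t, uint32_t hashval)
{
    /* Because the size is a power of two, "hash & mask" replaces "%". */
    return (hashval & t->mask);
}

int
main(void)
{
    struct toy_hashtbl t;

    if (toy_hashinit(&t, 100) != 0)
        return (1);
    printf("mask=%lu bucket(12345)=%lu\n", t.mask, toy_bucket(&t, 12345));
    free(t.buckets);
    return (0);
}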
#ifdef VIMAGE
void
key_destroy(void)
{
struct secashead_queue sahdrainq;
struct secpolicy_queue drainq;
struct secpolicy *sp, *nextsp;
struct secacq *acq, *nextacq;
struct secspacq *spacq, *nextspacq;
struct secashead *sah;
struct secasvar *sav;
struct secreg *reg;
int i;
/*
* XXX: can we just call free() on each object instead of
* walking the lists the safe way and releasing references?
*/
TAILQ_INIT(&drainq);
SPTREE_WLOCK();
for (i = 0; i < IPSEC_DIR_MAX; i++) {
TAILQ_CONCAT(&drainq, &V_sptree[i], chain);
TAILQ_CONCAT(&drainq, &V_sptree_ifnet[i], chain);
}
for (i = 0; i < V_sphash_mask + 1; i++)
LIST_INIT(&V_sphashtbl[i]);
SPTREE_WUNLOCK();
sp = TAILQ_FIRST(&drainq);
while (sp != NULL) {
nextsp = TAILQ_NEXT(sp, chain);
key_freesp(&sp);
sp = nextsp;
}
TAILQ_INIT(&sahdrainq);
SAHTREE_WLOCK();
TAILQ_CONCAT(&sahdrainq, &V_sahtree, chain);
for (i = 0; i < V_savhash_mask + 1; i++)
LIST_INIT(&V_savhashtbl[i]);
for (i = 0; i < V_sahaddrhash_mask + 1; i++)
LIST_INIT(&V_sahaddrhashtbl[i]);
TAILQ_FOREACH(sah, &sahdrainq, chain) {
sah->state = SADB_SASTATE_DEAD;
TAILQ_FOREACH(sav, &sah->savtree_larval, chain) {
sav->state = SADB_SASTATE_DEAD;
}
TAILQ_FOREACH(sav, &sah->savtree_alive, chain) {
sav->state = SADB_SASTATE_DEAD;
}
}
SAHTREE_WUNLOCK();
key_freesah_flushed(&sahdrainq);
hashdestroy(V_sphashtbl, M_IPSEC_SP, V_sphash_mask);
hashdestroy(V_savhashtbl, M_IPSEC_SA, V_savhash_mask);
hashdestroy(V_sahaddrhashtbl, M_IPSEC_SAH, V_sahaddrhash_mask);
REGTREE_LOCK();
for (i = 0; i <= SADB_SATYPE_MAX; i++) {
LIST_FOREACH(reg, &V_regtree[i], chain) {
if (__LIST_CHAINED(reg)) {
LIST_REMOVE(reg, chain);
free(reg, M_IPSEC_SAR);
break;
}
}
}
REGTREE_UNLOCK();
ACQ_LOCK();
acq = LIST_FIRST(&V_acqtree);
while (acq != NULL) {
nextacq = LIST_NEXT(acq, chain);
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
acq = nextacq;
}
for (i = 0; i < V_acqaddrhash_mask + 1; i++)
LIST_INIT(&V_acqaddrhashtbl[i]);
for (i = 0; i < V_acqseqhash_mask + 1; i++)
LIST_INIT(&V_acqseqhashtbl[i]);
ACQ_UNLOCK();
SPACQ_LOCK();
for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL;
spacq = nextspacq) {
nextspacq = LIST_NEXT(spacq, chain);
if (__LIST_CHAINED(spacq)) {
LIST_REMOVE(spacq, chain);
free(spacq, M_IPSEC_SAQ);
}
}
SPACQ_UNLOCK();
hashdestroy(V_acqaddrhashtbl, M_IPSEC_SAQ, V_acqaddrhash_mask);
hashdestroy(V_acqseqhashtbl, M_IPSEC_SAQ, V_acqseqhash_mask);
uma_zdestroy(V_key_lft_zone);
if (!IS_DEFAULT_VNET(curvnet))
return;
#ifndef IPSEC_DEBUG2
callout_drain(&key_timer);
#endif
XFORMS_LOCK_DESTROY();
SPTREE_LOCK_DESTROY();
REGTREE_LOCK_DESTROY();
SAHTREE_LOCK_DESTROY();
ACQ_LOCK_DESTROY();
SPACQ_LOCK_DESTROY();
}
#endif
/* record data transfer on SA, and update timestamps */
void
key_sa_recordxfer(struct secasvar *sav, struct mbuf *m)
{
IPSEC_ASSERT(sav != NULL, ("Null secasvar"));
IPSEC_ASSERT(m != NULL, ("Null mbuf"));
/*
* XXX Currently the accounted byte count differs between
* inbound and outbound processing.
*/
counter_u64_add(sav->lft_c_bytes, m->m_pkthdr.len);
/*
* We use the number of packets as the unit of
* allocations. We increment the variable
* whenever {esp,ah}_{in,out}put is called.
*/
counter_u64_add(sav->lft_c_allocations, 1);
/*
* NOTE: We record CURRENT usetime by using wall clock,
* in seconds. HARD and SOFT lifetime are measured by the time
* difference (again in seconds) from usetime.
*
* usetime
* v expire expire
* -----+-----+--------+---> t
* <--------------> HARD
* <-----> SOFT
*/
if (sav->firstused == 0)
sav->firstused = time_second;
}
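The comment above describes how firstused anchors the use-based lifetimes: SOFT and HARD are offsets, in seconds, from the moment the SA is first used. A standalone sketch of that bookkeeping with illustrative field names (not the kernel structures):

/* Standalone sketch of use-lifetime accounting anchored at first use. */
#include <stdint.h>
#include <time.h>
#include <stdio.h>

struct toy_sa {
    time_t   firstused;     /* wall-clock second of first use, 0 = never */
    uint64_t soft_usetime;  /* seconds after firstused until SOFT expire */
    uint64_t hard_usetime;  /* seconds after firstused until HARD expire */
};

enum sa_state { SA_FRESH, SA_SOFT_EXPIRED, SA_HARD_EXPIRED };

static void
toy_record_use(struct toy_sa *sa, time_t now)
{
    if (sa->firstused == 0)
        sa->firstused = now;    /* mirrors "sav->firstused = time_second" */
}

static enum sa_state
toy_check_lifetime(const struct toy_sa *sa, time_t now)
{
    if (sa->firstused == 0)
        return (SA_FRESH);
    if (sa->hard_usetime != 0 &&
        (uint64_t)(now - sa->firstused) >= sa->hard_usetime)
        return (SA_HARD_EXPIRED);
    if (sa->soft_usetime != 0 &&
        (uint64_t)(now - sa->firstused) >= sa->soft_usetime)
        return (SA_SOFT_EXPIRED);
    return (SA_FRESH);
}

int
main(void)
{
    struct toy_sa sa = { .soft_usetime = 60, .hard_usetime = 120 };

    toy_record_use(&sa, 1000);
    printf("%d %d %d\n",
        toy_check_lifetime(&sa, 1030),      /* SA_FRESH */
        toy_check_lifetime(&sa, 1070),      /* SA_SOFT_EXPIRED */
        toy_check_lifetime(&sa, 1200));     /* SA_HARD_EXPIRED */
    return (0);
}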
/*
* Take one of the kernel's security keys and convert it into a PF_KEY
* structure within an mbuf, suitable for sending up to a waiting
* application in user land.
*
* IN:
* src: A pointer to a kernel security key.
* exttype: Which type of key this is. Refer to the PF_KEY data structures.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setkey(struct seckey *src, uint16_t exttype)
{
struct mbuf *m;
struct sadb_key *p;
int len;
if (src == NULL)
return NULL;
len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src));
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return NULL;
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_key *);
bzero(p, len);
p->sadb_key_len = PFKEY_UNIT64(len);
p->sadb_key_exttype = exttype;
p->sadb_key_bits = src->bits;
bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src));
return m;
}
/*
* Take one of the kernel's lifetime data structures and convert it
* into a PF_KEY structure within an mbuf, suitable for sending up to
* a waiting application in user land.
*
* IN:
* src: A pointer to a kernel lifetime structure.
* exttype: Which type of lifetime this is. Refer to the PF_KEY
* data structures for more information.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setlifetime(struct seclifetime *src, uint16_t exttype)
{
struct mbuf *m = NULL;
struct sadb_lifetime *p;
int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime));
if (src == NULL)
return NULL;
m = m_get2(len, M_NOWAIT, MT_DATA, 0);
if (m == NULL)
return m;
m_align(m, len);
m->m_len = len;
p = mtod(m, struct sadb_lifetime *);
bzero(p, len);
p->sadb_lifetime_len = PFKEY_UNIT64(len);
p->sadb_lifetime_exttype = exttype;
p->sadb_lifetime_allocations = src->allocations;
p->sadb_lifetime_bytes = src->bytes;
p->sadb_lifetime_addtime = src->addtime;
p->sadb_lifetime_usetime = src->usetime;
return m;
}
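key_setkey() and key_setlifetime() follow the same recipe: round the extension size up to an 8-byte boundary, record the length in 8-byte units, zero the buffer (padding included), then fill in the fields. A standalone sketch of that encoding; the macros below only mimic what PFKEY_ALIGN8/PFKEY_UNIT64 are used for here, and the struct is a simplified stand-in:

/* Standalone sketch of PF_KEY extension length/alignment encoding. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define ALIGN8(a)   (((a) + 7) & ~(size_t)7)    /* round up to 8 bytes */
#define UNIT64(a)   ((a) >> 3)                  /* bytes -> 8-byte units */

struct toy_key_ext {        /* simplified stand-in for struct sadb_key */
    uint16_t len;           /* total length in 8-byte units */
    uint16_t exttype;
    uint16_t bits;          /* key length in bits */
    uint16_t reserved;
    /* key bytes follow, padded out to an 8-byte boundary */
};

static size_t
build_key_ext(uint8_t *buf, size_t buflen, uint16_t exttype,
    const uint8_t *key, size_t keybytes)
{
    size_t len = ALIGN8(sizeof(struct toy_key_ext) + keybytes);
    struct toy_key_ext ext;

    if (len > buflen)
        return (0);
    memset(buf, 0, len);            /* zeroes the trailing padding too */
    ext.len = (uint16_t)UNIT64(len);
    ext.exttype = exttype;
    ext.bits = (uint16_t)(keybytes * 8);
    ext.reserved = 0;
    memcpy(buf, &ext, sizeof(ext));
    memcpy(buf + sizeof(ext), key, keybytes);
    return (len);
}

int
main(void)
{
    uint8_t key[20] = { 0 }, buf[64];
    struct toy_key_ext hdr;
    size_t len;

    /* 8-byte header + 20 key bytes = 28, rounded up to 32 bytes = 4 units. */
    len = build_key_ext(buf, sizeof(buf), 8 /* illustrative exttype */, key,
        sizeof(key));
    memcpy(&hdr, buf, sizeof(hdr));
    printf("len=%zu units=%u bits=%u\n", len, hdr.len, hdr.bits);
    return (0);
}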
const struct enc_xform *
enc_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_ealgs); i++)
if (alg == supported_ealgs[i].sadb_alg)
return (supported_ealgs[i].xform);
return (NULL);
}
const struct auth_hash *
auth_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_aalgs); i++)
if (alg == supported_aalgs[i].sadb_alg)
return (supported_aalgs[i].xform);
return (NULL);
}
const struct comp_algo *
comp_algorithm_lookup(int alg)
{
int i;
for (i = 0; i < nitems(supported_calgs); i++)
if (alg == supported_calgs[i].sadb_alg)
return (supported_calgs[i].xform);
return (NULL);
}
/*
* Register a transform.
*/
static int
xform_register(struct xformsw* xsp)
{
struct xformsw *entry;
XFORMS_LOCK();
LIST_FOREACH(entry, &xforms, chain) {
if (entry->xf_type == xsp->xf_type) {
XFORMS_UNLOCK();
return (EEXIST);
}
}
LIST_INSERT_HEAD(&xforms, xsp, chain);
XFORMS_UNLOCK();
return (0);
}
void
xform_attach(void *data)
{
struct xformsw *xsp = (struct xformsw *)data;
if (xform_register(xsp) != 0)
printf("%s: failed to register %s xform\n", __func__,
xsp->xf_name);
}
void
xform_detach(void *data)
{
struct xformsw *xsp = (struct xformsw *)data;
XFORMS_LOCK();
LIST_REMOVE(xsp, chain);
XFORMS_UNLOCK();
/* Delete all SAs related to this xform. */
key_delete_xform(xsp);
}
/*
* Initialize transform support in an sav.
*/
static int
xform_init(struct secasvar *sav, u_short xftype)
{
struct xformsw *entry;
int ret;
IPSEC_ASSERT(sav->tdb_xform == NULL,
("tdb_xform is already initialized"));
ret = EINVAL;
XFORMS_LOCK();
LIST_FOREACH(entry, &xforms, chain) {
if (entry->xf_type == xftype) {
ret = (*entry->xf_init)(sav, entry);
break;
}
}
XFORMS_UNLOCK();
return (ret);
}
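xform_register(), xform_attach(), and xform_init() form a small registry: handlers live on a linked list keyed by xf_type, a duplicate key fails with EEXIST, and initialization walks the list for a match. A standalone sketch of the same pattern with <sys/queue.h> (locking omitted; the names are illustrative, not the kernel's):

/* Standalone sketch of the register/lookup pattern used for xforms. */
#include <sys/queue.h>
#include <errno.h>
#include <stdio.h>

struct toy_xform {
    int type;
    const char *name;
    LIST_ENTRY(toy_xform) chain;
};

static LIST_HEAD(, toy_xform) toy_xforms = LIST_HEAD_INITIALIZER(toy_xforms);

static int
toy_register(struct toy_xform *x)
{
    struct toy_xform *e;

    LIST_FOREACH(e, &toy_xforms, chain)
        if (e->type == x->type)
            return (EEXIST);        /* already registered */
    LIST_INSERT_HEAD(&toy_xforms, x, chain);
    return (0);
}

static struct toy_xform *
toy_lookup(int type)
{
    struct toy_xform *e;

    LIST_FOREACH(e, &toy_xforms, chain)
        if (e->type == type)
            return (e);
    return (NULL);
}

int
main(void)
{
    static struct toy_xform ah = { .type = 2, .name = "toy AH" };
    static struct toy_xform esp = { .type = 3, .name = "toy ESP" };

    printf("%d %d %d\n", toy_register(&ah), toy_register(&esp),
        toy_register(&ah));                 /* 0 0 EEXIST */
    printf("%s\n", toy_lookup(3)->name);    /* toy ESP */
    return (0);
}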
Index: head/sys/netipsec/xform_ah.c
===================================================================
--- head/sys/netipsec/xform_ah.c (revision 327172)
+++ head/sys/netipsec/xform_ah.c (revision 327173)
@@ -1,1154 +1,1149 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist.
*
* Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 1999 Niklas Hallqvist.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
/*
* Return header size in bytes. The old protocol did not support
* the replay counter; the new protocol always includes the counter.
*/
#define HDRSIZE(sav) \
(((sav)->flags & SADB_X_EXT_OLD) ? \
sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t))
/*
* Return authenticator size in bytes, based on a field in the
* algorithm descriptor.
*/
#define AUTHSIZE(sav) ((sav->flags & SADB_X_EXT_OLD) ? 16 : \
xform_ah_authsize((sav)->tdb_authalgxform))
VNET_DEFINE(int, ah_enable) = 1; /* control flow of packets with AH */
VNET_DEFINE(int, ah_cleartos) = 1; /* clear ip_tos when doing AH calc */
VNET_PCPUSTAT_DEFINE(struct ahstat, ahstat);
VNET_PCPUSTAT_SYSINIT(ahstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(ahstat);
#endif /* VIMAGE */
#ifdef INET
SYSCTL_DECL(_net_inet_ah);
SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_enable), 0, "");
SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_cleartos,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_ah, IPSECCTL_STATS, stats, struct ahstat,
ahstat, "AH statistics (struct ahstat, netipsec/ah_var.h)");
#endif
static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */
static int ah_input_cb(struct cryptop*);
static int ah_output_cb(struct cryptop*);
int
xform_ah_authsize(const struct auth_hash *esph)
{
int alen;
if (esph == NULL)
return 0;
switch (esph->type) {
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
alen = esph->hashsize / 2; /* RFC4868 2.3 */
break;
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
alen = esph->hashsize;
break;
default:
alen = AH_HMAC_HASHLEN;
break;
}
return alen;
}
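xform_ah_authsize() encodes the ICV sizing rules: the HMAC-SHA2 family truncates to half the hash output per RFC 4868, AES-GMAC keeps its full 16-byte tag, and everything else falls back to the traditional 96-bit truncation. A standalone sketch of the same mapping with the byte counts spelled out (illustrative names, not the kernel enum):

/* Standalone sketch of AH ICV (authenticator) sizing. */
#include <stdio.h>

enum toy_auth {
    TOY_HMAC_SHA1,      /* legacy 96-bit truncation */
    TOY_HMAC_SHA2_256,  /* RFC 4868: half of 32-byte output */
    TOY_HMAC_SHA2_384,  /* RFC 4868: half of 48-byte output */
    TOY_HMAC_SHA2_512,  /* RFC 4868: half of 64-byte output */
    TOY_AES_GMAC,       /* full 16-byte tag */
};

static int
toy_icv_len(enum toy_auth alg)
{
    switch (alg) {
    case TOY_HMAC_SHA2_256: return (32 / 2);    /* 16 bytes */
    case TOY_HMAC_SHA2_384: return (48 / 2);    /* 24 bytes */
    case TOY_HMAC_SHA2_512: return (64 / 2);    /* 32 bytes */
    case TOY_AES_GMAC:      return (16);
    default:                return (12);        /* 96 bits */
    }
}

int
main(void)
{
    printf("sha2-256 icv=%d bytes, sha1 icv=%d bytes\n",
        toy_icv_len(TOY_HMAC_SHA2_256), toy_icv_len(TOY_HMAC_SHA1));
    return (0);
}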
size_t
ah_hdrsiz(struct secasvar *sav)
{
size_t size;
if (sav != NULL) {
int authsize;
IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform"));
/*XXX not right for null algorithm--does it matter??*/
authsize = AUTHSIZE(sav);
size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav);
} else {
/* default guess */
size = sizeof (struct ah) + sizeof (u_int32_t) + 16;
}
return size;
}
/*
* NB: public for use by esp_init.
*/
int
ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria)
{
const struct auth_hash *thash;
int keylen;
thash = auth_algorithm_lookup(sav->alg_auth);
if (thash == NULL) {
DPRINTF(("%s: unsupported authentication algorithm %u\n",
__func__, sav->alg_auth));
return EINVAL;
}
/*
* Verify the replay state block allocation is consistent with
* the protocol type. We check here so we can make assumptions
* later during protocol processing.
*/
/* NB: replay state is setup elsewhere (sigh) */
if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) {
DPRINTF(("%s: replay state block inconsistency, "
"%s algorithm %s replay state\n", __func__,
(sav->flags & SADB_X_EXT_OLD) ? "old" : "new",
sav->replay == NULL ? "without" : "with"));
return EINVAL;
}
if (sav->key_auth == NULL) {
DPRINTF(("%s: no authentication key for %s algorithm\n",
__func__, thash->name));
return EINVAL;
}
keylen = _KEYLEN(sav->key_auth);
if (keylen > thash->keysize && thash->keysize != 0) {
DPRINTF(("%s: invalid keylength %d, algorithm %s requires "
"keysize less than %d\n", __func__,
keylen, thash->name, thash->keysize));
return EINVAL;
}
sav->tdb_xform = xsp;
sav->tdb_authalgxform = thash;
/* Initialize crypto session. */
bzero(cria, sizeof (*cria));
cria->cri_alg = sav->tdb_authalgxform->type;
cria->cri_klen = _KEYBITS(sav->key_auth);
cria->cri_key = sav->key_auth->key_data;
cria->cri_mlen = AUTHSIZE(sav);
return 0;
}
/*
* ah_init() is called when an SPI is being set up.
*/
static int
ah_init(struct secasvar *sav, struct xformsw *xsp)
{
struct cryptoini cria;
int error;
error = ah_init0(sav, xsp, &cria);
return error ? error :
crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support);
}
/*
* Paranoia.
*
* NB: public for use by esp_zeroize (XXX).
*/
int
ah_zeroize(struct secasvar *sav)
{
int err;
if (sav->key_auth)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
sav->tdb_authalgxform = NULL;
sav->tdb_xform = NULL;
return err;
}
/*
* Massage IPv4/IPv6 headers for AH processing.
*/
static int
ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out)
{
struct mbuf *m = *m0;
unsigned char *ptr;
int off, count;
#ifdef INET
struct ip *ip;
#endif /* INET */
#ifdef INET6
struct ip6_ext *ip6e;
struct ip6_hdr ip6;
int alloc, len, ad;
#endif /* INET6 */
switch (proto) {
#ifdef INET
case AF_INET:
/*
* This is the least painful way of dealing with IPv4 header
* and option processing -- just make sure they're in
* contiguous memory.
*/
*m0 = m = m_pullup(m, skip);
if (m == NULL) {
DPRINTF(("%s: m_pullup failed\n", __func__));
return ENOBUFS;
}
/* Fix the IP header */
ip = mtod(m, struct ip *);
if (V_ah_cleartos)
ip->ip_tos = 0;
ip->ip_ttl = 0;
ip->ip_sum = 0;
if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK)
ip->ip_off &= htons(IP_DF);
else
ip->ip_off = htons(0);
ptr = mtod(m, unsigned char *) + sizeof(struct ip);
/* IPv4 option processing */
for (off = sizeof(struct ip); off < skip;) {
if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP ||
off + 1 < skip)
;
else {
DPRINTF(("%s: illegal IPv4 option length for "
"option %d\n", __func__, ptr[off]));
m_freem(m);
return EINVAL;
}
switch (ptr[off]) {
case IPOPT_EOL:
off = skip; /* End the loop. */
break;
case IPOPT_NOP:
off++;
break;
case IPOPT_SECURITY: /* 0x82 */
case 0x85: /* Extended security. */
case 0x86: /* Commercial security. */
case 0x94: /* Router alert */
case 0x95: /* RFC1770 */
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
off += ptr[off + 1];
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/*
* On output, if we have either of the
* source routing options, we should
* swap the destination address of the
* IP header with the last address
* specified in the option, as that is
* what the destination's IP header
* will look like.
*/
if (out)
bcopy(ptr + off + ptr[off + 1] -
sizeof(struct in_addr),
&(ip->ip_dst), sizeof(struct in_addr));
/* Fall through */
default:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/* Zeroize all other options. */
count = ptr[off + 1];
bcopy(ipseczeroes, ptr, count);
off += count;
break;
}
/* Sanity check. */
if (off > skip) {
DPRINTF(("%s: malformed IPv4 options header\n",
__func__));
m_freem(m);
return EINVAL;
}
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6: /* Ugly... */
/* Copy and "cook" the IPv6 header. */
m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6);
/* We don't do IPv6 Jumbograms. */
if (ip6.ip6_plen == 0) {
DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__));
m_freem(m);
return EMSGSIZE;
}
ip6.ip6_flow = 0;
ip6.ip6_hlim = 0;
ip6.ip6_vfc &= ~IPV6_VERSION_MASK;
ip6.ip6_vfc |= IPV6_VERSION;
/* Scoped address handling. */
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src))
ip6.ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst))
ip6.ip6_dst.s6_addr16[1] = 0;
/* Done with IPv6 header. */
m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6);
/* Let's deal with the remaining headers (if any). */
if (skip - sizeof(struct ip6_hdr) > 0) {
if (m->m_len <= skip) {
ptr = (unsigned char *) malloc(
skip - sizeof(struct ip6_hdr),
M_XDATA, M_NOWAIT);
if (ptr == NULL) {
DPRINTF(("%s: failed to allocate memory"
"for IPv6 headers\n",__func__));
m_freem(m);
return ENOBUFS;
}
/*
* Copy all the protocol headers after
* the IPv6 header.
*/
m_copydata(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
alloc = 1;
} else {
/* No need to allocate memory. */
ptr = mtod(m, unsigned char *) +
sizeof(struct ip6_hdr);
alloc = 0;
}
} else
break;
off = ip6.ip6_nxt & 0xff; /* Next header type. */
for (len = 0; len < skip - sizeof(struct ip6_hdr);)
switch (off) {
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
ip6e = (struct ip6_ext *) (ptr + len);
/*
* Process the mutable/immutable
* options -- borrows heavily from the
* KAME code.
*/
for (count = len + sizeof(struct ip6_ext);
count < len + ((ip6e->ip6e_len + 1) << 3);) {
if (ptr[count] == IP6OPT_PAD1) {
count++;
continue; /* Skip padding. */
}
/* Sanity check. */
if (count > len +
((ip6e->ip6e_len + 1) << 3)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
free(ptr, M_XDATA);
return EINVAL;
}
ad = ptr[count + 1];
/* If mutable option, zeroize. */
if (ptr[count] & IP6OPT_MUTABLE)
bcopy(ipseczeroes, ptr + count,
ptr[count + 1]);
count += ad;
/* Sanity check. */
if (count >
skip - sizeof(struct ip6_hdr)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
free(ptr, M_XDATA);
return EINVAL;
}
}
/* Advance. */
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
case IPPROTO_ROUTING:
/*
* Always include routing headers in
* computation.
*/
ip6e = (struct ip6_ext *) (ptr + len);
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
default:
DPRINTF(("%s: unexpected IPv6 header type %d",
__func__, off));
if (alloc)
free(ptr, M_XDATA);
m_freem(m);
return EINVAL;
}
/* Copyback and free, if we allocated. */
if (alloc) {
m_copyback(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
free(ptr, M_XDATA);
}
break;
#endif /* INET6 */
}
return 0;
}
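Mutable IP header fields must not be covered by the AH ICV, which is why ah_massage_headers() zeroes TTL, checksum, (optionally) TOS, and the fragment bits before the packet is handed to the crypto layer; RFC 4302 defines which IPv4 fields count as mutable. A standalone sketch of the IPv4 part, operating on a detached header copy (illustrative helper, not the kernel routine):

/* Standalone sketch: zero mutable IPv4 fields before computing an AH ICV. */
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <string.h>
#include <stdio.h>

static void
toy_prepare_ipv4_for_icv(struct ip *ip, int clear_tos)
{
    if (clear_tos)
        ip->ip_tos = 0;     /* DSCP/ECN are mutable */
    ip->ip_ttl = 0;         /* decremented per hop */
    ip->ip_sum = 0;         /* recomputed per hop */
    /* Flags and fragment offset are mutable (legacy KPDK modes keep DF). */
    ip->ip_off = htons(0);
}

int
main(void)
{
    struct ip ip;

    memset(&ip, 0xff, sizeof(ip));
    toy_prepare_ipv4_for_icv(&ip, 1);
    printf("tos=%u ttl=%u sum=%u off=0x%04x\n",
        ip.ip_tos, ip.ip_ttl, ip.ip_sum, (unsigned)ntohs(ip.ip_off));
    return (0);
}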
/*
* ah_input() gets called to verify that an input packet
* passes authentication.
*/
static int
ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
const struct auth_hash *ahx;
struct cryptodesc *crda;
struct cryptop *crp;
struct xform_data *xd;
struct newah *ah;
uint64_t cryptoid;
int hl, rplen, authsize, error;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key"));
IPSEC_ASSERT(sav->tdb_authalgxform != NULL,
("null authentication xform"));
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen);
if (ah == NULL) {
DPRINTF(("ah_input: cannot pullup header\n"));
AHSTAT_INC(ahs_hdrops); /*XXX*/
error = ENOBUFS;
goto bad;
}
/* Check replay window, if applicable. */
SECASVAR_LOCK(sav);
if (sav->replay != NULL && sav->replay->wsize != 0 &&
ipsec_chkreplay(ntohl(ah->ah_seq), sav) == 0) {
SECASVAR_UNLOCK(sav);
AHSTAT_INC(ahs_replay);
DPRINTF(("%s: packet replay failure: %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
error = EACCES;
goto bad;
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Verify AH header length. */
hl = ah->ah_len * sizeof (u_int32_t);
ahx = sav->tdb_authalgxform;
authsize = AUTHSIZE(sav);
if (hl != authsize + rplen - sizeof (struct ah)) {
DPRINTF(("%s: bad authenticator length %u (expecting %lu)"
" for packet in SA %s/%08lx\n", __func__, hl,
(u_long) (authsize + rplen - sizeof (struct ah)),
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_badauthl);
error = EACCES;
goto bad;
}
AHSTAT_ADD(ahs_ibytes, m->m_pkthdr.len - skip - hl);
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptor\n",
__func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null crypto descriptor"));
crda->crd_skip = 0;
crda->crd_len = m->m_pkthdr.len;
crda->crd_inject = skip + rplen;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_klen = _KEYBITS(sav->key_auth);
crda->crd_key = sav->key_auth->key_data;
/* Allocate IPsec-specific opaque crypto info. */
xd = malloc(sizeof(*xd) + skip + rplen + authsize, M_XDATA,
M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
AHSTAT_INC(ahs_crypto);
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
/*
* Save the authenticator, the skipped portion of the packet,
* and the AH header.
*/
m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(xd + 1));
/* Zeroize the authenticator on the packet. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 0);
if (error != 0) {
/* NB: mbuf is free'd by ah_massage_headers */
AHSTAT_INC(ahs_hdrops);
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
return (error);
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_input_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback. */
xd->sav = sav;
xd->nxt = ah->ah_nxt;
xd->protoff = protoff;
xd->skip = skip;
xd->cryptoid = cryptoid;
return (crypto_dispatch(crp));
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
/*
* AH input callback from the crypto driver.
*/
static int
ah_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
unsigned char calc[AH_ALEN_MAX];
- const struct auth_hash *ahx;
struct mbuf *m;
- struct cryptodesc *crd;
struct xform_data *xd;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t ptr;
uint64_t cryptoid;
int authsize, rplen, error, skip, protoff;
uint8_t nxt;
- crd = crp->crp_desc;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
nxt = xd->nxt;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
-
- ahx = sav->tdb_authalgxform;
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
AHSTAT_INC(ahs_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
} else {
AHSTAT_INC(ahs_hist[sav->alg_auth]);
crypto_freereq(crp); /* No longer needed. */
crp = NULL;
}
/* Shouldn't happen... */
if (m == NULL) {
AHSTAT_INC(ahs_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
/* Figure out header size. */
rplen = HDRSIZE(sav);
authsize = AUTHSIZE(sav);
/* Copy authenticator off the packet. */
m_copydata(m, skip + rplen, authsize, calc);
/* Verify authenticator. */
ptr = (caddr_t) (xd + 1);
if (timingsafe_bcmp(ptr + skip + rplen, calc, authsize)) {
DPRINTF(("%s: authentication hash mismatch for packet "
"in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_badauth);
error = EACCES;
goto bad;
}
/* Fix the Next Protocol field. */
((uint8_t *) ptr)[protoff] = nxt;
/* Copyback the saved (uncooked) network headers. */
m_copyback(m, 0, skip, ptr);
free(xd, M_XDATA), xd = NULL; /* No longer needed */
/*
* Header is now authenticated.
*/
m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newah, ah_seq),
sizeof (seq), (caddr_t) &seq);
SECASVAR_LOCK(sav);
if (ipsec_updatereplay(ntohl(seq), sav)) {
SECASVAR_UNLOCK(sav);
AHSTAT_INC(ahs_replay);
error = EACCES;
goto bad;
}
SECASVAR_UNLOCK(sav);
}
/*
* Remove the AH header and authenticator from the mbuf.
*/
error = m_striphdr(m, skip, rplen + authsize);
if (error) {
DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops);
goto bad;
}
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
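ah_input_cb() compares the received authenticator against the recomputed one with timingsafe_bcmp() so the comparison time does not reveal how many leading bytes matched. A standalone sketch of the usual XOR-accumulate idiom behind such a comparison (not the libc implementation):

/* Standalone sketch of a constant-time authenticator comparison. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int
toy_timingsafe_bcmp(const void *a, const void *b, size_t len)
{
    const uint8_t *pa = a, *pb = b;
    uint8_t diff = 0;
    size_t i;

    /* Touch every byte and accumulate differences without branching. */
    for (i = 0; i < len; i++)
        diff |= pa[i] ^ pb[i];
    return (diff != 0);
}

int
main(void)
{
    uint8_t icv1[12] = { 1, 2, 3 }, icv2[12] = { 1, 2, 3 }, icv3[12] = { 9 };

    printf("%d %d\n",
        toy_timingsafe_bcmp(icv1, icv2, sizeof(icv1)),  /* 0: equal */
        toy_timingsafe_bcmp(icv1, icv3, sizeof(icv1))); /* 1: differ */
    return (0);
}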
/*
* AH output routine, called by ipsec[46]_perform_request().
*/
static int
ah_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
const struct auth_hash *ahx;
struct cryptodesc *crda;
struct xform_data *xd;
struct mbuf *mi;
struct cryptop *crp;
struct newah *ah;
uint64_t cryptoid;
uint16_t iplen;
int error, rplen, authsize, maxpacketsize, roff;
uint8_t prot;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ahx = sav->tdb_authalgxform;
IPSEC_ASSERT(ahx != NULL, ("null authentication xform"));
AHSTAT_INC(ahs_output);
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol family %u, "
"SA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_nopf);
error = EPFNOSUPPORT;
goto bad;
}
authsize = AUTHSIZE(sav);
if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
rplen + authsize + m->m_pkthdr.len, maxpacketsize));
AHSTAT_INC(ahs_toobig);
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
AHSTAT_ADD(ahs_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops);
error = ENOBUFS;
goto bad;
}
/* Inject AH header. */
mi = m_makespace(m, skip, rplen + authsize, &roff);
if (mi == NULL) {
DPRINTF(("%s: failed to inject %u byte AH header for SA "
"%s/%08lx\n", __func__,
rplen + authsize,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_hdrops); /*XXX differs from openbsd */
error = ENOBUFS;
goto bad;
}
/*
* The AH header is guaranteed by m_makespace() to be in
* contiguous memory, at roff bytes offset into the returned mbuf.
*/
ah = (struct newah *)(mtod(mi, caddr_t) + roff);
/* Initialize the AH header. */
m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt);
ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t);
ah->ah_reserve = 0;
ah->ah_spi = sav->spi;
/* Zeroize authenticator. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* Insert packet replay counter, as requested. */
SECASVAR_LOCK(sav);
if (sav->replay) {
if (sav->replay->count == ~0 &&
(sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
AHSTAT_INC(ahs_wrap);
error = EACCES;
goto bad;
}
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
ah->ah_seq = htonl(sav->replay->count);
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
crda = crp->crp_desc;
crda->crd_skip = 0;
crda->crd_inject = skip + rplen;
crda->crd_len = m->m_pkthdr.len;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_key = sav->key_auth->key_data;
crda->crd_klen = _KEYBITS(sav->key_auth);
/* Allocate IPsec-specific opaque crypto info. */
xd = malloc(sizeof(struct xform_data) + skip, M_XDATA,
M_NOWAIT | M_ZERO);
if (xd == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
AHSTAT_INC(ahs_crypto);
error = ENOBUFS;
goto bad;
}
/* Save the skipped portion of the packet. */
m_copydata(m, 0, skip, (caddr_t) (xd + 1));
/*
* Fix IP header length on the header used for
* authentication. We don't need to fix the original
* header length as it will be fixed by our caller.
*/
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
bcopy(((caddr_t)(xd + 1)) +
offsetof(struct ip, ip_len),
(caddr_t) &iplen, sizeof(u_int16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip, ip_len),
sizeof(u_int16_t), (caddr_t) &iplen);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
bcopy(((caddr_t)(xd + 1)) +
offsetof(struct ip6_hdr, ip6_plen),
(caddr_t) &iplen, sizeof(uint16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip6_hdr, ip6_plen),
sizeof(uint16_t), (caddr_t) &iplen);
break;
#endif /* INET6 */
}
/* Fix the Next Header field in saved header. */
((uint8_t *) (xd + 1))[protoff] = IPPROTO_AH;
/* Update the Next Protocol field in the IP header. */
prot = IPPROTO_AH;
m_copyback(m, protoff, sizeof(uint8_t), (caddr_t) &prot);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 1);
if (error != 0) {
m = NULL; /* mbuf was free'd by ah_massage_headers. */
free(xd, M_XDATA);
crypto_freereq(crp);
goto bad;
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_output_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback. */
xd->sp = sp;
xd->sav = sav;
xd->skip = skip;
xd->idx = idx;
xd->cryptoid = cryptoid;
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
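ah_output() refuses to let a 32-bit sequence number wrap: once the counter hits its maximum the packet is rejected unless SADB_X_EXT_CYCSEQ explicitly allows cycling. A standalone sketch of that guard with illustrative names:

/* Standalone sketch of the outbound sequence-number anti-wrap check. */
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

struct toy_replay {
    uint32_t count;     /* last sequence number sent */
};

static int
toy_next_seq(struct toy_replay *rp, bool allow_wrap, uint32_t *seq)
{
    if (rp->count == UINT32_MAX && !allow_wrap)
        return (EACCES);    /* SA must be replaced, not wrapped */
    rp->count++;            /* wraps to 0 only when cycling is allowed */
    *seq = rp->count;
    return (0);
}

int
main(void)
{
    struct toy_replay rp = { .count = UINT32_MAX - 1 };
    uint32_t seq;

    printf("%d", toy_next_seq(&rp, false, &seq));       /* 0, seq = UINT32_MAX */
    printf(" %d\n", toy_next_seq(&rp, false, &seq));    /* EACCES */
    return (0);
}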
/*
* AH output callback from the crypto driver.
*/
static int
ah_output_cb(struct cryptop *crp)
{
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
caddr_t ptr;
u_int idx;
int skip, error;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sp = xd->sp;
sav = xd->sav;
skip = xd->skip;
idx = xd->idx;
cryptoid = xd->cryptoid;
ptr = (caddr_t) (xd + 1);
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
AHSTAT_INC(ahs_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
m_freem(m);
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
AHSTAT_INC(ahs_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
/*
* Copy original headers (with the new protocol number) back
* in place.
*/
m_copyback(m, 0, skip, ptr);
free(xd, M_XDATA);
crypto_freereq(crp);
AHSTAT_INC(ahs_hist[sav->alg_auth]);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
int alen;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
alen = AUTHSIZE(sav);
m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes);
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
static struct xformsw ah_xformsw = {
.xf_type = XF_AH,
.xf_name = "IPsec AH",
.xf_init = ah_init,
.xf_zeroize = ah_zeroize,
.xf_input = ah_input,
.xf_output = ah_output,
};
SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_attach, &ah_xformsw);
SYSUNINIT(ah_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_detach, &ah_xformsw);
Index: head/sys/netipsec/xform_esp.c
===================================================================
--- head/sys/netipsec/xform_esp.c (revision 327172)
+++ head/sys/netipsec/xform_esp.c (revision 327173)
@@ -1,966 +1,964 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis.
*
* Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/random.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <machine/atomic.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/esp.h>
#include <netipsec/esp_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/xform.h>
VNET_DEFINE(int, esp_enable) = 1;
VNET_PCPUSTAT_DEFINE(struct espstat, espstat);
VNET_PCPUSTAT_SYSINIT(espstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(espstat);
#endif /* VIMAGE */
SYSCTL_DECL(_net_inet_esp);
SYSCTL_INT(_net_inet_esp, OID_AUTO, esp_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(esp_enable), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_esp, IPSECCTL_STATS, stats,
struct espstat, espstat,
"ESP statistics (struct espstat, netipsec/esp_var.h");
static int esp_input_cb(struct cryptop *op);
static int esp_output_cb(struct cryptop *crp);
size_t
esp_hdrsiz(struct secasvar *sav)
{
size_t size;
if (sav != NULL) {
/*XXX not right for null algorithm--does it matter??*/
IPSEC_ASSERT(sav->tdb_encalgxform != NULL,
("SA with null xform"));
if (sav->flags & SADB_X_EXT_OLD)
size = sizeof (struct esp);
else
size = sizeof (struct newesp);
size += sav->tdb_encalgxform->blocksize + 9;
/*XXX need alg check???*/
if (sav->tdb_authalgxform != NULL && sav->replay)
size += ah_hdrsiz(sav);
} else {
/*
* base header size
* + max iv length for CBC mode
* + max pad length
* + sizeof (pad length field)
* + sizeof (next header field)
* + max icv supported.
*/
size = sizeof (struct newesp) + EALG_MAX_BLOCK_LEN + 9 + 16;
}
return size;
}
/*
* esp_init() is called when an SPI is being set up.
*/
static int
esp_init(struct secasvar *sav, struct xformsw *xsp)
{
const struct enc_xform *txform;
struct cryptoini cria, crie;
int keylen;
int error;
txform = enc_algorithm_lookup(sav->alg_enc);
if (txform == NULL) {
DPRINTF(("%s: unsupported encryption algorithm %d\n",
__func__, sav->alg_enc));
return EINVAL;
}
if (sav->key_enc == NULL) {
DPRINTF(("%s: no encoding key for %s algorithm\n",
__func__, txform->name));
return EINVAL;
}
if ((sav->flags & (SADB_X_EXT_OLD | SADB_X_EXT_IV4B)) ==
SADB_X_EXT_IV4B) {
DPRINTF(("%s: 4-byte IV not supported with protocol\n",
__func__));
return EINVAL;
}
/* subtract off the salt, RFC4106, 8.1 and RFC3686, 5.1 */
keylen = _KEYLEN(sav->key_enc) - SAV_ISCTRORGCM(sav) * 4;
if (txform->minkey > keylen || keylen > txform->maxkey) {
DPRINTF(("%s: invalid key length %u, must be in the range "
"[%u..%u] for algorithm %s\n", __func__,
keylen, txform->minkey, txform->maxkey,
txform->name));
return EINVAL;
}
if (SAV_ISCTRORGCM(sav))
sav->ivlen = 8; /* RFC4106 3.1 and RFC3686 3.1 */
else
sav->ivlen = txform->ivsize;
/*
* Setup AH-related state.
*/
if (sav->alg_auth != 0) {
error = ah_init0(sav, xsp, &cria);
if (error)
return error;
}
/* NB: override anything set in ah_init0 */
sav->tdb_xform = xsp;
sav->tdb_encalgxform = txform;
/*
* Whenever AES-GCM is used for encryption, one
* of the AES authentication algorithms is chosen
* as well, based on the key size.
*/
if (sav->alg_enc == SADB_X_EALG_AESGCM16) {
switch (keylen) {
case AES_128_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES128GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_128;
break;
case AES_192_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES192GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_192;
break;
case AES_256_GMAC_KEY_LEN:
sav->alg_auth = SADB_X_AALG_AES256GMAC;
sav->tdb_authalgxform = &auth_hash_nist_gmac_aes_256;
break;
default:
DPRINTF(("%s: invalid key length %u"
"for algorithm %s\n", __func__,
keylen, txform->name));
return EINVAL;
}
bzero(&cria, sizeof(cria));
cria.cri_alg = sav->tdb_authalgxform->type;
cria.cri_key = sav->key_enc->key_data;
cria.cri_klen = _KEYBITS(sav->key_enc) - SAV_ISGCM(sav) * 32;
}
/* Initialize crypto session. */
bzero(&crie, sizeof(crie));
crie.cri_alg = sav->tdb_encalgxform->type;
crie.cri_key = sav->key_enc->key_data;
crie.cri_klen = _KEYBITS(sav->key_enc) - SAV_ISCTRORGCM(sav) * 32;
if (sav->tdb_authalgxform && sav->tdb_encalgxform) {
/* init both auth & enc */
crie.cri_next = &cria;
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_encalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_authalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&cria, V_crypto_support);
} else {
/* XXX cannot happen? */
DPRINTF(("%s: no encoding OR authentication xform!\n",
__func__));
error = EINVAL;
}
return error;
}
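esp_init() treats the last four bytes of the keying material as the GCM salt (RFC 4106) or CTR nonce (RFC 3686) and sizes the cipher key without them, which is what the SAV_ISCTRORGCM(sav) * 4 subtraction above expresses. A standalone sketch of that split (illustrative names only):

/* Standalone sketch: split ESP keying material into cipher key + salt. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct toy_keymat {
    const uint8_t *key;     /* cipher key proper */
    size_t keylen;
    uint8_t salt[4];        /* trailing 4 bytes for AES-CTR/AES-GCM */
};

static int
toy_split_keymat(const uint8_t *blob, size_t bloblen, int is_ctr_or_gcm,
    struct toy_keymat *km)
{
    size_t saltlen = is_ctr_or_gcm ? 4 : 0;

    if (bloblen <= saltlen)
        return (-1);
    km->key = blob;
    km->keylen = bloblen - saltlen;     /* e.g. 20 bytes -> 16-byte AES key */
    memset(km->salt, 0, sizeof(km->salt));
    if (saltlen != 0)
        memcpy(km->salt, blob + km->keylen, saltlen);
    return (0);
}

int
main(void)
{
    uint8_t blob[20];                   /* AES-128-GCM keying material */
    struct toy_keymat km;
    size_t i;

    for (i = 0; i < sizeof(blob); i++)
        blob[i] = (uint8_t)i;
    toy_split_keymat(blob, sizeof(blob), 1, &km);
    printf("keylen=%zu salt=%02x%02x%02x%02x\n", km.keylen,
        km.salt[0], km.salt[1], km.salt[2], km.salt[3]);
    return (0);
}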
/*
* Paranoia.
*/
static int
esp_zeroize(struct secasvar *sav)
{
/* NB: ah_zeroize frees the crypto session state */
int error = ah_zeroize(sav);
if (sav->key_enc)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
sav->tdb_encalgxform = NULL;
sav->tdb_xform = NULL;
return error;
}
/*
* ESP input processing, called (eventually) through the protocol switch.
*/
static int
esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
const struct auth_hash *esph;
const struct enc_xform *espx;
struct xform_data *xd;
struct cryptodesc *crde;
struct cryptop *crp;
struct newesp *esp;
uint8_t *ivp;
uint64_t cryptoid;
int alen, error, hlen, plen;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform"));
error = EINVAL;
/* Valid IP Packet length ? */
if ( (skip&3) || (m->m_pkthdr.len&3) ){
DPRINTF(("%s: misaligned packet, skip %u pkt len %u",
__func__, skip, m->m_pkthdr.len));
ESPSTAT_INC(esps_badilen);
goto bad;
}
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
/* Determine the ESP header and auth length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
alen = xform_ah_authsize(esph);
/*
* Verify payload length is multiple of encryption algorithm
* block size.
*
* NB: This works for the null algorithm because the blocksize
* is 4 and all packets must be 4-byte aligned regardless
* of the algorithm.
*/
plen = m->m_pkthdr.len - (skip + hlen + alen);
if ((plen & (espx->blocksize - 1)) || (plen <= 0)) {
DPRINTF(("%s: payload of %d octets not a multiple of %d octets,"
" SA %s/%08lx\n", __func__, plen, espx->blocksize,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long)ntohl(sav->spi)));
ESPSTAT_INC(esps_badilen);
goto bad;
}
/*
* Check sequence number.
*/
SECASVAR_LOCK(sav);
if (esph != NULL && sav->replay != NULL && sav->replay->wsize != 0) {
if (ipsec_chkreplay(ntohl(esp->esp_seq), sav) == 0) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
ESPSTAT_INC(esps_replay);
error = EACCES;
goto bad;
}
}
cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
/* Update the counters */
ESPSTAT_ADD(esps_ibytes, m->m_pkthdr.len - (skip + hlen + alen));
/* Get crypto descriptors */
crp = crypto_getreq(esph && espx ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
/* Get IPsec-specific opaque pointer */
xd = malloc(sizeof(*xd) + alen, M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
ESPSTAT_INC(esps_crypto);
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
if (esph != NULL) {
struct cryptodesc *crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor"));
/* Authentication descriptor */
crda->crd_skip = skip;
if (SAV_ISGCM(sav))
crda->crd_len = 8; /* RFC4106 5, SPI + SN */
else
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
crda->crd_alg = esph->type;
/* Copy the authenticator */
m_copydata(m, m->m_pkthdr.len - alen, alen,
(caddr_t) (xd + 1));
/* Chain authentication request */
crde = crda->crd_next;
} else {
crde = crp->crp_desc;
}
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_input_cb;
crp->crp_sid = cryptoid;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback */
xd->sav = sav;
xd->protoff = protoff;
xd->skip = skip;
xd->cryptoid = cryptoid;
/* Decryption descriptor */
IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor"));
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_inject = skip + hlen - sav->ivlen;
if (SAV_ISCTRORGCM(sav)) {
ivp = &crde->crd_iv[0];
/* GCM IV Format: RFC4106 4 */
/* CTR IV Format: RFC3686 4 */
/* Salt is last four bytes of key, RFC4106 8.1 */
/* Nonce is last four bytes of key, RFC3686 5.1 */
memcpy(ivp, sav->key_enc->key_data +
_KEYLEN(sav->key_enc) - 4, 4);
if (SAV_ISCTR(sav)) {
/* Initial block counter is 1, RFC3686 4 */
be32enc(&ivp[sav->ivlen + 4], 1);
}
m_copydata(m, skip + hlen - sav->ivlen, sav->ivlen, &ivp[4]);
crde->crd_flags |= CRD_F_IV_EXPLICIT;
}
crde->crd_alg = espx->type;
return (crypto_dispatch(crp));
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
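A minimal standalone sketch of the CTR/GCM IV layout described in the comments above: the 4-byte salt/nonce taken from the tail of the key, the 8-byte explicit IV carried in the packet, and (for CTR) an initial block counter of 1. The function name and flat-buffer interface are assumptions for illustration, not kernel code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Build the 16-byte counter block used by AES-CTR ESP (RFC 3686 4):
 * nonce (last 4 bytes of the key) || 8-byte explicit IV || counter = 1.
 * AES-GCM (RFC 4106 4) uses the same salt || explicit-IV prefix.
 */
static void
example_build_ctr_block(uint8_t block[16], const uint8_t *key, size_t keylen,
    const uint8_t explicit_iv[8])
{
    memcpy(&block[0], key + keylen - 4, 4);   /* salt/nonce from key tail */
    memcpy(&block[4], explicit_iv, 8);        /* per-packet explicit IV */
    block[12] = 0;                            /* initial block counter ... */
    block[13] = 0;
    block[14] = 0;
    block[15] = 1;                            /* ... is 1, big-endian */
}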
/*
* ESP input callback from the crypto driver.
*/
static int
esp_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[128]);
u_int8_t lastthree[3], aalg[AH_HMAC_MAXHASHLEN];
const struct auth_hash *esph;
- const struct enc_xform *espx;
struct mbuf *m;
struct cryptodesc *crd;
struct xform_data *xd;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t ptr;
uint64_t cryptoid;
int hlen, skip, protoff, error, alen;
crd = crp->crp_desc;
IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!"));
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
esph = sav->tdb_authalgxform;
- espx = sav->tdb_encalgxform;
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
ESPSTAT_INC(esps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
ESPSTAT_INC(esps_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
ESPSTAT_INC(esps_hist[sav->alg_enc]);
/* If authentication was performed, check now. */
if (esph != NULL) {
alen = xform_ah_authsize(esph);
AHSTAT_INC(ahs_hist[sav->alg_auth]);
/* Copy the authenticator from the packet */
m_copydata(m, m->m_pkthdr.len - alen, alen, aalg);
ptr = (caddr_t) (xd + 1);
/* Verify authenticator */
if (timingsafe_bcmp(ptr, aalg, alen) != 0) {
DPRINTF(("%s: authentication hash mismatch for "
"packet in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_badauth);
error = EACCES;
goto bad;
}
m->m_flags |= M_AUTHIPDGM;
/* Remove trailing authenticator */
m_adj(m, -alen);
}
/* Release the crypto descriptors */
free(xd, M_XDATA), xd = NULL;
crypto_freereq(crp), crp = NULL;
/*
* Packet is now decrypted.
*/
m->m_flags |= M_DECRYPTED;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newesp, esp_seq),
sizeof (seq), (caddr_t) &seq);
SECASVAR_LOCK(sav);
if (ipsec_updatereplay(ntohl(seq), sav)) {
SECASVAR_UNLOCK(sav);
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_sa2str(sav, buf, sizeof(buf))));
ESPSTAT_INC(esps_replay);
error = EACCES;
goto bad;
}
SECASVAR_UNLOCK(sav);
}
/* Determine the ESP header length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
/* Remove the ESP header and IV from the mbuf. */
error = m_striphdr(m, skip, hlen);
if (error) {
ESPSTAT_INC(esps_hdrops);
DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Save the last three bytes of decrypted data */
m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree);
/* Verify pad length */
if (lastthree[1] + 2 > m->m_pkthdr.len - skip) {
ESPSTAT_INC(esps_badilen);
DPRINTF(("%s: invalid padding length %d for %u byte packet "
"in SA %s/%08lx\n", __func__, lastthree[1],
m->m_pkthdr.len - skip,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
/* Verify correct decryption by checking the last padding bytes */
if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) {
if (lastthree[1] != lastthree[0] && lastthree[1] != 0) {
ESPSTAT_INC(esps_badenc);
DPRINTF(("%s: decryption failed for packet in "
"SA %s/%08lx\n", __func__, ipsec_address(
&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
}
/* Trim the mbuf chain to remove trailing authenticator and padding */
m_adj(m, -(lastthree[1] + 2));
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2);
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav != NULL)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
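A minimal standalone sketch of the ESP trailer check performed above: the last two bytes of the decrypted payload are Pad Length and Next Header, the pad length must fit in the payload, and with self-describing padding the last pad byte must equal the pad length. Names and the flat-buffer interface are assumptions for illustration.

#include <stddef.h>
#include <stdint.h>

/* Returns 0 if the ESP trailer looks sane, -1 otherwise. */
static int
example_check_esp_trailer(const uint8_t *payload, size_t len,
    uint8_t *next_header)
{
    uint8_t padlen;

    if (len < 2)
        return (-1);
    padlen = payload[len - 2];              /* Pad Length */
    *next_header = payload[len - 1];        /* Next Header */
    if ((size_t)padlen + 2 > len)
        return (-1);                        /* invalid padding length */
    if (padlen != 0 && payload[len - 3] != padlen)
        return (-1);                        /* self-describing pad mismatch */
    return (0);                             /* caller trims padlen + 2 bytes */
}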
/*
* ESP output routine, called by ipsec[46]_perform_request().
*/
static int
esp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
struct cryptodesc *crde = NULL, *crda = NULL;
struct cryptop *crp;
const struct auth_hash *esph;
const struct enc_xform *espx;
struct mbuf *mo = NULL;
struct xform_data *xd;
struct secasindex *saidx;
unsigned char *pad;
uint8_t *ivp;
uint64_t cntr, cryptoid;
int hlen, rlen, padding, blks, alen, i, roff;
int error, maxpacketsize;
uint8_t prot;
IPSEC_ASSERT(sav != NULL, ("null SA"));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
IPSEC_ASSERT(espx != NULL, ("null encoding xform"));
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
rlen = m->m_pkthdr.len - skip; /* Raw payload length. */
/*
* RFC4303 2.4 Requires 4 byte alignment.
*/
blks = MAX(4, espx->blocksize); /* Cipher blocksize */
/* XXX clamp padding length a la KAME??? */
padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;
alen = xform_ah_authsize(esph);
ESPSTAT_INC(esps_output);
saidx = &sav->sah->saidx;
/* Check for maximum packet size violations. */
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, SA %s/%08lx\n", __func__,
saidx->dst.sa.sa_family, ipsec_address(&saidx->dst,
buf, sizeof(buf)), (u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_nopf);
error = EPFNOSUPPORT;
goto bad;
}
/*
DPRINTF(("%s: skip %d hlen %d rlen %d padding %d alen %d blksd %d\n",
__func__, skip, hlen, rlen, padding, alen, blks)); */
if (skip + hlen + rlen + padding + alen > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
skip + hlen + rlen + padding + alen, maxpacketsize));
ESPSTAT_INC(esps_toobig);
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
ESPSTAT_ADD(esps_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_hdrops);
error = ENOBUFS;
goto bad;
}
/* Inject ESP header. */
mo = m_makespace(m, skip, hlen, &roff);
if (mo == NULL) {
DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n",
__func__, hlen, ipsec_address(&saidx->dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
ESPSTAT_INC(esps_hdrops); /* XXX diffs from openbsd */
error = ENOBUFS;
goto bad;
}
/* Initialize ESP header. */
bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff,
sizeof(uint32_t));
SECASVAR_LOCK(sav);
if (sav->replay) {
uint32_t replay;
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
replay = htonl(sav->replay->count);
bcopy((caddr_t) &replay, mtod(mo, caddr_t) + roff +
sizeof(uint32_t), sizeof(uint32_t));
}
cryptoid = sav->tdb_cryptoid;
if (SAV_ISCTRORGCM(sav))
cntr = sav->cntr++;
SECASVAR_UNLOCK(sav);
/*
* Add padding -- better to do it ourselves than use the crypto engine,
* although if/when we support compression, we'd have to do that.
*/
pad = (u_char *) m_pad(m, padding + alen);
if (pad == NULL) {
DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
m = NULL; /* NB: free'd by m_pad */
error = ENOBUFS;
goto bad;
}
/*
* Add padding: random, zero, or self-describing.
* XXX catch unexpected setting
*/
switch (sav->flags & SADB_X_EXT_PMASK) {
case SADB_X_EXT_PRAND:
(void) read_random(pad, padding - 2);
break;
case SADB_X_EXT_PZERO:
bzero(pad, padding - 2);
break;
case SADB_X_EXT_PSEQ:
for (i = 0; i < padding - 2; i++)
pad[i] = i+1;
break;
}
/* Fix padding length and Next Protocol in padding itself. */
pad[padding - 2] = padding - 2;
m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1);
/* Fix Next Protocol in IPv4/IPv6 header. */
prot = IPPROTO_ESP;
m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot);
/* Get crypto descriptors. */
crp = crypto_getreq(esph != NULL ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
/* IPsec-specific opaque crypto info. */
xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
ESPSTAT_INC(esps_crypto);
error = ENOBUFS;
goto bad;
}
crde = crp->crp_desc;
crda = crde->crd_next;
/* Encryption descriptor. */
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_flags = CRD_F_ENCRYPT;
crde->crd_inject = skip + hlen - sav->ivlen;
/* Encryption operation. */
crde->crd_alg = espx->type;
if (SAV_ISCTRORGCM(sav)) {
ivp = &crde->crd_iv[0];
/* GCM IV Format: RFC4106 4 */
/* CTR IV Format: RFC3686 4 */
/* Salt is last four bytes of key, RFC4106 8.1 */
/* Nonce is last four bytes of key, RFC3686 5.1 */
memcpy(ivp, sav->key_enc->key_data +
_KEYLEN(sav->key_enc) - 4, 4);
be64enc(&ivp[4], cntr);
if (SAV_ISCTR(sav)) {
/* Initial block counter is 1, RFC3686 4 */
/* XXXAE: should we use this only for first packet? */
be32enc(&ivp[sav->ivlen + 4], 1);
}
m_copyback(m, skip + hlen - sav->ivlen, sav->ivlen, &ivp[4]);
crde->crd_flags |= CRD_F_IV_EXPLICIT|CRD_F_IV_PRESENT;
}
/* Callback parameters */
xd->sp = sp;
xd->sav = sav;
xd->idx = idx;
xd->cryptoid = cryptoid;
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
if (V_async_crypto)
crp->crp_flags |= CRYPTO_F_ASYNC | CRYPTO_F_ASYNC_KEEPORDER;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_output_cb;
crp->crp_opaque = (caddr_t) xd;
crp->crp_sid = cryptoid;
if (esph) {
/* Authentication descriptor. */
crda->crd_alg = esph->type;
crda->crd_skip = skip;
if (SAV_ISGCM(sav))
crda->crd_len = 8; /* RFC4106 5, SPI + SN */
else
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
}
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
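A worked example of the RFC 4303 padding arithmetic used above; "padding" includes the two trailer bytes (Pad Length and Next Header), and the padded payload must be a multiple of the cipher block size (at least 4). The sample numbers are assumptions for illustration.

#include <stdio.h>

int
main(void)
{
    int rlen = 1460;   /* example raw payload length */
    int blks = 16;     /* example AES block size */
    int padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;

    /* Prints: padding 12, padded payload 1472 (a multiple of 16). */
    printf("padding %d, padded payload %d\n", padding, rlen + padding);
    return (0);
}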
/*
* ESP output callback from the crypto driver.
*/
static int
esp_output_cb(struct cryptop *crp)
{
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
u_int idx;
int error;
xd = (struct xform_data *) crp->crp_opaque;
m = (struct mbuf *) crp->crp_buf;
sp = xd->sp;
sav = xd->sav;
idx = xd->idx;
cryptoid = xd->cryptoid;
/* Check for crypto errors. */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
ESPSTAT_INC(esps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
m_freem(m);
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
ESPSTAT_INC(esps_crypto);
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
free(xd, M_XDATA);
crypto_freereq(crp);
ESPSTAT_INC(esps_hist[sav->alg_enc]);
if (sav->tdb_authalgxform != NULL)
AHSTAT_INC(ahs_hist[sav->alg_auth]);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
static unsigned char ipseczeroes[AH_HMAC_MAXHASHLEN];
const struct auth_hash *esph;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
esph = sav->tdb_authalgxform;
if (esph != NULL) {
int alen;
alen = xform_ah_authsize(esph);
m_copyback(m, m->m_pkthdr.len - alen,
alen, ipseczeroes);
}
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
static struct xformsw esp_xformsw = {
.xf_type = XF_ESP,
.xf_name = "IPsec ESP",
.xf_init = esp_init,
.xf_zeroize = esp_zeroize,
.xf_input = esp_input,
.xf_output = esp_output,
};
SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_attach, &esp_xformsw);
SYSUNINIT(esp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
xform_detach, &esp_xformsw);
Index: head/sys/netipsec/xform_ipcomp.c
===================================================================
--- head/sys/netipsec/xform_ipcomp.c (revision 327172)
+++ head/sys/netipsec/xform_ipcomp.c (revision 327173)
@@ -1,780 +1,777 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* IP payload compression protocol (IPComp), see RFC 2393 */
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#include <net/netisr.h>
#include <net/vnet.h>
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ipcomp.h>
#include <netipsec/ipcomp_var.h>
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/deflate.h>
#include <opencrypto/xform.h>
VNET_DEFINE(int, ipcomp_enable) = 1;
VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat);
VNET_PCPUSTAT_SYSINIT(ipcompstat);
#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(ipcompstat);
#endif /* VIMAGE */
SYSCTL_DECL(_net_inet_ipcomp);
SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, "");
SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats,
struct ipcompstat, ipcompstat,
"IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h");
static int ipcomp_input_cb(struct cryptop *crp);
static int ipcomp_output_cb(struct cryptop *crp);
/*
* RFC 3173 p 2.2. Non-Expansion Policy:
* If the total size of a compressed payload and the IPComp header, as
* defined in section 3, is not smaller than the size of the original
* payload, the IP datagram MUST be sent in the original non-compressed
* form.
*
* When we use IPComp in tunnel mode, for small packets we will receive
* encapsulated IP-IP datagrams without any compression and without IPComp
* header.
*/
static int
ipcomp_encapcheck(union sockaddr_union *src, union sockaddr_union *dst)
{
struct secasvar *sav;
sav = key_allocsa_tunnel(src, dst, IPPROTO_IPCOMP);
if (sav == NULL)
return (0);
key_freesav(&sav);
if (src->sa.sa_family == AF_INET)
return (sizeof(struct in_addr) << 4);
else
return (sizeof(struct in6_addr) << 4);
}
static int
ipcomp_nonexp_input(struct mbuf **mp, int *offp, int proto)
{
int isr;
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case IPPROTO_IPV6:
isr = NETISR_IPV6;
break;
#endif
default:
IPCOMPSTAT_INC(ipcomps_nopf);
m_freem(*mp);
return (IPPROTO_DONE);
}
m_adj(*mp, *offp);
IPCOMPSTAT_ADD(ipcomps_ibytes, (*mp)->m_pkthdr.len);
IPCOMPSTAT_INC(ipcomps_input);
netisr_dispatch(isr, *mp);
return (IPPROTO_DONE);
}
/*
* ipcomp_init() is called when a CPI is being set up.
*/
static int
ipcomp_init(struct secasvar *sav, struct xformsw *xsp)
{
const struct comp_algo *tcomp;
struct cryptoini cric;
/* NB: algorithm really comes in alg_enc and not alg_comp! */
tcomp = comp_algorithm_lookup(sav->alg_enc);
if (tcomp == NULL) {
DPRINTF(("%s: unsupported compression algorithm %d\n", __func__,
sav->alg_comp));
return EINVAL;
}
sav->alg_comp = sav->alg_enc; /* set for doing histogram */
sav->tdb_xform = xsp;
sav->tdb_compalgxform = tcomp;
/* Initialize crypto session */
bzero(&cric, sizeof (cric));
cric.cri_alg = sav->tdb_compalgxform->type;
return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support);
}
/*
* ipcomp_zeroize() is called when an IPCA is deleted.
*/
static int
ipcomp_zeroize(struct secasvar *sav)
{
int err;
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
return err;
}
/*
* ipcomp_input() gets called to uncompress an input packet
*/
static int
ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
struct xform_data *xd;
struct cryptodesc *crdc;
struct cryptop *crp;
struct ipcomp *ipcomp;
caddr_t addr;
int error, hlen = IPCOMP_HLENGTH;
/*
* Before doing any real work, check that the next header of the IPComp
* packet is not IPComp again. Double compression is not possible, so
* if we see it, someone is playing tricks on us.
*/
error = ENOBUFS;
if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/
DPRINTF(("%s: m_pullup failed\n", __func__));
key_freesav(&sav);
return (error);
}
addr = (caddr_t) mtod(m, struct ip *) + skip;
ipcomp = (struct ipcomp *)addr;
if (ipcomp->comp_nxt == IPPROTO_IPCOMP) {
IPCOMPSTAT_INC(ipcomps_pdrops); /* XXX have our own stats? */
DPRINTF(("%s: recursive compression detected\n", __func__));
error = EINVAL;
goto bad;
}
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: no crypto descriptors\n", __func__));
IPCOMPSTAT_INC(ipcomps_crypto);
goto bad;
}
/* Get IPsec-specific opaque pointer */
xd = malloc(sizeof(*xd), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
DPRINTF(("%s: cannot allocate xform_data\n", __func__));
IPCOMPSTAT_INC(ipcomps_crypto);
crypto_freereq(crp);
goto bad;
}
crdc = crp->crp_desc;
crdc->crd_skip = skip + hlen;
crdc->crd_len = m->m_pkthdr.len - (skip + hlen);
crdc->crd_inject = skip;
/* Decompression operation */
crdc->crd_alg = sav->tdb_compalgxform->type;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len - (skip + hlen);
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_input_cb;
crp->crp_opaque = (caddr_t) xd;
/* These are passed as-is to the callback */
xd->sav = sav;
xd->protoff = protoff;
xd->skip = skip;
SECASVAR_LOCK(sav);
crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
return crypto_dispatch(crp);
bad:
m_freem(m);
key_freesav(&sav);
return (error);
}
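For reference, a sketch of the 4-byte IPComp header (RFC 3173, section 3) that is stripped here on input and injected on output. This standalone definition is illustrative only; the kernel's own struct ipcomp lives in netipsec/ipcomp.h, and IPCOMP_HLENGTH corresponds to its size.

#include <stdint.h>

struct example_ipcomp_header {
    uint8_t  comp_nxt;    /* next header: the original payload protocol */
    uint8_t  comp_flags;  /* reserved, must be zero */
    uint16_t comp_cpi;    /* compression parameter index, network order */
};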
/*
* IPComp input callback from the crypto driver.
*/
static int
ipcomp_input_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
- struct cryptodesc *crd;
struct xform_data *xd;
struct mbuf *m;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t addr;
uint64_t cryptoid;
int hlen = IPCOMP_HLENGTH, error, clen;
int skip, protoff;
uint8_t nproto;
-
- crd = crp->crp_desc;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
IPCOMPSTAT_INC(ipcomps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: null mbuf returned from crypto\n", __func__));
error = EINVAL;
goto bad;
}
IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
clen = crp->crp_olen; /* Length of data after processing */
/* Release the crypto descriptors */
free(xd, M_XDATA), xd = NULL;
crypto_freereq(crp), crp = NULL;
/* In case it's not done already, adjust the size of the mbuf chain */
m->m_pkthdr.len = clen + hlen + skip;
if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/
DPRINTF(("%s: m_pullup failed\n", __func__));
error = EINVAL; /*XXX*/
goto bad;
}
/* Keep the next protocol field */
addr = (caddr_t) mtod(m, struct ip *) + skip;
nproto = ((struct ipcomp *) addr)->comp_nxt;
/* Remove the IPCOMP header */
error = m_striphdr(m, skip, hlen);
if (error) {
IPCOMPSTAT_INC(ipcomps_hdrops);
DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto);
switch (saidx->dst.sa.sa_family) {
#ifdef INET6
case AF_INET6:
error = ipsec6_common_input_cb(m, sav, skip, protoff);
break;
#endif
#ifdef INET
case AF_INET:
error = ipsec4_common_input_cb(m, sav, skip, protoff);
break;
#endif
default:
panic("%s: Unexpected address family: %d saidx=%p", __func__,
saidx->dst.sa.sa_family, saidx);
}
return error;
bad:
if (sav != NULL)
key_freesav(&sav);
if (m != NULL)
m_freem(m);
if (xd != NULL)
free(xd, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
/*
* IPComp output routine, called by ipsec[46]_perform_request()
*/
static int
ipcomp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
u_int idx, int skip, int protoff)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
const struct comp_algo *ipcompx;
struct cryptodesc *crdc;
struct cryptop *crp;
struct xform_data *xd;
int error, ralen, maxpacketsize;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ipcompx = sav->tdb_compalgxform;
IPSEC_ASSERT(ipcompx != NULL, ("null compression xform"));
/*
* Do not touch the packet if the payload to compress is smaller
* than the compression algorithm's minimal threshold; just send
* the data out uncompressed.
* See RFC 3173, 2.2. Non-Expansion Policy.
*/
if (m->m_pkthdr.len <= ipcompx->minlen) {
IPCOMPSTAT_INC(ipcomps_threshold);
return ipsec_process_done(m, sp, sav, idx);
}
ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */
IPCOMPSTAT_INC(ipcomps_output);
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
IPCOMPSTAT_INC(ipcomps_nopf);
DPRINTF(("%s: unknown/unsupported protocol family %d, "
"IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) {
IPCOMPSTAT_INC(ipcomps_toobig);
DPRINTF(("%s: packet in IPCA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
(u_long) ntohl(sav->spi),
ralen + skip + IPCOMP_HLENGTH, maxpacketsize));
error = EMSGSIZE;
goto bad;
}
/* Update the counters */
IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip);
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_hdrops);
DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
/* Ok now, we can pass to the crypto processing. */
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
error = ENOBUFS;
goto bad;
}
crdc = crp->crp_desc;
/* Compression descriptor */
crdc->crd_skip = skip;
crdc->crd_len = ralen;
crdc->crd_flags = CRD_F_COMP;
crdc->crd_inject = skip;
/* Compression operation */
crdc->crd_alg = ipcompx->type;
/* IPsec-specific opaque crypto info */
xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO);
if (xd == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: failed to allocate xform_data\n", __func__));
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
xd->sp = sp;
xd->sav = sav;
xd->idx = idx;
xd->skip = skip;
xd->protoff = protoff;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_output_cb;
crp->crp_opaque = (caddr_t) xd;
SECASVAR_LOCK(sav);
crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid;
SECASVAR_UNLOCK(sav);
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
/*
* IPComp output callback from the crypto driver.
*/
static int
ipcomp_output_cb(struct cryptop *crp)
{
IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
struct xform_data *xd;
struct secpolicy *sp;
struct secasvar *sav;
struct mbuf *m;
uint64_t cryptoid;
u_int idx;
int error, skip, protoff;
m = (struct mbuf *) crp->crp_buf;
xd = (struct xform_data *) crp->crp_opaque;
idx = xd->idx;
sp = xd->sp;
sav = xd->sav;
skip = xd->skip;
protoff = xd->protoff;
cryptoid = xd->cryptoid;
/* Check for crypto errors */
if (crp->crp_etype) {
if (crp->crp_etype == EAGAIN) {
/* Reset the session ID */
if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
crypto_freesession(cryptoid);
xd->cryptoid = crp->crp_sid;
return (crypto_dispatch(crp));
}
IPCOMPSTAT_INC(ipcomps_noxform);
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
IPCOMPSTAT_INC(ipcomps_crypto);
DPRINTF(("%s: bogus return buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
if (crp->crp_ilen - skip > crp->crp_olen) {
struct mbuf *mo;
struct ipcomp *ipcomp;
int roff;
uint8_t prot;
/* Compression helped, inject IPCOMP header. */
mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff);
if (mo == NULL) {
IPCOMPSTAT_INC(ipcomps_wrap);
DPRINTF(("%s: IPCOMP header inject failed "
"for IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);
/* Initialize the IPCOMP header */
/* XXX alignment always correct? */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt;
break;
#endif
}
ipcomp->comp_flags = 0;
ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi));
/* Fix Next Protocol in IPv4/IPv6 header */
prot = IPPROTO_IPCOMP;
m_copyback(m, protoff, sizeof(u_int8_t),
(u_char *)&prot);
/* Adjust the length in the IP header */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
mtod(m, struct ip6_hdr *)->ip6_plen =
htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
break;
#endif /* INET6 */
default:
IPCOMPSTAT_INC(ipcomps_nopf);
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst, buf,
sizeof(buf)), (u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
} else {
/* Compression was useless, we have lost time. */
IPCOMPSTAT_INC(ipcomps_uncompr);
DPRINTF(("%s: compressions was useless %d - %d <= %d\n",
__func__, crp->crp_ilen, skip, crp->crp_olen));
/* XXX remember state to not compress the next couple
* of packets, RFC 3173, 2.2. Non-Expansion Policy */
}
/* Release the crypto descriptor */
free(xd, M_XDATA);
crypto_freereq(crp);
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, sp, sav, idx);
return (error);
bad:
if (m)
m_freem(m);
free(xd, M_XDATA);
crypto_freereq(crp);
key_freesav(&sav);
key_freesp(&sp);
return (error);
}
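A standalone sketch of the non-expansion decision applied above: the IPComp header is injected only when compression actually shrank the payload (crp_ilen - skip > crp_olen); RFC 3173 2.2 additionally counts the 4-byte header against the saving. The name below is an assumption for illustration.

#include <stdbool.h>
#include <stddef.h>

/*
 * Mirror of the crp_ilen - skip > crp_olen comparison above: compression
 * only helps if the deflated payload is strictly smaller than the original.
 */
static bool
example_compression_helped(size_t original_payload, size_t compressed_payload)
{
    return (compressed_payload < original_payload);
}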
#ifdef INET
static const struct encaptab *ipe4_cookie = NULL;
extern struct domain inetdomain;
static struct protosw ipcomp4_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = 0 /* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC | PR_ADDR | PR_LASTHDR,
.pr_input = ipcomp_nonexp_input,
.pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static int
ipcomp4_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
void *arg __unused)
{
union sockaddr_union src, dst;
const struct ip *ip;
if (V_ipcomp_enable == 0)
return (0);
if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
return (0);
bzero(&src, sizeof(src));
bzero(&dst, sizeof(dst));
src.sa.sa_family = dst.sa.sa_family = AF_INET;
src.sin.sin_len = dst.sin.sin_len = sizeof(struct sockaddr_in);
ip = mtod(m, const struct ip *);
src.sin.sin_addr = ip->ip_src;
dst.sin.sin_addr = ip->ip_dst;
return (ipcomp_encapcheck(&src, &dst));
}
#endif
#ifdef INET6
static const struct encaptab *ipe6_cookie = NULL;
extern struct domain inet6domain;
static struct protosw ipcomp6_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = 0 /* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC | PR_ADDR | PR_LASTHDR,
.pr_input = ipcomp_nonexp_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
};
static int
ipcomp6_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
void *arg __unused)
{
union sockaddr_union src, dst;
const struct ip6_hdr *ip6;
if (V_ipcomp_enable == 0)
return (0);
if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
return (0);
bzero(&src, sizeof(src));
bzero(&dst, sizeof(dst));
src.sa.sa_family = dst.sa.sa_family = AF_INET;
src.sin6.sin6_len = dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
ip6 = mtod(m, const struct ip6_hdr *);
src.sin6.sin6_addr = ip6->ip6_src;
dst.sin6.sin6_addr = ip6->ip6_dst;
if (IN6_IS_SCOPE_LINKLOCAL(&src.sin6.sin6_addr)) {
/* XXX: sa6_recoverscope() */
src.sin6.sin6_scope_id =
ntohs(src.sin6.sin6_addr.s6_addr16[1]);
src.sin6.sin6_addr.s6_addr16[1] = 0;
}
if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6.sin6_addr)) {
/* XXX: sa6_recoverscope() */
dst.sin6.sin6_scope_id =
ntohs(dst.sin6.sin6_addr.s6_addr16[1]);
dst.sin6.sin6_addr.s6_addr16[1] = 0;
}
return (ipcomp_encapcheck(&src, &dst));
}
#endif
static struct xformsw ipcomp_xformsw = {
.xf_type = XF_IPCOMP,
.xf_name = "IPcomp",
.xf_init = ipcomp_init,
.xf_zeroize = ipcomp_zeroize,
.xf_input = ipcomp_input,
.xf_output = ipcomp_output,
};
static void
ipcomp_attach(void)
{
#ifdef INET
ipe4_cookie = encap_attach_func(AF_INET, -1,
ipcomp4_nonexp_encapcheck, &ipcomp4_protosw, NULL);
#endif
#ifdef INET6
ipe6_cookie = encap_attach_func(AF_INET6, -1,
ipcomp6_nonexp_encapcheck, &ipcomp6_protosw, NULL);
#endif
xform_attach(&ipcomp_xformsw);
}
static void
ipcomp_detach(void)
{
#ifdef INET
encap_detach(ipe4_cookie);
#endif
#ifdef INET6
encap_detach(ipe6_cookie);
#endif
xform_detach(&ipcomp_xformsw);
}
SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipcomp_attach, NULL);
SYSUNINIT(ipcomp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipcomp_detach, NULL);
Index: head/sys/nfs/nfs_fha.c
===================================================================
--- head/sys/nfs/nfs_fha.c (revision 327172)
+++ head/sys/nfs/nfs_fha.c (revision 327173)
@@ -1,536 +1,527 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/sbuf.h>
#include <rpc/rpc.h>
#include <nfs/nfs_fha.h>
static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
/*
* XXX need to commonize definitions between old and new NFS code. Define
* this here so we don't include one nfsproto.h over the other.
*/
#define NFS_PROG 100003
void
fha_init(struct fha_params *softc)
{
int i;
for (i = 0; i < FHA_HASH_SIZE; i++)
mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
/*
* Set the default tuning parameters.
*/
softc->ctls.enable = FHA_DEF_ENABLE;
softc->ctls.read = FHA_DEF_READ;
softc->ctls.write = FHA_DEF_WRITE;
softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
/*
* Add sysctls so the user can change the tuning parameters.
*/
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "enable", CTLFLAG_RWTUN,
&softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "read", CTLFLAG_RWTUN,
&softc->ctls.read, 0, "Enable NFS FHA read locality");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write", CTLFLAG_RWTUN,
&softc->ctls.write, 0, "Enable NFS FHA write locality");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "bin_shift", CTLFLAG_RWTUN,
&softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN,
&softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
"should be working on requests for the same file handle");
SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN,
&softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
"single nfsd thread should be working on at any time");
SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
softc->callbacks.fhe_stats_sysctl, "A", "");
}
void
fha_uninit(struct fha_params *softc)
{
int i;
sysctl_ctx_free(&softc->sysctl_ctx);
for (i = 0; i < FHA_HASH_SIZE; i++)
mtx_destroy(&softc->fha_hash[i].mtx);
}
/*
* This just specifies that offsets should obey affinity when within
* the same 1Mbyte (1<<20) chunk for the file (reads only for now).
*/
static void
fha_extract_info(struct svc_req *req, struct fha_info *i,
struct fha_callbacks *cb)
{
struct mbuf *md;
caddr_t dpos;
static u_int64_t random_fh = 0;
int error;
int v3 = (req->rq_vers == 3);
rpcproc_t procnum;
/*
* We start off with a random fh. If we get a reasonable
* procnum, we set the fh. If there's a concept of offset
* that we're interested in, we set that.
*/
i->fh = ++random_fh;
i->offset = 0;
i->locktype = LK_EXCLUSIVE;
i->read = i->write = 0;
/*
* Extract the procnum and convert to v3 form if necessary,
* taking care to deal with out-of-range procnums. Caller will
* ensure that rq_vers is either 2 or 3.
*/
procnum = req->rq_proc;
if (!v3) {
rpcproc_t tmp_procnum;
tmp_procnum = cb->get_procnum(procnum);
if (tmp_procnum == -1)
goto out;
procnum = tmp_procnum;
}
/*
* We do affinity for most operations. However, we divide each realm
* of affinity by file offset, to allow for concurrent random access.
* We only do this for reads today, but this may change when IFS
* supports efficient concurrent writes.
*/
if (cb->no_offset(procnum))
goto out;
i->read = cb->is_read(procnum);
i->write = cb->is_write(procnum);
error = cb->realign(&req->rq_args, M_NOWAIT);
if (error)
goto out;
md = req->rq_args;
dpos = mtod(md, caddr_t);
/* Grab the filehandle. */
error = cb->get_fh(&i->fh, v3, &md, &dpos);
if (error)
goto out;
/* Content ourselves with a zero offset for all but reads and writes. */
if (i->read || i->write)
cb->get_offset(&md, &dpos, v3, i);
out:
cb->set_locktype(procnum, i);
}
static struct fha_hash_entry *
fha_hash_entry_new(u_int64_t fh)
{
struct fha_hash_entry *e;
e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
e->fh = fh;
e->num_rw = 0;
e->num_exclusive = 0;
e->num_threads = 0;
LIST_INIT(&e->threads);
return (e);
}
static void
fha_hash_entry_destroy(struct fha_hash_entry *e)
{
mtx_assert(e->mtx, MA_OWNED);
KASSERT(e->num_rw == 0,
("%d reqs on destroyed fhe %p", e->num_rw, e));
KASSERT(e->num_exclusive == 0,
("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
KASSERT(e->num_threads == 0,
("%d threads on destroyed fhe %p", e->num_threads, e));
free(e, M_NFS_FHA);
}
static void
fha_hash_entry_remove(struct fha_hash_entry *e)
{
mtx_assert(e->mtx, MA_OWNED);
LIST_REMOVE(e, link);
fha_hash_entry_destroy(e);
}
static struct fha_hash_entry *
fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
{
- SVCPOOL *pool;
struct fha_hash_slot *fhs;
struct fha_hash_entry *fhe, *new_fhe;
- pool = *softc->pool;
fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
new_fhe = fha_hash_entry_new(fh);
new_fhe->mtx = &fhs->mtx;
mtx_lock(&fhs->mtx);
LIST_FOREACH(fhe, &fhs->list, link)
if (fhe->fh == fh)
break;
if (!fhe) {
fhe = new_fhe;
LIST_INSERT_HEAD(&fhs->list, fhe, link);
} else
fha_hash_entry_destroy(new_fhe);
return (fhe);
}
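The lookup above preallocates a candidate entry with M_WAITOK before taking the bucket mutex, then either links it in or frees it if an entry for that file handle already exists, which keeps the sleeping allocation outside the locked section (unlike this sketch, it also returns with the bucket lock still held). A generic standalone sketch of that pattern, with assumed types, using pthreads.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

struct example_entry {
    uint64_t             fh;
    struct example_entry *next;
};

struct example_bucket {
    pthread_mutex_t      mtx;
    struct example_entry *head;
};

/* Allocate outside the lock; insert, or discard if someone beat us to it. */
static struct example_entry *
example_lookup_or_insert(struct example_bucket *b, uint64_t fh)
{
    struct example_entry *e, *new_e;

    new_e = malloc(sizeof(*new_e));       /* may block/fail outside the lock */
    if (new_e == NULL)
        return (NULL);
    new_e->fh = fh;

    pthread_mutex_lock(&b->mtx);
    for (e = b->head; e != NULL; e = e->next)
        if (e->fh == fh)
            break;
    if (e == NULL) {
        new_e->next = b->head;            /* first lookup for this fh */
        b->head = new_e;
        e = new_e;
    } else {
        free(new_e);                      /* lost the race, discard ours */
    }
    pthread_mutex_unlock(&b->mtx);
    return (e);
}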
static void
fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{
mtx_assert(fhe->mtx, MA_OWNED);
thread->st_p2 = 0;
LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
fhe->num_threads++;
}
static void
fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{
mtx_assert(fhe->mtx, MA_OWNED);
KASSERT(thread->st_p2 == 0,
("%d reqs on removed thread %p", thread->st_p2, thread));
LIST_REMOVE(thread, st_alink);
fhe->num_threads--;
}
/*
* Account for an ongoing operation associated with this file.
*/
static void
fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
{
mtx_assert(fhe->mtx, MA_OWNED);
if (LK_EXCLUSIVE == locktype)
fhe->num_exclusive += count;
else
fhe->num_rw += count;
}
/*
* Get the service thread currently associated with the fhe that is
* appropriate to handle this operation.
*/
static SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
{
SVCTHREAD *thread, *min_thread = NULL;
- SVCPOOL *pool;
int req_count, min_count = 0;
off_t offset1, offset2;
- pool = *softc->pool;
-
LIST_FOREACH(thread, &fhe->threads, st_alink) {
req_count = thread->st_p2;
/* If there are any writes in progress, use the first thread. */
if (fhe->num_exclusive) {
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)w", thread, req_count);
#endif
return (thread);
}
/* Check whether we should consider locality. */
if ((i->read && !softc->ctls.read) ||
(i->write && !softc->ctls.write))
goto noloc;
/*
* Check for locality, making sure that we won't
* exceed our per-thread load limit in the process.
*/
offset1 = i->offset;
offset2 = thread->st_p3;
if (((offset1 >= offset2)
&& ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
|| ((offset2 > offset1)
&& ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
if ((softc->ctls.max_reqs_per_nfsd == 0) ||
(req_count < softc->ctls.max_reqs_per_nfsd)) {
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)r", thread, req_count);
#endif
return (thread);
}
}
noloc:
/*
* We don't have a locality match, so skip this thread,
* but keep track of the most attractive thread in case
* we need to come back to it later.
*/
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)s off1 %llu off2 %llu", thread,
req_count, offset1, offset2);
#endif
if ((min_thread == NULL) || (req_count < min_count)) {
min_count = req_count;
min_thread = thread;
}
}
/*
* We didn't find a good match yet. See if we can add
* a new thread to this file handle entry's thread list.
*/
if ((softc->ctls.max_nfsds_per_fh == 0) ||
(fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
thread = this_thread;
#if 0
ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
"fha: %p(%d)t", thread, thread->st_p2);
#endif
fha_hash_entry_add_thread(fhe, thread);
} else {
/*
* We don't want to use any more threads for this file, so
* go back to the most attractive nfsd we're already using.
*/
thread = min_thread;
}
return (thread);
}
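A compact standalone form of the locality test above: two requests are "close" when their offsets differ by less than 2^bin_shift bytes (1 MB for a bin_shift of 20, per the comment above fha_extract_info), so reads against nearby regions of a file keep landing on the same nfsd thread, subject to the per-thread request limit. Names are assumptions for illustration.

#include <stdbool.h>
#include <stdint.h>

static bool
example_fha_offsets_close(uint64_t offset1, uint64_t offset2,
    unsigned int bin_shift)
{
    uint64_t dist;

    /* Same comparison as the offset1/offset2 test above, without overflow. */
    dist = (offset1 >= offset2) ? offset1 - offset2 : offset2 - offset1;
    return (dist < ((uint64_t)1 << bin_shift));
}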
/*
* After getting a request, try to assign it to some thread. Usually we
* handle it ourselves.
*/
SVCTHREAD *
fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
struct fha_params *softc)
{
SVCTHREAD *thread;
struct fha_info i;
struct fha_hash_entry *fhe;
struct fha_callbacks *cb;
cb = &softc->callbacks;
/* Check to see whether we're enabled. */
if (softc->ctls.enable == 0)
goto thist;
/*
* Only do placement if this is an NFS request.
*/
if (req->rq_prog != NFS_PROG)
goto thist;
if (req->rq_vers != 2 && req->rq_vers != 3)
goto thist;
fha_extract_info(req, &i, cb);
/*
* We save the offset associated with this request for later
* nfsd matching.
*/
fhe = fha_hash_entry_lookup(softc, i.fh);
req->rq_p1 = fhe;
req->rq_p2 = i.locktype;
req->rq_p3 = i.offset;
/*
* Choose a thread, taking into consideration locality, thread load,
* and the number of threads already working on this file.
*/
thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
KASSERT(thread, ("fha_assign: NULL thread!"));
fha_hash_entry_add_op(fhe, i.locktype, 1);
thread->st_p2++;
thread->st_p3 = i.offset;
/*
* Grab the thread's lock here so the chosen thread cannot go away before
* the new request is inserted into its queue while we drop the fhe lock.
*/
mtx_lock(&thread->st_lock);
mtx_unlock(fhe->mtx);
return (thread);
thist:
req->rq_p1 = NULL;
mtx_lock(&this_thread->st_lock);
return (this_thread);
}
/*
* Called when we're done with an operation. The request has already
* been de-queued.
*/
void
fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
{
struct fha_hash_entry *fhe = req->rq_p1;
struct mtx *mtx;
/*
* This may be called for reqs that didn't go through
* fha_assign (e.g. extra NULL ops used for RPCSEC_GSS).
*/
if (!fhe)
return;
mtx = fhe->mtx;
mtx_lock(mtx);
fha_hash_entry_add_op(fhe, req->rq_p2, -1);
thread->st_p2--;
KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
thread->st_p2, thread));
if (thread->st_p2 == 0) {
fha_hash_entry_remove_thread(fhe, thread);
if (0 == fhe->num_rw + fhe->num_exclusive)
fha_hash_entry_remove(fhe);
}
mtx_unlock(mtx);
}
int
fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
{
int error, i;
struct sbuf sb;
struct fha_hash_entry *fhe;
bool_t first, hfirst;
SVCTHREAD *thread;
- SVCPOOL *pool;
sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
- pool = NULL;
-
if (!*softc->pool) {
sbuf_printf(&sb, "NFSD not running\n");
goto out;
}
- pool = *softc->pool;
for (i = 0; i < FHA_HASH_SIZE; i++)
if (!LIST_EMPTY(&softc->fha_hash[i].list))
break;
if (i == FHA_HASH_SIZE) {
sbuf_printf(&sb, "No file handle entries.\n");
goto out;
}
hfirst = TRUE;
for (; i < FHA_HASH_SIZE; i++) {
mtx_lock(&softc->fha_hash[i].mtx);
if (LIST_EMPTY(&softc->fha_hash[i].list)) {
mtx_unlock(&softc->fha_hash[i].mtx);
continue;
}
sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
first = TRUE;
LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
sbuf_printf(&sb, "%sfhe %p: {\n", first ? " " : ", ", fhe);
sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh);
sbuf_printf(&sb, " num_rw/exclusive: %d/%d\n",
fhe->num_rw, fhe->num_exclusive);
sbuf_printf(&sb, " num_threads: %d\n", fhe->num_threads);
LIST_FOREACH(thread, &fhe->threads, st_alink) {
sbuf_printf(&sb, " thread %p offset %ju "
"reqs %d\n", thread,
thread->st_p3, thread->st_p2);
}
sbuf_printf(&sb, " }");
first = FALSE;
}
sbuf_printf(&sb, "\n}");
mtx_unlock(&softc->fha_hash[i].mtx);
hfirst = FALSE;
}
out:
sbuf_trim(&sb);
sbuf_finish(&sb);
error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
sbuf_delete(&sb);
return (error);
}
Index: head/sys/nlm/nlm_prot_impl.c
===================================================================
--- head/sys/nlm/nlm_prot_impl.c (revision 327172)
+++ head/sys/nlm/nlm_prot_impl.c (revision 327173)
@@ -1,2437 +1,2433 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#if __FreeBSD_version >= 700000
#include <sys/priv.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_lock.h>
#include <nlm/nlm_prot.h>
#include <nlm/sm_inter.h>
#include <nlm/nlm.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcb_prot.h>
MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager");
/*
* If a host is inactive (and holds no locks) for this amount of
* seconds, we consider it idle and stop tracking it.
*/
#define NLM_IDLE_TIMEOUT 30
/*
* We check the host list for idle hosts every few seconds.
*/
#define NLM_IDLE_PERIOD 5
/*
* We only look for GRANTED_RES messages for a little while.
*/
#define NLM_EXPIRE_TIMEOUT 10
/*
* Support for sysctl vfs.nlm.sysid
*/
static SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW, NULL,
"Network Lock Manager");
static SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid, CTLFLAG_RW, NULL, "");
/*
* Syscall hooks
*/
static int nlm_syscall_offset = SYS_nlm_syscall;
static struct sysent nlm_syscall_prev_sysent;
#if __FreeBSD_version < 700000
static struct sysent nlm_syscall_sysent = {
(sizeof(struct nlm_syscall_args) / sizeof(register_t)) | SYF_MPSAFE,
(sy_call_t *) nlm_syscall
};
#else
MAKE_SYSENT(nlm_syscall);
#endif
static bool_t nlm_syscall_registered = FALSE;
/*
* Debug level passed in from userland. We also support a sysctl hook
* so that it can be changed on a live system.
*/
static int nlm_debug_level;
SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, "");
#define NLM_DEBUG(_level, args...) \
do { \
if (nlm_debug_level >= (_level)) \
log(LOG_DEBUG, args); \
} while(0)
#define NLM_ERR(args...) \
do { \
log(LOG_ERR, args); \
} while(0)
/*
* Grace period handling. The value of nlm_grace_threshold is the
* value of time_uptime after which we are serving requests normally.
*/
static time_t nlm_grace_threshold;
/*
* We check for idle hosts if time_uptime is greater than
* nlm_next_idle_check.
*/
static time_t nlm_next_idle_check;
/*
* A flag to indicate the server is already running.
*/
static int nlm_is_running;
/*
* A socket to use for RPC - shared by all IPv4 RPC clients.
*/
static struct socket *nlm_socket;
#ifdef INET6
/*
* A socket to use for RPC - shared by all IPv6 RPC clients.
*/
static struct socket *nlm_socket6;
#endif
/*
* An RPC client handle that can be used to communicate with the local
* NSM.
*/
static CLIENT *nlm_nsm;
/*
* An AUTH handle for the server's creds.
*/
static AUTH *nlm_auth;
/*
* A zero timeval for sending async RPC messages.
*/
struct timeval nlm_zero_tv = { 0, 0 };
/*
* The local NSM state number
*/
int nlm_nsm_state;
/*
* A lock to protect the host list and waiting lock list.
*/
static struct mtx nlm_global_lock;
/*
* Locks:
* (l) locked by nh_lock
* (s) only accessed via server RPC which is single threaded
* (g) locked by nlm_global_lock
* (c) const until freeing
* (a) modified using atomic ops
*/
/*
* A pending client-side lock request, stored on the nlm_waiting_locks
* list.
*/
struct nlm_waiting_lock {
TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */
bool_t nw_waiting; /* (g) */
nlm4_lock nw_lock; /* (c) */
union nfsfh nw_fh; /* (c) */
struct vnode *nw_vp; /* (c) */
};
TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock);
struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */
/*
* A pending server-side asynchronous lock request, stored on the
* nh_pending list of the NLM host.
*/
struct nlm_async_lock {
TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */
struct task af_task; /* (c) async callback details */
void *af_cookie; /* (l) lock manager cancel token */
struct vnode *af_vp; /* (l) vnode to lock */
struct flock af_fl; /* (c) lock details */
struct nlm_host *af_host; /* (c) host which is locking */
CLIENT *af_rpc; /* (c) rpc client to send message */
nlm4_testargs af_granted; /* (c) notification details */
time_t af_expiretime; /* (c) notification time */
};
TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock);
/*
* NLM host.
*/
enum nlm_host_state {
NLM_UNMONITORED,
NLM_MONITORED,
NLM_MONITOR_FAILED,
NLM_RECOVERING
};
struct nlm_rpc {
CLIENT *nr_client; /* (l) RPC client handle */
time_t nr_create_time; /* (l) when client was created */
};
struct nlm_host {
struct mtx nh_lock;
volatile u_int nh_refs; /* (a) reference count */
TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */
char nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */
uint32_t nh_sysid; /* (c) our allocated system ID */
char nh_sysid_string[10]; /* (c) string rep. of sysid */
struct sockaddr_storage nh_addr; /* (s) remote address of host */
struct nlm_rpc nh_srvrpc; /* (l) RPC for server replies */
struct nlm_rpc nh_clntrpc; /* (l) RPC for client requests */
rpcvers_t nh_vers; /* (s) NLM version of host */
int nh_state; /* (s) last seen NSM state of host */
enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */
time_t nh_idle_timeout; /* (s) Time at which host is idle */
struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */
uint32_t nh_grantcookie; /* (l) grant cookie counter */
struct nlm_async_lock_list nh_pending; /* (l) pending async locks */
struct nlm_async_lock_list nh_granted; /* (l) granted locks */
struct nlm_async_lock_list nh_finished; /* (l) finished async locks */
};
TAILQ_HEAD(nlm_host_list, nlm_host);
static struct nlm_host_list nlm_hosts; /* (g) */
static uint32_t nlm_next_sysid = 1; /* (g) */
static void nlm_host_unmonitor(struct nlm_host *);
struct nlm_grantcookie {
uint32_t ng_sysid;
uint32_t ng_cookie;
};
static inline uint32_t
ng_sysid(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid;
}
static inline uint32_t
ng_cookie(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie;
}
/**********************************************************************/
/*
* Initialise NLM globals.
*/
static void
nlm_init(void *dummy)
{
int error;
mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF);
TAILQ_INIT(&nlm_waiting_locks);
TAILQ_INIT(&nlm_hosts);
error = syscall_register(&nlm_syscall_offset, &nlm_syscall_sysent,
&nlm_syscall_prev_sysent, SY_THR_STATIC_KLD);
if (error)
NLM_ERR("Can't register NLM syscall\n");
else
nlm_syscall_registered = TRUE;
}
SYSINIT(nlm_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_init, NULL);
static void
nlm_uninit(void *dummy)
{
if (nlm_syscall_registered)
syscall_deregister(&nlm_syscall_offset,
&nlm_syscall_prev_sysent);
}
SYSUNINIT(nlm_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_uninit, NULL);
/*
* Create a netobj from an arbitrary source.
*/
void
nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize,
struct malloc_type *type)
{
dst->n_len = srcsize;
dst->n_bytes = malloc(srcsize, type, M_WAITOK);
memcpy(dst->n_bytes, src, srcsize);
}
/*
* Copy a struct netobj.
*/
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src,
struct malloc_type *type)
{
nlm_make_netobj(dst, src->n_bytes, src->n_len, type);
}
/*
* Create an RPC client handle for the given (address,prog,vers)
* triple using UDP.
*/
static CLIENT *
nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers)
{
char *wchan = "nlmrcv";
- const char* protofmly;
struct sockaddr_storage ss;
struct socket *so;
CLIENT *rpcb;
struct timeval timo;
RPCB parms;
char *uaddr;
enum clnt_stat stat = RPC_SUCCESS;
int rpcvers = RPCBVERS4;
bool_t do_tcp = FALSE;
bool_t tryagain = FALSE;
struct portmap mapping;
u_short port = 0;
/*
* First we need to contact the remote RPCBIND service to find
* the right port.
*/
memcpy(&ss, sa, sa->sa_len);
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port = htons(111);
- protofmly = "inet";
so = nlm_socket;
break;
-
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port = htons(111);
- protofmly = "inet6";
so = nlm_socket6;
break;
#endif
default:
/*
* Unsupported address family - fail.
*/
return (NULL);
}
rpcb = clnt_dg_create(so, (struct sockaddr *)&ss,
RPCBPROG, rpcvers, 0, 0);
if (!rpcb)
return (NULL);
try_tcp:
parms.r_prog = prog;
parms.r_vers = vers;
if (do_tcp)
parms.r_netid = "tcp";
else
parms.r_netid = "udp";
parms.r_addr = "";
parms.r_owner = "";
/*
* Use the default timeout.
*/
timo.tv_sec = 25;
timo.tv_usec = 0;
again:
switch (rpcvers) {
case RPCBVERS4:
case RPCBVERS:
/*
* Try RPCBIND 4 then 3.
*/
uaddr = NULL;
stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR,
(xdrproc_t) xdr_rpcb, &parms,
(xdrproc_t) xdr_wrapstring, &uaddr, timo);
if (stat == RPC_SUCCESS) {
/*
* We have a reply from the remote RPCBIND - turn it
* into an appropriate address and make a new client
* that can talk to the remote NLM.
*
* XXX fixup IPv6 scope ID.
*/
struct netbuf *a;
a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr);
if (!a) {
tryagain = TRUE;
} else {
tryagain = FALSE;
memcpy(&ss, a->buf, a->len);
free(a->buf, M_RPC);
free(a, M_RPC);
xdr_free((xdrproc_t) xdr_wrapstring, &uaddr);
}
}
if (tryagain || stat == RPC_PROGVERSMISMATCH) {
if (rpcvers == RPCBVERS4)
rpcvers = RPCBVERS;
else if (rpcvers == RPCBVERS)
rpcvers = PMAPVERS;
CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers);
goto again;
}
break;
case PMAPVERS:
/*
* Try portmap.
*/
mapping.pm_prog = parms.r_prog;
mapping.pm_vers = parms.r_vers;
mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP;
mapping.pm_port = 0;
stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT,
(xdrproc_t) xdr_portmap, &mapping,
(xdrproc_t) xdr_u_short, &port, timo);
if (stat == RPC_SUCCESS) {
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port =
htons(port);
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port =
htons(port);
break;
#endif
}
}
break;
default:
panic("invalid rpcvers %d", rpcvers);
}
/*
* We may have a positive response from the portmapper, but the NLM
* service was not found. Make sure we received a valid port.
*/
switch (ss.ss_family) {
case AF_INET:
port = ((struct sockaddr_in *)&ss)->sin_port;
break;
#ifdef INET6
case AF_INET6:
port = ((struct sockaddr_in6 *)&ss)->sin6_port;
break;
#endif
}
if (stat != RPC_SUCCESS || !port) {
/*
* If we were able to talk to rpcbind or portmap, but the udp
* variant wasn't available, ask about tcp.
*
* XXX - We could also check for a TCP portmapper, but
* if the host is running a portmapper at all, we should be able
* to hail it over UDP.
*/
if (stat == RPC_SUCCESS && !do_tcp) {
do_tcp = TRUE;
goto try_tcp;
}
/* Otherwise, bad news. */
NLM_ERR("NLM: failed to contact remote rpcbind, "
"stat = %d, port = %d\n", (int) stat, port);
CLNT_DESTROY(rpcb);
return (NULL);
}
if (do_tcp) {
/*
* Destroy the UDP client we used to speak to rpcbind and
* recreate as a TCP client.
*/
struct netconfig *nconf = NULL;
CLNT_DESTROY(rpcb);
switch (ss.ss_family) {
case AF_INET:
nconf = getnetconfigent("tcp");
break;
#ifdef INET6
case AF_INET6:
nconf = getnetconfigent("tcp6");
break;
#endif
}
rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss,
prog, vers, 0, 0);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
} else {
/*
* Re-use the client we used to speak to rpcbind.
*/
CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss);
CLNT_CONTROL(rpcb, CLSET_PROG, &prog);
CLNT_CONTROL(rpcb, CLSET_VERS, &vers);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
}
return (rpcb);
}
/*
* This async callback is invoked when an async lock request has been
* granted. We notify the host which initiated the request.
*/
static void
nlm_lock_callback(void *arg, int pending)
{
struct nlm_async_lock *af = (struct nlm_async_lock *) arg;
struct rpc_callextra ext;
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
/*
* Send the results back to the host.
*
* Note: there is a possible race here with nlm_host_notify
* destroying the RPC client. To avoid problems, the first
* thing nlm_host_notify does is to cancel pending async lock
* requests.
*/
memset(&ext, 0, sizeof(ext));
ext.rc_auth = nlm_auth;
if (af->af_host->nh_vers == NLM_VERS4) {
nlm4_granted_msg_4(&af->af_granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
} else {
/*
* Back-convert to legacy protocol
*/
nlm_testargs granted;
granted.cookie = af->af_granted.cookie;
granted.exclusive = af->af_granted.exclusive;
granted.alock.caller_name =
af->af_granted.alock.caller_name;
granted.alock.fh = af->af_granted.alock.fh;
granted.alock.oh = af->af_granted.alock.oh;
granted.alock.svid = af->af_granted.alock.svid;
granted.alock.l_offset =
af->af_granted.alock.l_offset;
granted.alock.l_len =
af->af_granted.alock.l_len;
nlm_granted_msg_1(&granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
}
/*
* Move this entry to the nh_granted list.
*/
af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT;
mtx_lock(&af->af_host->nh_lock);
TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link);
TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link);
mtx_unlock(&af->af_host->nh_lock);
}
/*
* Free an async lock request. The request must have been removed from
* any list.
*/
static void
nlm_free_async_lock(struct nlm_async_lock *af)
{
/*
* Free an async lock.
*/
if (af->af_rpc)
CLNT_RELEASE(af->af_rpc);
xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted);
if (af->af_vp)
vrele(af->af_vp);
free(af, M_NLM);
}
/*
* Cancel our async request - this must be called with
* af->af_host->nh_lock held. This is slightly complicated by a
* potential race with our own callback. If we fail to cancel the
* lock, it must already have been granted - we make sure our async
* task has completed by calling taskqueue_drain in this case.
*/
static int
nlm_cancel_async_lock(struct nlm_async_lock *af)
{
struct nlm_host *host = af->af_host;
int error;
mtx_assert(&host->nh_lock, MA_OWNED);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl,
F_REMOTE, NULL, &af->af_cookie);
if (error) {
/*
* We failed to cancel - make sure our callback has
* completed before we continue.
*/
taskqueue_drain(taskqueue_thread, &af->af_task);
}
mtx_lock(&host->nh_lock);
if (!error) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) "
"cancelled\n", af, host->nh_caller_name, host->nh_sysid);
/*
* Remove from the nh_pending list and free now that
* we are safe from the callback.
*/
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
return (error);
}
static void
nlm_check_expired_locks(struct nlm_host *host)
{
struct nlm_async_lock *af;
time_t uptime = time_uptime;
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL
&& uptime >= af->af_expiretime) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) {
TAILQ_REMOVE(&host->nh_finished, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
mtx_unlock(&host->nh_lock);
}
/*
* Free resources used by a host. This is called after the reference
* count has reached zero so it doesn't need to worry about locks.
*/
static void
nlm_host_destroy(struct nlm_host *host)
{
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
if (host->nh_srvrpc.nr_client)
CLNT_RELEASE(host->nh_srvrpc.nr_client);
if (host->nh_clntrpc.nr_client)
CLNT_RELEASE(host->nh_clntrpc.nr_client);
mtx_destroy(&host->nh_lock);
sysctl_ctx_free(&host->nh_sysctl);
free(host, M_NLM);
}
/*
* Thread start callback for client lock recovery
*/
static void
nlm_client_recovery_start(void *arg)
{
struct nlm_host *host = (struct nlm_host *) arg;
NLM_DEBUG(1, "NLM: client lock recovery for %s started\n",
host->nh_caller_name);
nlm_client_recovery(host);
NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n",
host->nh_caller_name);
host->nh_monstate = NLM_MONITORED;
nlm_host_release(host);
kthread_exit();
}
/*
* This is called when we receive a host state change notification. We
* unlock any active locks owned by the host. When rpc.lockd is
* shutting down, this function is called with newstate set to zero
* which allows us to cancel any pending async locks and clear the
* locking state.
*/
static void
nlm_host_notify(struct nlm_host *host, int newstate)
{
struct nlm_async_lock *af;
if (newstate) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new "
"state is %d\n", host->nh_caller_name,
host->nh_sysid, newstate);
}
/*
* Cancel any pending async locks for this host.
*/
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) {
/*
* nlm_cancel_async_lock will remove the entry from
* nh_pending and free it.
*/
nlm_cancel_async_lock(af);
}
mtx_unlock(&host->nh_lock);
nlm_check_expired_locks(host);
/*
* The host just rebooted - trash its locks.
*/
lf_clearremotesys(host->nh_sysid);
host->nh_state = newstate;
/*
* If we have any remote locks for this host (i.e. it
* represents a remote NFS server that our local NFS client
* has locks for), start a recovery thread.
*/
if (newstate != 0
&& host->nh_monstate != NLM_RECOVERING
&& lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) {
struct thread *td;
host->nh_monstate = NLM_RECOVERING;
refcount_acquire(&host->nh_refs);
kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0,
"NFS lock recovery for %s", host->nh_caller_name);
}
}
/*
* Sysctl handler to count the number of locks for a sysid.
*/
static int
nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Sysctl handler to count the number of client locks for a sysid.
*/
static int
nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Create a new NLM host.
*/
static struct nlm_host *
nlm_create_host(const char* caller_name)
{
struct nlm_host *host;
struct sysctl_oid *oid;
mtx_assert(&nlm_global_lock, MA_OWNED);
NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n",
caller_name, nlm_next_sysid);
host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO);
if (!host)
return (NULL);
mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF);
host->nh_refs = 1;
strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN);
host->nh_sysid = nlm_next_sysid++;
snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string),
"%d", host->nh_sysid);
host->nh_vers = 0;
host->nh_state = 0;
host->nh_monstate = NLM_UNMONITORED;
host->nh_grantcookie = 1;
TAILQ_INIT(&host->nh_pending);
TAILQ_INIT(&host->nh_granted);
TAILQ_INIT(&host->nh_finished);
TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
sysctl_ctx_init(&host->nh_sysctl);
oid = SYSCTL_ADD_NODE(&host->nh_sysctl,
SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid),
OID_AUTO, host->nh_sysid_string, CTLFLAG_RD, NULL, "");
SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"hostname", CTLFLAG_RD, host->nh_caller_name, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"version", CTLFLAG_RD, &host->nh_vers, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"monitored", CTLFLAG_RD, &host->nh_monstate, 0, "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_lock_count_sysctl, "I", "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"client_lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_client_lock_count_sysctl, "I", "");
mtx_lock(&nlm_global_lock);
return (host);
}
/*
* Acquire the next sysid for remote locks not handled by the NLM.
*/
uint32_t
nlm_acquire_next_sysid(void)
{
uint32_t next_sysid;
mtx_lock(&nlm_global_lock);
next_sysid = nlm_next_sysid++;
mtx_unlock(&nlm_global_lock);
return (next_sysid);
}
/*
* Return non-zero if the address parts of the two sockaddrs are the
* same.
*/
static int
nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b)
{
const struct sockaddr_in *a4, *b4;
#ifdef INET6
const struct sockaddr_in6 *a6, *b6;
#endif
if (a->sa_family != b->sa_family)
return (FALSE);
switch (a->sa_family) {
case AF_INET:
a4 = (const struct sockaddr_in *) a;
b4 = (const struct sockaddr_in *) b;
return !memcmp(&a4->sin_addr, &b4->sin_addr,
sizeof(a4->sin_addr));
#ifdef INET6
case AF_INET6:
a6 = (const struct sockaddr_in6 *) a;
b6 = (const struct sockaddr_in6 *) b;
return !memcmp(&a6->sin6_addr, &b6->sin6_addr,
sizeof(a6->sin6_addr));
#endif
}
return (0);
}
/*
* Check for idle hosts and stop monitoring them. We could also free
* the host structure here, possibly after a larger timeout but that
* would require some care to avoid races with
* e.g. nlm_host_lock_count_sysctl.
*/
static void
nlm_check_idle(void)
{
struct nlm_host *host;
mtx_assert(&nlm_global_lock, MA_OWNED);
if (time_uptime <= nlm_next_idle_check)
return;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_monstate == NLM_MONITORED
&& time_uptime > host->nh_idle_timeout) {
mtx_unlock(&nlm_global_lock);
if (lf_countlocks(host->nh_sysid) > 0
|| lf_countlocks(NLM_SYSID_CLIENT
+ host->nh_sysid)) {
host->nh_idle_timeout =
time_uptime + NLM_IDLE_TIMEOUT;
mtx_lock(&nlm_global_lock);
continue;
}
nlm_host_unmonitor(host);
mtx_lock(&nlm_global_lock);
}
}
}
/*
* Search for an existing NLM host that matches the given name
* (typically the caller_name element of an nlm4_lock). If none is
* found, create a new host. If 'addr' is non-NULL, record the remote
* address of the host so that we can call it back for async
* responses. If 'vers' is greater than zero then record the NLM
* program version to use to communicate with this client.
*/
struct nlm_host *
nlm_find_host_by_name(const char *name, const struct sockaddr *addr,
rpcvers_t vers)
{
struct nlm_host *host;
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (!strcmp(host->nh_caller_name, name))
break;
}
if (!host) {
host = nlm_create_host(name);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
/*
* If we have an address for the host, record it so that we
* can send async replies etc.
*/
if (addr) {
KASSERT(addr->sa_len < sizeof(struct sockaddr_storage),
("Strange remote transport address length"));
/*
* If we have seen an address before and we currently
* have an RPC client handle, make sure the address is
* the same, otherwise discard the client handle.
*/
if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) {
if (!nlm_compare_addr(
(struct sockaddr *) &host->nh_addr,
addr)
|| host->nh_vers != vers) {
CLIENT *client;
mtx_lock(&host->nh_lock);
client = host->nh_srvrpc.nr_client;
host->nh_srvrpc.nr_client = NULL;
mtx_unlock(&host->nh_lock);
if (client) {
CLNT_RELEASE(client);
}
}
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Search for an existing NLM host that matches the given remote
* address. If none is found, create a new host with the requested
* address and remember 'vers' as the NLM protocol version to use for
* that host.
*/
struct nlm_host *
nlm_find_host_by_addr(const struct sockaddr *addr, int vers)
{
/*
* Fake up a name using inet_ntop. This buffer is
* large enough for an IPv6 address.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
struct nlm_host *host;
switch (addr->sa_family) {
case AF_INET:
inet_ntop(AF_INET,
&((const struct sockaddr_in *) addr)->sin_addr,
tmp, sizeof tmp);
break;
#ifdef INET6
case AF_INET6:
inet_ntop(AF_INET6,
&((const struct sockaddr_in6 *) addr)->sin6_addr,
tmp, sizeof tmp);
break;
#endif
default:
strlcpy(tmp, "<unknown>", sizeof(tmp));
}
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by its address.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (nlm_compare_addr(addr,
(const struct sockaddr *) &host->nh_addr))
break;
}
if (!host) {
host = nlm_create_host(tmp);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Find the NLM host that matches the value of 'sysid'. If none
* exists, return NULL.
*/
static struct nlm_host *
nlm_find_host_by_sysid(int sysid)
{
struct nlm_host *host;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_sysid == sysid) {
refcount_acquire(&host->nh_refs);
return (host);
}
}
return (NULL);
}
void
nlm_host_release(struct nlm_host *host)
{
if (refcount_release(&host->nh_refs)) {
/*
* Free the host
*/
nlm_host_destroy(host);
}
}
/*
* Unregister this NLM host with the local NSM due to idleness.
*/
static void
nlm_host_unmonitor(struct nlm_host *host)
{
mon_id smmonid;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* Build the same mon_id we used when we first monitored this host
* so that the local NSM can find and remove its entry.
*/
smmonid.mon_name = host->nh_caller_name;
smmonid.my_id.my_name = "localhost";
smmonid.my_id.my_prog = NLM_PROG;
smmonid.my_id.my_vers = NLM_SM;
smmonid.my_id.my_proc = NLM_SM_NOTIFY;
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON,
(xdrproc_t) xdr_mon, &smmonid,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to unmonitor %s\n",
host->nh_caller_name);
return;
}
host->nh_monstate = NLM_UNMONITORED;
}
/*
* Register this NLM host with the local NSM so that we can be
* notified if it reboots.
*/
void
nlm_host_monitor(struct nlm_host *host, int state)
{
mon smmon;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
if (state && !host->nh_state) {
/*
* This is the first time we have seen an NSM state
* value for this host. We record it here to help
* detect host reboots.
*/
host->nh_state = state;
NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n",
host->nh_caller_name, host->nh_sysid, state);
}
mtx_lock(&host->nh_lock);
if (host->nh_monstate != NLM_UNMONITORED) {
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
mtx_unlock(&host->nh_lock);
NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* We put our assigned system ID value in the priv field to
* make it simpler to find the host if we are notified of a
* host restart.
*/
smmon.mon_id.mon_name = host->nh_caller_name;
smmon.mon_id.my_id.my_name = "localhost";
smmon.mon_id.my_id.my_prog = NLM_PROG;
smmon.mon_id.my_id.my_vers = NLM_SM;
smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_MON,
(xdrproc_t) xdr_mon, &smmon,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to monitor %s\n",
host->nh_caller_name);
mtx_lock(&host->nh_lock);
host->nh_monstate = NLM_MONITOR_FAILED;
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
}
/*
* Return an RPC client handle that can be used to talk to the NLM
* running on the given host.
*/
CLIENT *
nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
{
struct nlm_rpc *rpc;
CLIENT *client;
mtx_lock(&host->nh_lock);
if (isserver)
rpc = &host->nh_srvrpc;
else
rpc = &host->nh_clntrpc;
/*
* We can't hold onto RPC handles for too long - the async
* call/reply protocol used by some NLM clients makes it hard
* to tell when they change port numbers (e.g. after a
* reboot). Note that if a client reboots while it isn't
* holding any locks, it won't bother to notify us. We
* expire the RPC handles after two minutes.
*/
if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
client = rpc->nr_client;
rpc->nr_client = NULL;
mtx_unlock(&host->nh_lock);
CLNT_RELEASE(client);
mtx_lock(&host->nh_lock);
}
if (!rpc->nr_client) {
mtx_unlock(&host->nh_lock);
client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
NLM_PROG, host->nh_vers);
mtx_lock(&host->nh_lock);
if (client) {
if (rpc->nr_client) {
mtx_unlock(&host->nh_lock);
CLNT_DESTROY(client);
mtx_lock(&host->nh_lock);
} else {
rpc->nr_client = client;
rpc->nr_create_time = time_uptime;
}
}
}
client = rpc->nr_client;
if (client)
CLNT_ACQUIRE(client);
mtx_unlock(&host->nh_lock);
return (client);
}
int
nlm_host_get_sysid(struct nlm_host *host)
{
return (host->nh_sysid);
}
int
nlm_host_get_state(struct nlm_host *host)
{
return (host->nh_state);
}
void *
nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
{
struct nlm_waiting_lock *nw;
nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
nw->nw_lock = *lock;
memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len);
nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
nw->nw_waiting = TRUE;
nw->nw_vp = vp;
mtx_lock(&nlm_global_lock);
TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
return nw;
}
void
nlm_deregister_wait_lock(void *handle)
{
struct nlm_waiting_lock *nw = handle;
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
}
int
nlm_wait_lock(void *handle, int timo)
{
struct nlm_waiting_lock *nw = handle;
int error, stops_deferred;
/*
* If the granted message arrived before we got here,
* nw->nw_waiting will be FALSE - in that case, don't sleep.
*/
mtx_lock(&nlm_global_lock);
error = 0;
if (nw->nw_waiting) {
stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
sigallowstop(stops_deferred);
}
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
if (error) {
/*
* The granted message may arrive after the
* interrupt/timeout but before we manage to lock the
* mutex. Detect this by examining nw_waiting.
*/
if (!nw->nw_waiting)
error = 0;
} else {
/*
* If nlm_cancel_wait is called, then error will be
* zero but nw_waiting will still be TRUE. We
* translate this into EINTR.
*/
if (nw->nw_waiting)
error = EINTR;
}
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
return (error);
}
void
nlm_cancel_wait(struct vnode *vp)
{
struct nlm_waiting_lock *nw;
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (nw->nw_vp == vp) {
wakeup(nw);
}
}
mtx_unlock(&nlm_global_lock);
}
/**********************************************************************/
/*
* Syscall interface with userland.
*/
extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp);
static int
nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs)
{
static rpcvers_t versions[] = {
NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4
};
static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = {
nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4
};
SVCXPRT **xprts;
char netid[16];
char uaddr[128];
struct netconfig *nconf;
int i, j, error;
if (!addr_count) {
NLM_ERR("NLM: no service addresses given - can't start server");
return (EINVAL);
}
if (addr_count < 0 || addr_count > 256 ) {
NLM_ERR("NLM: too many service addresses (%d) given, "
"max 256 - can't start server\n", addr_count);
return (EINVAL);
}
xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO);
for (i = 0; i < nitems(versions); i++) {
for (j = 0; j < addr_count; j++) {
/*
* Create transports for the first version and
* then just register everything else to the
* same transports.
*/
if (i == 0) {
char *up;
error = copyin(&addrs[2*j], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, netid, sizeof(netid),
NULL);
if (error)
goto out;
error = copyin(&addrs[2*j+1], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, uaddr, sizeof(uaddr),
NULL);
if (error)
goto out;
nconf = getnetconfigent(netid);
if (!nconf) {
NLM_ERR("Can't lookup netid %s\n",
netid);
error = EINVAL;
goto out;
}
xprts[j] = svc_tp_create(pool, dispatchers[i],
NLM_PROG, versions[i], uaddr, nconf);
if (!xprts[j]) {
NLM_ERR("NLM: unable to create "
"(NLM_PROG, %d).\n", versions[i]);
error = EINVAL;
goto out;
}
freenetconfigent(nconf);
} else {
nconf = getnetconfigent(xprts[j]->xp_netid);
rpcb_unset(NLM_PROG, versions[i], nconf);
if (!svc_reg(xprts[j], NLM_PROG, versions[i],
dispatchers[i], nconf)) {
NLM_ERR("NLM: can't register "
"(NLM_PROG, %d)\n", versions[i]);
error = EINVAL;
goto out;
}
}
}
}
error = 0;
out:
for (j = 0; j < addr_count; j++) {
if (xprts[j])
SVC_RELEASE(xprts[j]);
}
free(xprts, M_NLM);
return (error);
}
/*
* Main server entry point. Contacts the local NSM to get its current
* state and send SM_UNMON_ALL. Registers the NLM services and then
* services requests. Does not return until the server is interrupted
* by a signal.
*/
static int
nlm_server_main(int addr_count, char **addrs)
{
struct thread *td = curthread;
int error;
SVCPOOL *pool = NULL;
struct sockopt opt;
int portlow;
#ifdef INET6
struct sockaddr_in6 sin6;
#endif
struct sockaddr_in sin;
my_id id;
sm_stat smstat;
struct timeval timo;
enum clnt_stat stat;
struct nlm_host *host, *nhost;
struct nlm_waiting_lock *nw;
vop_advlock_t *old_nfs_advlock;
vop_reclaim_t *old_nfs_reclaim;
if (nlm_is_running != 0) {
NLM_ERR("NLM: can't start server - "
"it appears to be running already\n");
return (EPERM);
}
if (nlm_socket == NULL) {
memset(&opt, 0, sizeof(opt));
error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv4 socket - error %d\n",
error);
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IP;
opt.sopt_name = IP_PORTRANGE;
portlow = IP_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket, &opt);
#ifdef INET6
nlm_socket6 = NULL;
error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv6 socket - error %d\n",
error);
soclose(nlm_socket);
nlm_socket = NULL;
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IPV6;
opt.sopt_name = IPV6_PORTRANGE;
portlow = IPV6_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket6, &opt);
#endif
}
nlm_auth = authunix_create(curthread->td_ucred);
#ifdef INET6
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = in6addr_loopback;
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS);
if (!nlm_nsm) {
#endif
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG,
SM_VERS);
#ifdef INET6
}
#endif
if (!nlm_nsm) {
NLM_ERR("Can't start NLM - unable to contact NSM\n");
error = EINVAL;
goto out;
}
pool = svcpool_create("NLM", NULL);
error = nlm_register_services(pool, addr_count, addrs);
if (error)
goto out;
memset(&id, 0, sizeof(id));
id.my_name = "NFS NLM";
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL,
(xdrproc_t) xdr_my_id, &id,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
struct rpc_err err;
CLNT_GETERR(nlm_nsm, &err);
NLM_ERR("NLM: unexpected error contacting NSM, "
"stat=%d, errno=%d\n", stat, err.re_errno);
error = EINVAL;
goto out;
}
nlm_is_running = 1;
NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state);
nlm_nsm_state = smstat.state;
old_nfs_advlock = nfs_advlock_p;
nfs_advlock_p = nlm_advlock;
old_nfs_reclaim = nfs_reclaim_p;
nfs_reclaim_p = nlm_reclaim;
svc_run(pool);
error = 0;
nfs_advlock_p = old_nfs_advlock;
nfs_reclaim_p = old_nfs_reclaim;
out:
nlm_is_running = 0;
if (pool)
svcpool_destroy(pool);
/*
* We are finished communicating with the NSM.
*/
if (nlm_nsm) {
CLNT_RELEASE(nlm_nsm);
nlm_nsm = NULL;
}
/*
* Trash all the existing state so that if the server
* restarts, it gets a clean slate. This is complicated by the
* possibility that there may be other threads trying to make
* client locking requests.
*
* First we fake a client reboot notification which will
* cancel any pending async locks and purge remote lock state
* from the local lock manager. We release the reference from
* nlm_hosts to the host (which may remove it from the list
* and free it). After this phase, the only entries in the
* nlm_host list should be from other threads performing
* client lock requests.
*/
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
wakeup(nw);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_unlock(&nlm_global_lock);
nlm_host_notify(host, 0);
nlm_host_release(host);
mtx_lock(&nlm_global_lock);
}
mtx_unlock(&nlm_global_lock);
AUTH_DESTROY(nlm_auth);
return (error);
}
int
sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
{
int error;
#if __FreeBSD_version >= 700000
error = priv_check(td, PRIV_NFS_LOCKD);
#else
error = suser(td);
#endif
if (error)
return (error);
nlm_debug_level = uap->debug_level;
nlm_grace_threshold = time_uptime + uap->grace_period;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
return nlm_server_main(uap->addr_count, uap->addrs);
}
/**********************************************************************/
/*
* NLM implementation details, called from the RPC stubs.
*/
void
nlm_sm_notify(struct nlm_sm_status *argp)
{
uint32_t sysid;
struct nlm_host *host;
NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name);
memcpy(&sysid, &argp->priv, sizeof(sysid));
host = nlm_find_host_by_sysid(sysid);
if (host) {
nlm_host_notify(host, argp->state);
nlm_host_release(host);
}
}
static void
nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p)
{
memcpy(fhp, p->n_bytes, sizeof(fhandle_t));
}
struct vfs_state {
struct mount *vs_mp;
struct vnode *vs_vp;
int vs_vnlocked;
};
static int
nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp,
fhandle_t *fhp, struct vfs_state *vs, accmode_t accmode)
{
int error, exflags;
struct ucred *cred = NULL, *credanon = NULL;
memset(vs, 0, sizeof(*vs));
vs->vs_mp = vfs_getvfs(&fhp->fh_fsid);
if (!vs->vs_mp) {
return (ESTALE);
}
/* accmode == 0 means don't check, since it is an unlock. */
if (accmode != 0) {
error = VFS_CHECKEXP(vs->vs_mp,
(struct sockaddr *)&host->nh_addr, &exflags, &credanon,
NULL, NULL);
if (error)
goto out;
if (exflags & MNT_EXRDONLY ||
(vs->vs_mp->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto out;
}
}
error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp);
if (error)
goto out;
vs->vs_vnlocked = TRUE;
if (accmode != 0) {
if (!svc_getcred(rqstp, &cred, NULL)) {
error = EINVAL;
goto out;
}
if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
crfree(cred);
cred = credanon;
credanon = NULL;
}
/*
* Check cred.
*/
error = VOP_ACCESS(vs->vs_vp, accmode, cred, curthread);
/*
* If this failed and accmode != VWRITE, try again with
* VWRITE to maintain backwards compatibility with the
* old code that always used VWRITE.
*/
if (error != 0 && accmode != VWRITE)
error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread);
if (error)
goto out;
}
#if __FreeBSD_version < 800011
VOP_UNLOCK(vs->vs_vp, 0, curthread);
#else
VOP_UNLOCK(vs->vs_vp, 0);
#endif
vs->vs_vnlocked = FALSE;
out:
if (cred)
crfree(cred);
if (credanon)
crfree(credanon);
return (error);
}
static void
nlm_release_vfs_state(struct vfs_state *vs)
{
if (vs->vs_vp) {
if (vs->vs_vnlocked)
vput(vs->vs_vp);
else
vrele(vs->vs_vp);
}
if (vs->vs_mp)
vfs_rel(vs->vs_mp);
}
static nlm4_stats
nlm_convert_error(int error)
{
if (error == ESTALE)
return nlm4_stale_fh;
else if (error == EROFS)
return nlm4_rofs;
else
return nlm4_failed;
}
int
nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host, *bhost;
int error, sysid;
struct flock fl;
accmode_t accmode;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
accmode = argp->exclusive ? VWRITE : VREAD;
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE);
if (error) {
result->stat.stat = nlm4_failed;
goto out;
}
if (fl.l_type == F_UNLCK) {
result->stat.stat = nlm4_granted;
} else {
result->stat.stat = nlm4_denied;
result->stat.nlm4_testrply_u.holder.exclusive =
(fl.l_type == F_WRLCK);
result->stat.nlm4_testrply_u.holder.svid = fl.l_pid;
bhost = nlm_find_host_by_sysid(fl.l_sysid);
if (bhost) {
/*
* We don't have any useful way of recording
* the value of oh used in the original lock
* request. Ideally, the test reply would have
* a space for the owning host's name allowing
* our caller's NLM to keep track.
*
* As far as I can see, Solaris uses an eight
* byte structure for oh which contains a four
* byte pid encoded in local byte order and
* the first four bytes of the host
* name. Linux uses a variable length string
* 'pid@hostname' in ascii but doesn't even
* return that in test replies.
*
* For the moment, return nothing in oh
* (already zero'ed above).
*/
nlm_host_release(bhost);
}
result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start;
result->stat.nlm4_testrply_u.holder.l_len = fl.l_len;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp,
bool_t monitor, CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
accmode_t accmode;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
if (monitor && host->nh_state && argp->state
&& host->nh_state != argp->state) {
/*
* The host rebooted without telling us. Trash its
* locks.
*/
nlm_host_notify(host, argp->state);
}
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold && !argp->reclaim) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
accmode = argp->exclusive ? VWRITE : VREAD;
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
if (argp->block) {
struct nlm_async_lock *af;
CLIENT *client;
struct nlm_grantcookie cookie;
/*
* First, make sure we can contact the host's NLM.
*/
client = nlm_host_get_rpc(host, TRUE);
if (!client) {
result->stat.stat = nlm4_failed;
goto out;
}
/*
* Next, check whether there is an existing blocked
* lock that matches. This could be a
* badly behaved client or an RPC re-send. If we find
* one, just return nlm4_blocked.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
cookie.ng_sysid = host->nh_sysid;
cookie.ng_cookie = host->nh_grantcookie++;
}
mtx_unlock(&host->nh_lock);
if (af) {
CLNT_RELEASE(client);
result->stat.stat = nlm4_blocked;
goto out;
}
af = malloc(sizeof(struct nlm_async_lock), M_NLM,
M_WAITOK|M_ZERO);
TASK_INIT(&af->af_task, 0, nlm_lock_callback, af);
af->af_vp = vs.vs_vp;
af->af_fl = fl;
af->af_host = host;
af->af_rpc = client;
/*
* We use M_RPC here so that we can xdr_free the thing
* later.
*/
nlm_make_netobj(&af->af_granted.cookie,
(caddr_t)&cookie, sizeof(cookie), M_RPC);
af->af_granted.exclusive = argp->exclusive;
af->af_granted.alock.caller_name =
strdup(argp->alock.caller_name, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.fh,
&argp->alock.fh, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.oh,
&argp->alock.oh, M_RPC);
af->af_granted.alock.svid = argp->alock.svid;
af->af_granted.alock.l_offset = argp->alock.l_offset;
af->af_granted.alock.l_len = argp->alock.l_len;
/*
* Put the entry on the pending list before calling
* VOP_ADVLOCKASYNC. We do this in case the lock
* request was blocked (returning EINPROGRESS) but
* then granted before we manage to run again. The
* client may receive the granted message before we
* send our blocked reply, but that's their problem.
*/
mtx_lock(&host->nh_lock);
TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE,
&af->af_task, &af->af_cookie);
/*
* If the lock completed synchronously, just free the
* tracking structure now.
*/
if (error != EINPROGRESS) {
CLNT_RELEASE(af->af_rpc);
mtx_lock(&host->nh_lock);
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
xdr_free((xdrproc_t) xdr_nlm4_testargs,
&af->af_granted);
free(af, M_NLM);
} else {
NLM_DEBUG(2, "NLM: pending async lock %p for %s "
"(sysid %d)\n", af, host->nh_caller_name, sysid);
/*
* Don't vrele the vnode just yet - this must
* wait until either the async callback
* happens or the lock is cancelled.
*/
vs.vs_vp = NULL;
}
} else {
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE);
}
if (error) {
if (error == EINPROGRESS) {
result->stat.stat = nlm4_blocked;
} else if (error == EDEADLK) {
result->stat.stat = nlm4_deadlck;
} else if (error == EAGAIN) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_failed;
}
} else {
if (monitor)
nlm_host_monitor(host, argp->state);
result->stat.stat = nlm4_granted;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
struct nlm_async_lock *af;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
/*
* First we need to try to find the async lock request - if
* there isn't one, we give up and return nlm4_denied.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
mtx_unlock(&host->nh_lock);
result->stat.stat = nlm4_denied;
goto out;
}
error = nlm_cancel_async_lock(af);
if (error) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_granted;
}
mtx_unlock(&host->nh_lock);
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE);
/*
* Ignore the error - there is no result code for failure,
* only for grace period.
*/
result->stat.stat = nlm4_granted;
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
struct nlm_host *host;
struct nlm_waiting_lock *nw;
memset(result, 0, sizeof(*result));
host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
result->stat.stat = nlm4_denied;
KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out);
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (!nw->nw_waiting)
continue;
if (argp->alock.svid == nw->nw_lock.svid
&& argp->alock.l_offset == nw->nw_lock.l_offset
&& argp->alock.l_len == nw->nw_lock.l_len
&& argp->alock.fh.n_len == nw->nw_lock.fh.n_len
&& !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len)) {
nw->nw_waiting = FALSE;
wakeup(nw);
result->stat.stat = nlm4_granted;
break;
}
}
mtx_unlock(&nlm_global_lock);
out:
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
void
nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp)
{
struct nlm_host *host = NULL;
struct nlm_async_lock *af = NULL;
int error;
if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) {
NLM_DEBUG(1, "NLM: bogus grant cookie");
goto out;
}
host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie));
if (!host) {
NLM_DEBUG(1, "NLM: Unknown host rejected our grant");
goto out;
}
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_granted, af_link)
if (ng_cookie(&argp->cookie) ==
ng_cookie(&af->af_granted.cookie))
break;
if (af)
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
if (!af) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant "
"with unrecognized cookie %d:%d", host->nh_caller_name,
host->nh_sysid, ng_sysid(&argp->cookie),
ng_cookie(&argp->cookie));
goto out;
}
if (argp->stat.stat != nlm4_granted) {
af->af_fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE);
if (error) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant "
"and we failed to unlock (%d)", host->nh_caller_name,
host->nh_sysid, error);
goto out;
}
NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
} else {
NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
}
out:
if (af)
nlm_free_async_lock(af);
if (host)
nlm_host_release(host);
}
void
nlm_do_free_all(nlm4_notify *argp)
{
struct nlm_host *host, *thost;
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) {
if (!strcmp(host->nh_caller_name, argp->name))
nlm_host_notify(host, argp->state);
}
}
/*
* Kernel module glue
*/
static int
nfslockd_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
return (0);
case MOD_UNLOAD:
/* The NLM module cannot be safely unloaded. */
/* FALLTHROUGH */
default:
return (EOPNOTSUPP);
}
}
static moduledata_t nfslockd_mod = {
"nfslockd",
nfslockd_modevent,
NULL,
};
DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_DEPEND(nfslockd, krpc, 1, 1, 1);
MODULE_DEPEND(nfslockd, nfslock, 1, 1, 1);
MODULE_VERSION(nfslockd, 1);
Index: head/sys/opencrypto/crypto.c
===================================================================
--- head/sys/opencrypto/crypto.c (revision 327172)
+++ head/sys/opencrypto/crypto.c (revision 327173)
@@ -1,1781 +1,1780 @@
/*-
* Copyright (c) 2002-2006 Sam Leffler. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Cryptographic Subsystem.
*
* This code is derived from the OpenBSD Cryptographic Framework (OCF)
* that has the copyright shown below. Very little of the original
* code remains.
*/
/*-
* The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
*
* This code was written by Angelos D. Keromytis in Athens, Greece, in
* February 2000. Network Security Technologies Inc. (NSTI) kindly
* supported the development of this code.
*
* Copyright (c) 2000, 2001 Angelos D. Keromytis
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all source code copies of any software which is or includes a copy or
* modification of this software.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#define CRYPTO_TIMING /* enable timing support */
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <ddb/ddb.h>
#include <vm/uma.h>
#include <crypto/intake.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/xform.h> /* XXX for M_XDATA */
#include <sys/kobj.h>
#include <sys/bus.h>
#include "cryptodev_if.h"
#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
#include <machine/pcb.h>
#endif
SDT_PROVIDER_DEFINE(opencrypto);
/*
* Crypto drivers register themselves by allocating a slot in the
* crypto_drivers table with crypto_get_driverid() and then registering
* each algorithm they support with crypto_register() and crypto_kregister().
*/
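/*
 * A typical driver attach sequence is roughly the sketch below; the
 * flag and algorithm constants are only illustrative, a real driver
 * passes whatever it actually supports:
 *
 *	int32_t id = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE);
 *	if (id >= 0)
 *		crypto_register(id, CRYPTO_AES_CBC, 0, 0);
 */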
static struct mtx crypto_drivers_mtx; /* lock on driver table */
#define CRYPTO_DRIVER_LOCK() mtx_lock(&crypto_drivers_mtx)
#define CRYPTO_DRIVER_UNLOCK() mtx_unlock(&crypto_drivers_mtx)
#define CRYPTO_DRIVER_ASSERT() mtx_assert(&crypto_drivers_mtx, MA_OWNED)
/*
* Crypto device/driver capabilities structure.
*
* Synchronization:
* (d) - protected by CRYPTO_DRIVER_LOCK()
* (q) - protected by CRYPTO_Q_LOCK()
* Not tagged fields are read-only.
*/
struct cryptocap {
device_t cc_dev; /* (d) device/driver */
u_int32_t cc_sessions; /* (d) # of sessions */
u_int32_t cc_koperations; /* (d) # of asym operations */
/*
* Largest possible operator length (in bits) for each type of
* encryption algorithm. XXX not used
*/
u_int16_t cc_max_op_len[CRYPTO_ALGORITHM_MAX + 1];
u_int8_t cc_alg[CRYPTO_ALGORITHM_MAX + 1];
u_int8_t cc_kalg[CRK_ALGORITHM_MAX + 1];
int cc_flags; /* (d) flags */
#define CRYPTOCAP_F_CLEANUP 0x80000000 /* needs resource cleanup */
int cc_qblocked; /* (q) symmetric q blocked */
int cc_kqblocked; /* (q) asymmetric q blocked */
};
static struct cryptocap *crypto_drivers = NULL;
static int crypto_drivers_num = 0;
/*
* There are two queues for crypto requests; one for symmetric (e.g.
* cipher) operations and one for asymmetric (e.g. MOD) operations.
* A single mutex is used to lock access to both queues. We could
* have one per-queue but having one simplifies handling of block/unblock
* operations.
*/
static int crp_sleep = 0;
static TAILQ_HEAD(cryptop_q ,cryptop) crp_q; /* request queues */
static TAILQ_HEAD(,cryptkop) crp_kq;
static struct mtx crypto_q_mtx;
#define CRYPTO_Q_LOCK() mtx_lock(&crypto_q_mtx)
#define CRYPTO_Q_UNLOCK() mtx_unlock(&crypto_q_mtx)
/*
* Taskqueue used to dispatch the crypto requests
* that have the CRYPTO_F_ASYNC flag
*/
static struct taskqueue *crypto_tq;
/*
* Crypto seq numbers are operated on with modular arithmetic
*/
#define CRYPTO_SEQ_GT(a,b) ((int)((a)-(b)) > 0)
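/*
 * The unsigned subtraction followed by a signed comparison handles
 * wrap-around: e.g. for a = 0x00000002 and b = 0xfffffffe the
 * difference is 4, so 'a' is treated as the more recent sequence
 * number even though it is numerically smaller.
 */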
struct crypto_ret_worker {
struct mtx crypto_ret_mtx;
TAILQ_HEAD(,cryptop) crp_ordered_ret_q; /* ordered callback queue for symmetric jobs */
TAILQ_HEAD(,cryptop) crp_ret_q; /* callback queue for symmetric jobs */
TAILQ_HEAD(,cryptkop) crp_ret_kq; /* callback queue for asym jobs */
u_int32_t reorder_ops; /* total ordered sym jobs received */
u_int32_t reorder_cur_seq; /* current sym job dispatched */
struct proc *cryptoretproc;
};
static struct crypto_ret_worker *crypto_ret_workers = NULL;
#define CRYPTO_RETW(i) (&crypto_ret_workers[i])
#define CRYPTO_RETW_ID(w) ((w) - crypto_ret_workers)
#define FOREACH_CRYPTO_RETW(w) \
for (w = crypto_ret_workers; w < crypto_ret_workers + crypto_workers_num; ++w)
#define CRYPTO_RETW_LOCK(w) mtx_lock(&w->crypto_ret_mtx)
#define CRYPTO_RETW_UNLOCK(w) mtx_unlock(&w->crypto_ret_mtx)
#define CRYPTO_RETW_EMPTY(w) \
(TAILQ_EMPTY(&w->crp_ret_q) && TAILQ_EMPTY(&w->crp_ret_kq) && TAILQ_EMPTY(&w->crp_ordered_ret_q))
static int crypto_workers_num = 0;
SYSCTL_INT(_kern, OID_AUTO, crypto_workers_num, CTLFLAG_RDTUN,
&crypto_workers_num, 0,
"Number of crypto workers used to dispatch crypto jobs");
static uma_zone_t cryptop_zone;
static uma_zone_t cryptodesc_zone;
int crypto_userasymcrypto = 1; /* userland may do asym crypto reqs */
SYSCTL_INT(_kern, OID_AUTO, userasymcrypto, CTLFLAG_RW,
&crypto_userasymcrypto, 0,
"Enable/disable user-mode access to asymmetric crypto support");
int crypto_devallowsoft = 0; /* only use hardware crypto */
SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RW,
&crypto_devallowsoft, 0,
"Enable/disable use of software crypto by /dev/crypto");
MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records");
static void crypto_proc(void);
static struct proc *cryptoproc;
static void crypto_ret_proc(struct crypto_ret_worker *ret_worker);
static void crypto_destroy(void);
static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint);
static int crypto_kinvoke(struct cryptkop *krp, int flags);
static void crypto_task_invoke(void *ctx, int pending);
static void crypto_batch_enqueue(struct cryptop *crp);
static struct cryptostats cryptostats;
SYSCTL_STRUCT(_kern, OID_AUTO, crypto_stats, CTLFLAG_RW, &cryptostats,
cryptostats, "Crypto system statistics");
#ifdef CRYPTO_TIMING
static int crypto_timing = 0;
SYSCTL_INT(_debug, OID_AUTO, crypto_timing, CTLFLAG_RW,
&crypto_timing, 0, "Enable/disable crypto timing support");
#endif
/* Try to avoid directly exposing the key buffer as a symbol */
static struct keybuf *keybuf;
static struct keybuf empty_keybuf = {
.kb_nents = 0
};
/* Obtain the key buffer from boot metadata */
static void
keybuf_init(void)
{
caddr_t kmdp;
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
keybuf = (struct keybuf *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_KEYBUF);
if (keybuf == NULL)
keybuf = &empty_keybuf;
}
/* It'd be nice if we could store these in some kind of secure memory... */
struct keybuf *
get_keybuf(void)
{
return (keybuf);
}
static int
crypto_init(void)
{
struct crypto_ret_worker *ret_worker;
int error;
mtx_init(&crypto_drivers_mtx, "crypto", "crypto driver table",
MTX_DEF|MTX_QUIET);
TAILQ_INIT(&crp_q);
TAILQ_INIT(&crp_kq);
mtx_init(&crypto_q_mtx, "crypto", "crypto op queues", MTX_DEF);
cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop),
0, 0, 0, 0,
UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc),
0, 0, 0, 0,
UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
if (cryptodesc_zone == NULL || cryptop_zone == NULL) {
printf("crypto_init: cannot setup crypto zones\n");
error = ENOMEM;
goto bad;
}
crypto_drivers_num = CRYPTO_DRIVERS_INITIAL;
crypto_drivers = malloc(crypto_drivers_num *
sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT | M_ZERO);
if (crypto_drivers == NULL) {
printf("crypto_init: cannot setup crypto drivers\n");
error = ENOMEM;
goto bad;
}
if (crypto_workers_num < 1 || crypto_workers_num > mp_ncpus)
crypto_workers_num = mp_ncpus;
crypto_tq = taskqueue_create("crypto", M_WAITOK|M_ZERO,
taskqueue_thread_enqueue, &crypto_tq);
if (crypto_tq == NULL) {
printf("crypto init: cannot setup crypto taskqueue\n");
error = ENOMEM;
goto bad;
}
taskqueue_start_threads(&crypto_tq, crypto_workers_num, PRI_MIN_KERN,
"crypto");
error = kproc_create((void (*)(void *)) crypto_proc, NULL,
&cryptoproc, 0, 0, "crypto");
if (error) {
printf("crypto_init: cannot start crypto thread; error %d",
error);
goto bad;
}
crypto_ret_workers = malloc(crypto_workers_num * sizeof(struct crypto_ret_worker),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (crypto_ret_workers == NULL) {
error = ENOMEM;
printf("crypto_init: cannot allocate ret workers\n");
goto bad;
}
FOREACH_CRYPTO_RETW(ret_worker) {
TAILQ_INIT(&ret_worker->crp_ordered_ret_q);
TAILQ_INIT(&ret_worker->crp_ret_q);
TAILQ_INIT(&ret_worker->crp_ret_kq);
ret_worker->reorder_ops = 0;
ret_worker->reorder_cur_seq = 0;
mtx_init(&ret_worker->crypto_ret_mtx, "crypto", "crypto return queues", MTX_DEF);
error = kproc_create((void (*)(void *)) crypto_ret_proc, ret_worker,
&ret_worker->cryptoretproc, 0, 0, "crypto returns %td", CRYPTO_RETW_ID(ret_worker));
if (error) {
printf("crypto_init: cannot start cryptoret thread; error %d",
error);
goto bad;
}
}
keybuf_init();
return 0;
bad:
crypto_destroy();
return error;
}
/*
* Signal a crypto thread to terminate. We use the driver
* table lock to synchronize the sleep/wakeups so that we
* are sure the threads have terminated before we release
* the data structures they use. See crypto_finis below
* for the other half of this song-and-dance.
*/
static void
crypto_terminate(struct proc **pp, void *q)
{
struct proc *p;
mtx_assert(&crypto_drivers_mtx, MA_OWNED);
p = *pp;
*pp = NULL;
if (p) {
wakeup_one(q);
PROC_LOCK(p); /* NB: ensure we don't miss wakeup */
CRYPTO_DRIVER_UNLOCK(); /* let crypto_finis progress */
msleep(p, &p->p_mtx, PWAIT, "crypto_destroy", 0);
PROC_UNLOCK(p);
CRYPTO_DRIVER_LOCK();
}
}
static void
crypto_destroy(void)
{
struct crypto_ret_worker *ret_worker;
/*
* Terminate any crypto threads.
*/
if (crypto_tq != NULL)
taskqueue_drain_all(crypto_tq);
CRYPTO_DRIVER_LOCK();
crypto_terminate(&cryptoproc, &crp_q);
FOREACH_CRYPTO_RETW(ret_worker)
crypto_terminate(&ret_worker->cryptoretproc, &ret_worker->crp_ret_q);
CRYPTO_DRIVER_UNLOCK();
/* XXX flush queues??? */
/*
* Reclaim dynamically allocated resources.
*/
if (crypto_drivers != NULL)
free(crypto_drivers, M_CRYPTO_DATA);
if (cryptodesc_zone != NULL)
uma_zdestroy(cryptodesc_zone);
if (cryptop_zone != NULL)
uma_zdestroy(cryptop_zone);
mtx_destroy(&crypto_q_mtx);
FOREACH_CRYPTO_RETW(ret_worker)
mtx_destroy(&ret_worker->crypto_ret_mtx);
free(crypto_ret_workers, M_CRYPTO_DATA);
if (crypto_tq != NULL)
taskqueue_free(crypto_tq);
mtx_destroy(&crypto_drivers_mtx);
}
static struct cryptocap *
crypto_checkdriver(u_int32_t hid)
{
if (crypto_drivers == NULL)
return NULL;
return (hid >= crypto_drivers_num ? NULL : &crypto_drivers[hid]);
}
/*
* Compare a driver's list of supported algorithms against another
* list; return non-zero if all algorithms are supported.
*/
static int
driver_suitable(const struct cryptocap *cap, const struct cryptoini *cri)
{
const struct cryptoini *cr;
/* See if all the algorithms are supported. */
for (cr = cri; cr; cr = cr->cri_next)
if (cap->cc_alg[cr->cri_alg] == 0)
return 0;
return 1;
}
/*
* Select a driver for a new session that supports the specified
* algorithms and, optionally, is constrained according to the flags.
* The algorithm we use here is pretty stupid; just use the
* first driver that supports all the algorithms we need. If there
* are multiple suitable drivers we choose the one with the fewest
* active sessions. We prefer hardware-backed drivers to software ones.
*
* XXX We need more smarts here (in real life too, but that's
* XXX another story altogether).
*/
static struct cryptocap *
crypto_select_driver(const struct cryptoini *cri, int flags)
{
struct cryptocap *cap, *best;
int match, hid;
CRYPTO_DRIVER_ASSERT();
/*
* Look first for hardware crypto devices if permitted.
*/
if (flags & CRYPTOCAP_F_HARDWARE)
match = CRYPTOCAP_F_HARDWARE;
else
match = CRYPTOCAP_F_SOFTWARE;
best = NULL;
again:
for (hid = 0; hid < crypto_drivers_num; hid++) {
cap = &crypto_drivers[hid];
/*
* If it's not initialized, is in the process of
* going away, or is not appropriate (hardware
* or software based on match), then skip.
*/
if (cap->cc_dev == NULL ||
(cap->cc_flags & CRYPTOCAP_F_CLEANUP) ||
(cap->cc_flags & match) == 0)
continue;
/* verify all the algorithms are supported. */
if (driver_suitable(cap, cri)) {
if (best == NULL ||
cap->cc_sessions < best->cc_sessions)
best = cap;
}
}
if (best == NULL && match == CRYPTOCAP_F_HARDWARE &&
(flags & CRYPTOCAP_F_SOFTWARE)) {
/* sort of an Algol 68-style for loop */
match = CRYPTOCAP_F_SOFTWARE;
goto again;
}
return best;
}
/*
* Create a new session. The crid argument specifies a crypto
* driver to use or constraints on a driver to select (hardware
* only, software only, either). Whatever driver is selected
* must be capable of the requested crypto algorithms.
*/
int
crypto_newsession(u_int64_t *sid, struct cryptoini *cri, int crid)
{
struct cryptocap *cap;
u_int32_t hid, lid;
int err;
CRYPTO_DRIVER_LOCK();
if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
/*
* Use specified driver; verify it is capable.
*/
cap = crypto_checkdriver(crid);
if (cap != NULL && !driver_suitable(cap, cri))
cap = NULL;
} else {
/*
* No requested driver; select based on crid flags.
*/
cap = crypto_select_driver(cri, crid);
/*
* if NULL then can't do everything in one session.
* XXX Fix this. We need to inject a "virtual" session
* XXX layer right about here.
*/
}
if (cap != NULL) {
/* Call the driver initialization routine. */
hid = cap - crypto_drivers;
lid = hid; /* Pass the driver ID. */
err = CRYPTODEV_NEWSESSION(cap->cc_dev, &lid, cri);
if (err == 0) {
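/*
 * Pack the 64-bit session id: the upper 32 bits carry the
 * driver capability byte (cc_flags & 0xff000000) and the
 * driver index (hid), the lower 32 bits the driver-local
 * session id (lid).  CRYPTO_SESID2HID()/CRYPTO_SESID2CAPS()
 * recover these fields on later requests.
 */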
(*sid) = (cap->cc_flags & 0xff000000)
| (hid & 0x00ffffff);
(*sid) <<= 32;
(*sid) |= (lid & 0xffffffff);
cap->cc_sessions++;
} else
CRYPTDEB("dev newsession failed");
} else {
CRYPTDEB("no driver");
err = EOPNOTSUPP;
}
CRYPTO_DRIVER_UNLOCK();
return err;
}
static void
crypto_remove(struct cryptocap *cap)
{
mtx_assert(&crypto_drivers_mtx, MA_OWNED);
if (cap->cc_sessions == 0 && cap->cc_koperations == 0)
bzero(cap, sizeof(*cap));
}
/*
* Delete an existing session (or a reserved session on an unregistered
* driver).
*/
int
crypto_freesession(u_int64_t sid)
{
struct cryptocap *cap;
u_int32_t hid;
int err;
CRYPTO_DRIVER_LOCK();
if (crypto_drivers == NULL) {
err = EINVAL;
goto done;
}
/* Determine two IDs. */
hid = CRYPTO_SESID2HID(sid);
if (hid >= crypto_drivers_num) {
err = ENOENT;
goto done;
}
cap = &crypto_drivers[hid];
if (cap->cc_sessions)
cap->cc_sessions--;
/* Call the driver cleanup routine, if available. */
err = CRYPTODEV_FREESESSION(cap->cc_dev, sid);
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP)
crypto_remove(cap);
done:
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Return an unused driver id. Used by drivers prior to registering
* support for the algorithms they handle.
*/
int32_t
crypto_get_driverid(device_t dev, int flags)
{
struct cryptocap *newdrv;
int i;
if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
printf("%s: no flags specified when registering driver\n",
device_get_nameunit(dev));
return -1;
}
CRYPTO_DRIVER_LOCK();
for (i = 0; i < crypto_drivers_num; i++) {
if (crypto_drivers[i].cc_dev == NULL &&
(crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP) == 0) {
break;
}
}
/* Out of entries, allocate some more. */
if (i == crypto_drivers_num) {
/* Be careful about wrap-around. */
if (2 * crypto_drivers_num <= crypto_drivers_num) {
CRYPTO_DRIVER_UNLOCK();
printf("crypto: driver count wraparound!\n");
return -1;
}
newdrv = malloc(2 * crypto_drivers_num *
sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (newdrv == NULL) {
CRYPTO_DRIVER_UNLOCK();
printf("crypto: no space to expand driver table!\n");
return -1;
}
bcopy(crypto_drivers, newdrv,
crypto_drivers_num * sizeof(struct cryptocap));
crypto_drivers_num *= 2;
free(crypto_drivers, M_CRYPTO_DATA);
crypto_drivers = newdrv;
}
/* NB: state is zero'd on free */
crypto_drivers[i].cc_sessions = 1; /* Mark */
crypto_drivers[i].cc_dev = dev;
crypto_drivers[i].cc_flags = flags;
if (bootverbose)
printf("crypto: assign %s driver id %u, flags 0x%x\n",
device_get_nameunit(dev), i, flags);
CRYPTO_DRIVER_UNLOCK();
return i;
}
/*
* Lookup a driver by name. We match against the full device
* name and unit, and against just the name. The latter gives
* us simple wildcarding by device name. On success return the
* driver/hardware identifier; otherwise return -1.
*/
int
crypto_find_driver(const char *match)
{
int i, len = strlen(match);
CRYPTO_DRIVER_LOCK();
for (i = 0; i < crypto_drivers_num; i++) {
device_t dev = crypto_drivers[i].cc_dev;
if (dev == NULL ||
(crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP))
continue;
if (strncmp(match, device_get_nameunit(dev), len) == 0 ||
strncmp(match, device_get_name(dev), len) == 0)
break;
}
CRYPTO_DRIVER_UNLOCK();
return i < crypto_drivers_num ? i : -1;
}
/*
* Return the device_t for the specified driver or NULL
* if the driver identifier is invalid.
*/
device_t
crypto_find_device_byhid(int hid)
{
struct cryptocap *cap = crypto_checkdriver(hid);
return cap != NULL ? cap->cc_dev : NULL;
}
/*
* Return the device/driver capabilities.
*/
int
crypto_getcaps(int hid)
{
struct cryptocap *cap = crypto_checkdriver(hid);
return cap != NULL ? cap->cc_flags : 0;
}
/*
* Register support for a key-related algorithm. This routine
* is called once for each algorithm supported by a driver.
*/
int
crypto_kregister(u_int32_t driverid, int kalg, u_int32_t flags)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL &&
(CRK_ALGORITM_MIN <= kalg && kalg <= CRK_ALGORITHM_MAX)) {
/*
* XXX Do some performance testing to determine placing.
* XXX We probably need an auxiliary data structure that
* XXX describes relative performances.
*/
cap->cc_kalg[kalg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
if (bootverbose)
printf("crypto: %s registers key alg %u flags %u\n"
, device_get_nameunit(cap->cc_dev)
, kalg
, flags
);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Register support for a non-key-related algorithm. This routine
* is called once for each such algorithm supported by a driver.
*/
int
crypto_register(u_int32_t driverid, int alg, u_int16_t maxoplen,
u_int32_t flags)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
/* NB: algorithms are in the range [1..max] */
if (cap != NULL &&
(CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX)) {
/*
* XXX Do some performance testing to determine placing.
* XXX We probably need an auxiliary data structure that
* XXX describes relative performances.
*/
cap->cc_alg[alg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
cap->cc_max_op_len[alg] = maxoplen;
if (bootverbose)
printf("crypto: %s registers alg %u flags %u maxoplen %u\n"
, device_get_nameunit(cap->cc_dev)
, alg
, flags
, maxoplen
);
cap->cc_sessions = 0; /* Unmark */
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
static void
driver_finis(struct cryptocap *cap)
{
u_int32_t ses, kops;
CRYPTO_DRIVER_ASSERT();
ses = cap->cc_sessions;
kops = cap->cc_koperations;
bzero(cap, sizeof(*cap));
if (ses != 0 || kops != 0) {
/*
* If there are pending sessions,
* just mark as invalid.
*/
cap->cc_flags |= CRYPTOCAP_F_CLEANUP;
cap->cc_sessions = ses;
cap->cc_koperations = kops;
}
}
/*
* Unregister a crypto driver. If there are pending sessions using it,
* leave enough information around so that subsequent calls using those
* sessions will correctly detect the driver has been unregistered and
* reroute requests.
*/
int
crypto_unregister(u_int32_t driverid, int alg)
{
struct cryptocap *cap;
int i, err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL &&
(CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX) &&
cap->cc_alg[alg] != 0) {
cap->cc_alg[alg] = 0;
cap->cc_max_op_len[alg] = 0;
/* Was this the last algorithm ? */
for (i = 1; i <= CRYPTO_ALGORITHM_MAX; i++)
if (cap->cc_alg[i] != 0)
break;
if (i == CRYPTO_ALGORITHM_MAX + 1)
driver_finis(cap);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Unregister all algorithms associated with a crypto driver.
* If there are pending sessions using it, leave enough information
* around so that subsequent calls using those sessions will
* correctly detect the driver has been unregistered and reroute
* requests.
*/
int
crypto_unregister_all(u_int32_t driverid)
{
struct cryptocap *cap;
int err;
CRYPTO_DRIVER_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL) {
driver_finis(cap);
err = 0;
} else
err = EINVAL;
CRYPTO_DRIVER_UNLOCK();
return err;
}
/*
* Clear blockage on a driver. The what parameter indicates whether
* the driver is now ready for cryptop's and/or cryptokop's.
*/
int
crypto_unblock(u_int32_t driverid, int what)
{
struct cryptocap *cap;
int err;
CRYPTO_Q_LOCK();
cap = crypto_checkdriver(driverid);
if (cap != NULL) {
if (what & CRYPTO_SYMQ)
cap->cc_qblocked = 0;
if (what & CRYPTO_ASYMQ)
cap->cc_kqblocked = 0;
if (crp_sleep)
wakeup_one(&crp_q);
err = 0;
} else
err = EINVAL;
CRYPTO_Q_UNLOCK();
return err;
}
/*
* Add a crypto request to a queue, to be processed by the kernel thread.
*/
int
crypto_dispatch(struct cryptop *crp)
{
struct cryptocap *cap;
u_int32_t hid;
int result;
cryptostats.cs_ops++;
#ifdef CRYPTO_TIMING
if (crypto_timing)
binuptime(&crp->crp_tstamp);
#endif
if (CRYPTOP_ASYNC(crp)) {
if (crp->crp_flags & CRYPTO_F_ASYNC_KEEPORDER) {
struct crypto_ret_worker *ret_worker;
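/*
 * Ordered requests are bound to a return worker derived from the
 * session id, so all ops of a session share one ordered queue,
 * and are tagged with a per-worker sequence number used later by
 * crypto_done() and crypto_ret_proc().
 */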
crp->crp_retw_id = crp->crp_sid % crypto_workers_num;
ret_worker = CRYPTO_RETW(crp->crp_retw_id);
CRYPTO_RETW_LOCK(ret_worker);
crp->crp_seq = ret_worker->reorder_ops++;
CRYPTO_RETW_UNLOCK(ret_worker);
}
TASK_INIT(&crp->crp_task, 0, crypto_task_invoke, crp);
taskqueue_enqueue(crypto_tq, &crp->crp_task);
return (0);
}
if ((crp->crp_flags & CRYPTO_F_BATCH) == 0) {
hid = CRYPTO_SESID2HID(crp->crp_sid);
/*
* Caller marked the request to be processed
* immediately; dispatch it directly to the
* driver unless the driver is currently blocked.
*/
cap = crypto_checkdriver(hid);
/* Driver cannot disappear while there is an active session. */
KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__));
if (!cap->cc_qblocked) {
result = crypto_invoke(cap, crp, 0);
if (result != ERESTART)
return (result);
/*
* The driver ran out of resources, put the request on
* the queue.
*/
}
}
crypto_batch_enqueue(crp);
return 0;
}
void
crypto_batch_enqueue(struct cryptop *crp)
{
CRYPTO_Q_LOCK();
TAILQ_INSERT_TAIL(&crp_q, crp, crp_next);
if (crp_sleep)
wakeup_one(&crp_q);
CRYPTO_Q_UNLOCK();
}
/*
* Add an asymmetric crypto request to a queue,
* to be processed by the kernel thread.
*/
int
crypto_kdispatch(struct cryptkop *krp)
{
int error;
cryptostats.cs_kops++;
error = crypto_kinvoke(krp, krp->krp_crid);
if (error == ERESTART) {
CRYPTO_Q_LOCK();
TAILQ_INSERT_TAIL(&crp_kq, krp, krp_next);
if (crp_sleep)
wakeup_one(&crp_q);
CRYPTO_Q_UNLOCK();
error = 0;
}
return error;
}
/*
* Verify a driver is suitable for the specified operation.
*/
static __inline int
kdriver_suitable(const struct cryptocap *cap, const struct cryptkop *krp)
{
return (cap->cc_kalg[krp->krp_op] & CRYPTO_ALG_FLAG_SUPPORTED) != 0;
}
/*
* Select a driver for an asym operation. The driver must
* support the necessary algorithm. The caller can constrain
* which device is selected with the flags parameter. The
* algorithm we use here is pretty stupid; just use the first
* driver that supports the algorithms we need. If there are
* multiple suitable drivers we choose the driver with the
* fewest active operations. We prefer hardware-backed
* drivers to software ones when either may be used.
*/
static struct cryptocap *
crypto_select_kdriver(const struct cryptkop *krp, int flags)
{
- struct cryptocap *cap, *best, *blocked;
+ struct cryptocap *cap, *best;
int match, hid;
CRYPTO_DRIVER_ASSERT();
/*
* Look first for hardware crypto devices if permitted.
*/
if (flags & CRYPTOCAP_F_HARDWARE)
match = CRYPTOCAP_F_HARDWARE;
else
match = CRYPTOCAP_F_SOFTWARE;
best = NULL;
- blocked = NULL;
again:
for (hid = 0; hid < crypto_drivers_num; hid++) {
cap = &crypto_drivers[hid];
/*
* If it's not initialized, is in the process of
* going away, or is not appropriate (hardware
* or software based on match), then skip.
*/
if (cap->cc_dev == NULL ||
(cap->cc_flags & CRYPTOCAP_F_CLEANUP) ||
(cap->cc_flags & match) == 0)
continue;
/* verify all the algorithms are supported. */
if (kdriver_suitable(cap, krp)) {
if (best == NULL ||
cap->cc_koperations < best->cc_koperations)
best = cap;
}
}
if (best != NULL)
return best;
if (match == CRYPTOCAP_F_HARDWARE && (flags & CRYPTOCAP_F_SOFTWARE)) {
/* sort of an Algol 68-style for loop */
match = CRYPTOCAP_F_SOFTWARE;
goto again;
}
return best;
}
/*
* Dispatch an asymmetric crypto request.
*/
static int
crypto_kinvoke(struct cryptkop *krp, int crid)
{
struct cryptocap *cap = NULL;
int error;
KASSERT(krp != NULL, ("%s: krp == NULL", __func__));
KASSERT(krp->krp_callback != NULL,
("%s: krp->crp_callback == NULL", __func__));
CRYPTO_DRIVER_LOCK();
if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
cap = crypto_checkdriver(crid);
if (cap != NULL) {
/*
* Driver present, it must support the necessary
* algorithm and, if s/w drivers are excluded,
* it must be registered as hardware-backed.
*/
if (!kdriver_suitable(cap, krp) ||
(!crypto_devallowsoft &&
(cap->cc_flags & CRYPTOCAP_F_HARDWARE) == 0))
cap = NULL;
}
} else {
/*
* No requested driver; select based on crid flags.
*/
if (!crypto_devallowsoft) /* NB: disallow s/w drivers */
crid &= ~CRYPTOCAP_F_SOFTWARE;
cap = crypto_select_kdriver(krp, crid);
}
if (cap != NULL && !cap->cc_kqblocked) {
krp->krp_hid = cap - crypto_drivers;
cap->cc_koperations++;
CRYPTO_DRIVER_UNLOCK();
error = CRYPTODEV_KPROCESS(cap->cc_dev, krp, 0);
CRYPTO_DRIVER_LOCK();
if (error == ERESTART) {
cap->cc_koperations--;
CRYPTO_DRIVER_UNLOCK();
return (error);
}
} else {
/*
* NB: cap is !NULL if device is blocked; in
* that case return ERESTART so the operation
* is resubmitted if possible.
*/
error = (cap == NULL) ? ENODEV : ERESTART;
}
CRYPTO_DRIVER_UNLOCK();
if (error) {
krp->krp_status = error;
crypto_kdone(krp);
}
return 0;
}
#ifdef CRYPTO_TIMING
static void
crypto_tstat(struct cryptotstat *ts, struct bintime *bt)
{
struct bintime now, delta;
struct timespec t;
uint64_t u;
binuptime(&now);
u = now.frac;
delta.frac = now.frac - bt->frac;
delta.sec = now.sec - bt->sec;
if (u < delta.frac)
delta.sec--;
bintime2timespec(&delta, &t);
timespecadd(&ts->acc, &t);
if (timespeccmp(&t, &ts->min, <))
ts->min = t;
if (timespeccmp(&t, &ts->max, >))
ts->max = t;
ts->count++;
*bt = now;
}
#endif
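/*
 * Taskqueue handler for requests dispatched asynchronously by
 * crypto_dispatch(): look up the driver from the session id and
 * invoke it, re-queueing the request if the driver has run out
 * of resources (ERESTART).
 */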
static void
crypto_task_invoke(void *ctx, int pending)
{
struct cryptocap *cap;
struct cryptop *crp;
int hid, result;
crp = (struct cryptop *)ctx;
hid = CRYPTO_SESID2HID(crp->crp_sid);
cap = crypto_checkdriver(hid);
result = crypto_invoke(cap, crp, 0);
if (result == ERESTART)
crypto_batch_enqueue(crp);
}
/*
* Dispatch a crypto request to the appropriate crypto devices.
*/
static int
crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint)
{
KASSERT(crp != NULL, ("%s: crp == NULL", __func__));
KASSERT(crp->crp_callback != NULL,
("%s: crp->crp_callback == NULL", __func__));
KASSERT(crp->crp_desc != NULL, ("%s: crp->crp_desc == NULL", __func__));
#ifdef CRYPTO_TIMING
if (crypto_timing)
crypto_tstat(&cryptostats.cs_invoke, &crp->crp_tstamp);
#endif
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) {
struct cryptodesc *crd;
u_int64_t nid;
/*
* Driver has unregistered; migrate the session and return
* an error to the caller so they'll resubmit the op.
*
* XXX: What if there are more already queued requests for this
* session?
*/
crypto_freesession(crp->crp_sid);
for (crd = crp->crp_desc; crd->crd_next; crd = crd->crd_next)
crd->CRD_INI.cri_next = &(crd->crd_next->CRD_INI);
/* XXX propagate flags from initial session? */
if (crypto_newsession(&nid, &(crp->crp_desc->CRD_INI),
CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0)
crp->crp_sid = nid;
crp->crp_etype = EAGAIN;
crypto_done(crp);
return 0;
} else {
/*
* Invoke the driver to process the request.
*/
return CRYPTODEV_PROCESS(cap->cc_dev, crp, hint);
}
}
/*
* Release a set of crypto descriptors.
*/
void
crypto_freereq(struct cryptop *crp)
{
struct cryptodesc *crd;
if (crp == NULL)
return;
#ifdef DIAGNOSTIC
{
struct cryptop *crp2;
struct crypto_ret_worker *ret_worker;
CRYPTO_Q_LOCK();
TAILQ_FOREACH(crp2, &crp_q, crp_next) {
KASSERT(crp2 != crp,
("Freeing cryptop from the crypto queue (%p).",
crp));
}
CRYPTO_Q_UNLOCK();
FOREACH_CRYPTO_RETW(ret_worker) {
CRYPTO_RETW_LOCK(ret_worker);
TAILQ_FOREACH(crp2, &ret_worker->crp_ret_q, crp_next) {
KASSERT(crp2 != crp,
("Freeing cryptop from the return queue (%p).",
crp));
}
CRYPTO_RETW_UNLOCK(ret_worker);
}
}
#endif
while ((crd = crp->crp_desc) != NULL) {
crp->crp_desc = crd->crd_next;
uma_zfree(cryptodesc_zone, crd);
}
uma_zfree(cryptop_zone, crp);
}
/*
* Acquire a set of crypto descriptors.
*/
struct cryptop *
crypto_getreq(int num)
{
struct cryptodesc *crd;
struct cryptop *crp;
crp = uma_zalloc(cryptop_zone, M_NOWAIT|M_ZERO);
if (crp != NULL) {
while (num--) {
crd = uma_zalloc(cryptodesc_zone, M_NOWAIT|M_ZERO);
if (crd == NULL) {
crypto_freereq(crp);
return NULL;
}
crd->crd_next = crp->crp_desc;
crp->crp_desc = crd;
}
}
return crp;
}
/*
* Invoke the callback on behalf of the driver.
*/
void
crypto_done(struct cryptop *crp)
{
KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0,
("crypto_done: op already done, flags 0x%x", crp->crp_flags));
crp->crp_flags |= CRYPTO_F_DONE;
if (crp->crp_etype != 0)
cryptostats.cs_errs++;
#ifdef CRYPTO_TIMING
if (crypto_timing)
crypto_tstat(&cryptostats.cs_done, &crp->crp_tstamp);
#endif
/*
* CBIMM means unconditionally do the callback immediately;
* CBIFSYNC means do the callback immediately only if the
* operation was done synchronously. Both are used to avoid
* doing extraneous context switches; the latter is mostly
* used with the software crypto driver.
*/
if (!CRYPTOP_ASYNC_KEEPORDER(crp) &&
((crp->crp_flags & CRYPTO_F_CBIMM) ||
((crp->crp_flags & CRYPTO_F_CBIFSYNC) &&
(CRYPTO_SESID2CAPS(crp->crp_sid) & CRYPTOCAP_F_SYNC)))) {
/*
* Do the callback directly. This is ok when the
* callback routine does very little (e.g. the
* /dev/crypto callback method just does a wakeup).
*/
#ifdef CRYPTO_TIMING
if (crypto_timing) {
/*
* NB: We must copy the timestamp before
* doing the callback as the cryptop is
* likely to be reclaimed.
*/
struct bintime t = crp->crp_tstamp;
crypto_tstat(&cryptostats.cs_cb, &t);
crp->crp_callback(crp);
crypto_tstat(&cryptostats.cs_finis, &t);
} else
#endif
crp->crp_callback(crp);
} else {
struct crypto_ret_worker *ret_worker;
bool wake;
ret_worker = CRYPTO_RETW(crp->crp_retw_id);
wake = false;
/*
* Normal case; queue the callback for the thread.
*/
CRYPTO_RETW_LOCK(ret_worker);
if (CRYPTOP_ASYNC_KEEPORDER(crp)) {
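/*
 * Insert the completed op into the ordered queue sorted by
 * sequence number (scanning from the tail, the common case),
 * and only wake the return thread when this op is the next
 * one expected (reorder_cur_seq).
 */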
struct cryptop *tmp;
TAILQ_FOREACH_REVERSE(tmp, &ret_worker->crp_ordered_ret_q,
cryptop_q, crp_next) {
if (CRYPTO_SEQ_GT(crp->crp_seq, tmp->crp_seq)) {
TAILQ_INSERT_AFTER(&ret_worker->crp_ordered_ret_q,
tmp, crp, crp_next);
break;
}
}
if (tmp == NULL) {
TAILQ_INSERT_HEAD(&ret_worker->crp_ordered_ret_q,
crp, crp_next);
}
if (crp->crp_seq == ret_worker->reorder_cur_seq)
wake = true;
}
else {
if (CRYPTO_RETW_EMPTY(ret_worker))
wake = true;
TAILQ_INSERT_TAIL(&ret_worker->crp_ret_q, crp, crp_next);
}
if (wake)
wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */
CRYPTO_RETW_UNLOCK(ret_worker);
}
}
/*
* Invoke the callback on behalf of the driver.
*/
void
crypto_kdone(struct cryptkop *krp)
{
struct crypto_ret_worker *ret_worker;
struct cryptocap *cap;
if (krp->krp_status != 0)
cryptostats.cs_kerrs++;
CRYPTO_DRIVER_LOCK();
/* XXX: What if driver is loaded in the meantime? */
if (krp->krp_hid < crypto_drivers_num) {
cap = &crypto_drivers[krp->krp_hid];
KASSERT(cap->cc_koperations > 0, ("cc_koperations == 0"));
cap->cc_koperations--;
if (cap->cc_flags & CRYPTOCAP_F_CLEANUP)
crypto_remove(cap);
}
CRYPTO_DRIVER_UNLOCK();
ret_worker = CRYPTO_RETW(0);
CRYPTO_RETW_LOCK(ret_worker);
if (CRYPTO_RETW_EMPTY(ret_worker))
wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */
TAILQ_INSERT_TAIL(&ret_worker->crp_ret_kq, krp, krp_next);
CRYPTO_RETW_UNLOCK(ret_worker);
}
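/*
 * Report the key algorithms supported by the registered drivers as
 * a bitmask, skipping software drivers unless crypto_devallowsoft
 * permits their use.
 */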
int
crypto_getfeat(int *featp)
{
int hid, kalg, feat = 0;
CRYPTO_DRIVER_LOCK();
for (hid = 0; hid < crypto_drivers_num; hid++) {
const struct cryptocap *cap = &crypto_drivers[hid];
if ((cap->cc_flags & CRYPTOCAP_F_SOFTWARE) &&
!crypto_devallowsoft) {
continue;
}
for (kalg = 0; kalg < CRK_ALGORITHM_MAX; kalg++)
if (cap->cc_kalg[kalg] & CRYPTO_ALG_FLAG_SUPPORTED)
feat |= 1 << kalg;
}
CRYPTO_DRIVER_UNLOCK();
*featp = feat;
return (0);
}
/*
* Terminate a thread at module unload. The process that
* initiated this is waiting for us to signal that we're gone;
* wake it up and exit. We use the driver table lock to ensure
* we don't do the wakeup before they're waiting. There is no
* race here because the waiter sleeps on the proc lock for the
* thread so it gets notified at the right time because of an
* extra wakeup that's done in exit1().
*/
static void
crypto_finis(void *chan)
{
CRYPTO_DRIVER_LOCK();
wakeup_one(chan);
CRYPTO_DRIVER_UNLOCK();
kproc_exit(0);
}
/*
* Crypto thread, dispatches crypto requests.
*/
static void
crypto_proc(void)
{
struct cryptop *crp, *submit;
struct cryptkop *krp;
struct cryptocap *cap;
u_int32_t hid;
int result, hint;
#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
fpu_kern_thread(FPU_KERN_NORMAL);
#endif
CRYPTO_Q_LOCK();
for (;;) {
/*
* Find the first element in the queue that can be
* processed and look-ahead to see if multiple ops
* are ready for the same driver.
*/
submit = NULL;
hint = 0;
TAILQ_FOREACH(crp, &crp_q, crp_next) {
hid = CRYPTO_SESID2HID(crp->crp_sid);
cap = crypto_checkdriver(hid);
/*
* Driver cannot disappear while there is an active
* session.
*/
KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
__func__, __LINE__));
if (cap == NULL || cap->cc_dev == NULL) {
/* Op needs to be migrated, process it. */
if (submit == NULL)
submit = crp;
break;
}
if (!cap->cc_qblocked) {
if (submit != NULL) {
/*
* We stop on finding another op,
* regardless of whether it's for the same
* driver or not. We could keep
* searching the queue but it might be
* better to just use a per-driver
* queue instead.
*/
if (CRYPTO_SESID2HID(submit->crp_sid) == hid)
hint = CRYPTO_HINT_MORE;
break;
} else {
submit = crp;
if ((submit->crp_flags & CRYPTO_F_BATCH) == 0)
break;
/* keep scanning in case more are queued */
}
}
}
if (submit != NULL) {
TAILQ_REMOVE(&crp_q, submit, crp_next);
hid = CRYPTO_SESID2HID(submit->crp_sid);
cap = crypto_checkdriver(hid);
KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
__func__, __LINE__));
result = crypto_invoke(cap, submit, hint);
if (result == ERESTART) {
/*
* The driver ran out of resources, mark the
* driver ``blocked'' for cryptop's and put
* the request back in the queue. It would
* be best to put the request back where we got
* it but that's hard so for now we put it
* at the front. This should be ok; putting
* it at the end does not work.
*/
/* XXX validate sid again? */
crypto_drivers[CRYPTO_SESID2HID(submit->crp_sid)].cc_qblocked = 1;
TAILQ_INSERT_HEAD(&crp_q, submit, crp_next);
cryptostats.cs_blocks++;
}
}
/* As above, but for key ops */
TAILQ_FOREACH(krp, &crp_kq, krp_next) {
cap = crypto_checkdriver(krp->krp_hid);
if (cap == NULL || cap->cc_dev == NULL) {
/*
* Operation needs to be migrated, invalidate
* the assigned device so it will reselect a
* new one below. Propagate the original
* crid selection flags if supplied.
*/
krp->krp_hid = krp->krp_crid &
(CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE);
if (krp->krp_hid == 0)
krp->krp_hid =
CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE;
break;
}
if (!cap->cc_kqblocked)
break;
}
if (krp != NULL) {
TAILQ_REMOVE(&crp_kq, krp, krp_next);
result = crypto_kinvoke(krp, krp->krp_hid);
if (result == ERESTART) {
/*
* The driver ran out of resources, mark the
* driver ``blocked'' for cryptkop's and put
* the request back in the queue. It would
* be best to put the request back where we got
* it but that's hard so for now we put it
* at the front. This should be ok; putting
* it at the end does not work.
*/
/* XXX validate sid again? */
crypto_drivers[krp->krp_hid].cc_kqblocked = 1;
TAILQ_INSERT_HEAD(&crp_kq, krp, krp_next);
cryptostats.cs_kblocks++;
}
}
if (submit == NULL && krp == NULL) {
/*
* Nothing more to be processed. Sleep until we're
* woken because there are more ops to process.
* This happens either by submission or by a driver
* becoming unblocked and notifying us through
* crypto_unblock. Note that when we wakeup we
* start processing each queue again from the
* front. It's not clear that it's important to
* preserve this ordering since ops may finish
* out of order if dispatched to different devices
* and some become blocked while others do not.
*/
crp_sleep = 1;
msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0);
crp_sleep = 0;
if (cryptoproc == NULL)
break;
cryptostats.cs_intrs++;
}
}
CRYPTO_Q_UNLOCK();
crypto_finis(&crp_q);
}
/*
* Crypto return thread; runs the callbacks for processed crypto requests.
* Callbacks are done here, rather than in the crypto drivers, because
* callbacks typically are expensive and would slow interrupt handling.
*/
static void
crypto_ret_proc(struct crypto_ret_worker *ret_worker)
{
struct cryptop *crpt;
struct cryptkop *krpt;
CRYPTO_RETW_LOCK(ret_worker);
for (;;) {
/* Harvest return q's for completed ops */
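/*
 * The ordered queue is only drained in sequence: the op at its
 * head is delivered when its sequence number matches
 * reorder_cur_seq, otherwise it is left queued and the regular
 * return queue is serviced instead.
 */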
crpt = TAILQ_FIRST(&ret_worker->crp_ordered_ret_q);
if (crpt != NULL) {
if (crpt->crp_seq == ret_worker->reorder_cur_seq) {
TAILQ_REMOVE(&ret_worker->crp_ordered_ret_q, crpt, crp_next);
ret_worker->reorder_cur_seq++;
} else {
crpt = NULL;
}
}
if (crpt == NULL) {
crpt = TAILQ_FIRST(&ret_worker->crp_ret_q);
if (crpt != NULL)
TAILQ_REMOVE(&ret_worker->crp_ret_q, crpt, crp_next);
}
krpt = TAILQ_FIRST(&ret_worker->crp_ret_kq);
if (krpt != NULL)
TAILQ_REMOVE(&ret_worker->crp_ret_kq, krpt, krp_next);
if (crpt != NULL || krpt != NULL) {
CRYPTO_RETW_UNLOCK(ret_worker);
/*
* Run callbacks unlocked.
*/
if (crpt != NULL) {
#ifdef CRYPTO_TIMING
if (crypto_timing) {
/*
* NB: We must copy the timestamp before
* doing the callback as the cryptop is
* likely to be reclaimed.
*/
struct bintime t = crpt->crp_tstamp;
crypto_tstat(&cryptostats.cs_cb, &t);
crpt->crp_callback(crpt);
crypto_tstat(&cryptostats.cs_finis, &t);
} else
#endif
crpt->crp_callback(crpt);
}
if (krpt != NULL)
krpt->krp_callback(krpt);
CRYPTO_RETW_LOCK(ret_worker);
} else {
/*
* Nothing more to be processed. Sleep until we're
* woken because there are more returns to process.
*/
msleep(&ret_worker->crp_ret_q, &ret_worker->crypto_ret_mtx, PWAIT,
"crypto_ret_wait", 0);
if (ret_worker->cryptoretproc == NULL)
break;
cryptostats.cs_rets++;
}
}
CRYPTO_RETW_UNLOCK(ret_worker);
crypto_finis(&ret_worker->crp_ret_q);
}
#ifdef DDB
static void
db_show_drivers(void)
{
int hid;
db_printf("%12s %4s %4s %8s %2s %2s\n"
, "Device"
, "Ses"
, "Kops"
, "Flags"
, "QB"
, "KB"
);
for (hid = 0; hid < crypto_drivers_num; hid++) {
const struct cryptocap *cap = &crypto_drivers[hid];
if (cap->cc_dev == NULL)
continue;
db_printf("%-12s %4u %4u %08x %2u %2u\n"
, device_get_nameunit(cap->cc_dev)
, cap->cc_sessions
, cap->cc_koperations
, cap->cc_flags
, cap->cc_qblocked
, cap->cc_kqblocked
);
}
}
DB_SHOW_COMMAND(crypto, db_show_crypto)
{
struct cryptop *crp;
struct crypto_ret_worker *ret_worker;
db_show_drivers();
db_printf("\n");
db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n",
"HID", "Caps", "Ilen", "Olen", "Etype", "Flags",
"Desc", "Callback");
TAILQ_FOREACH(crp, &crp_q, crp_next) {
db_printf("%4u %08x %4u %4u %4u %04x %8p %8p\n"
, (int) CRYPTO_SESID2HID(crp->crp_sid)
, (int) CRYPTO_SESID2CAPS(crp->crp_sid)
, crp->crp_ilen, crp->crp_olen
, crp->crp_etype
, crp->crp_flags
, crp->crp_desc
, crp->crp_callback
);
}
FOREACH_CRYPTO_RETW(ret_worker) {
db_printf("\n%8s %4s %4s %4s %8s\n",
"ret_worker", "HID", "Etype", "Flags", "Callback");
if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) {
TAILQ_FOREACH(crp, &ret_worker->crp_ret_q, crp_next) {
db_printf("%8td %4u %4u %04x %8p\n"
, CRYPTO_RETW_ID(ret_worker)
, (int) CRYPTO_SESID2HID(crp->crp_sid)
, crp->crp_etype
, crp->crp_flags
, crp->crp_callback
);
}
}
}
}
DB_SHOW_COMMAND(kcrypto, db_show_kcrypto)
{
struct cryptkop *krp;
struct crypto_ret_worker *ret_worker;
db_show_drivers();
db_printf("\n");
db_printf("%4s %5s %4s %4s %8s %4s %8s\n",
"Op", "Status", "#IP", "#OP", "CRID", "HID", "Callback");
TAILQ_FOREACH(krp, &crp_kq, krp_next) {
db_printf("%4u %5u %4u %4u %08x %4u %8p\n"
, krp->krp_op
, krp->krp_status
, krp->krp_iparams, krp->krp_oparams
, krp->krp_crid, krp->krp_hid
, krp->krp_callback
);
}
ret_worker = CRYPTO_RETW(0);
if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) {
db_printf("%4s %5s %8s %4s %8s\n",
"Op", "Status", "CRID", "HID", "Callback");
TAILQ_FOREACH(krp, &ret_worker->crp_ret_kq, krp_next) {
db_printf("%4u %5u %08x %4u %8p\n"
, krp->krp_op
, krp->krp_status
, krp->krp_crid, krp->krp_hid
, krp->krp_callback
);
}
}
}
#endif
int crypto_modevent(module_t mod, int type, void *unused);
/*
* Initialization code, both for static and dynamic loading.
* Note this is not invoked with the usual MODULE_DECLARE
* mechanism but instead is listed as a dependency by the
* cryptosoft driver. This guarantees proper ordering of
* calls on module load/unload.
*/
int
crypto_modevent(module_t mod, int type, void *unused)
{
int error = EINVAL;
switch (type) {
case MOD_LOAD:
error = crypto_init();
if (error == 0 && bootverbose)
printf("crypto: <crypto core>\n");
break;
case MOD_UNLOAD:
/*XXX disallow if active sessions */
error = 0;
crypto_destroy();
return 0;
}
return error;
}
MODULE_VERSION(crypto, 1);
MODULE_DEPEND(crypto, zlib, 1, 1, 1);
Index: head/sys/opencrypto/cryptosoft.c
===================================================================
--- head/sys/opencrypto/cryptosoft.c (revision 327172)
+++ head/sys/opencrypto/cryptosoft.c (revision 327173)
@@ -1,1298 +1,1297 @@
/* $OpenBSD: cryptosoft.c,v 1.35 2002/04/26 08:43:50 deraadt Exp $ */
/*-
* The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
* Copyright (c) 2002-2006 Sam Leffler, Errno Consulting
*
* This code was written by Angelos D. Keromytis in Athens, Greece, in
* February 2000. Network Security Technologies Inc. (NSTI) kindly
* supported the development of this code.
*
* Copyright (c) 2000, 2001 Angelos D. Keromytis
* Copyright (c) 2014 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by John-Mark Gurney
* under sponsorship of the FreeBSD Foundation and
* Rubicon Communications, LLC (Netgate).
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all source code copies of any software which is or includes a copy or
* modification of this software.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/random.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/endian.h>
#include <sys/limits.h>
#include <crypto/blowfish/blowfish.h>
#include <crypto/sha1.h>
#include <opencrypto/rmd160.h>
#include <opencrypto/cast.h>
#include <opencrypto/skipjack.h>
#include <sys/md5.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/cryptosoft.h>
#include <opencrypto/xform.h>
#include <sys/kobj.h>
#include <sys/bus.h>
#include "cryptodev_if.h"
static int32_t swcr_id;
static struct swcr_data **swcr_sessions = NULL;
static u_int32_t swcr_sesnum;
/* Protects swcr_sessions pointer, not data. */
static struct rwlock swcr_sessions_lock;
u_int8_t hmac_ipad_buffer[HMAC_MAX_BLOCK_LEN];
u_int8_t hmac_opad_buffer[HMAC_MAX_BLOCK_LEN];
static int swcr_encdec(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_authcompute(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_authenc(struct cryptop *crp);
static int swcr_compdec(struct cryptodesc *, struct swcr_data *, caddr_t, int);
static int swcr_freesession(device_t dev, u_int64_t tid);
static int swcr_freesession_locked(device_t dev, u_int64_t tid);
/*
* Apply a symmetric encryption/decryption algorithm.
*/
static int
swcr_encdec(struct cryptodesc *crd, struct swcr_data *sw, caddr_t buf,
int flags)
{
unsigned char iv[EALG_MAX_BLOCK_LEN], blk[EALG_MAX_BLOCK_LEN], *idat;
unsigned char *ivp, *nivp, iv2[EALG_MAX_BLOCK_LEN];
struct enc_xform *exf;
int i, j, k, blks, ind, count, ivlen;
struct uio *uio, uiolcl;
struct iovec iovlcl[4];
struct iovec *iov;
int iovcnt, iovalloc;
int error;
error = 0;
exf = sw->sw_exf;
blks = exf->blocksize;
ivlen = exf->ivsize;
/* Check for non-padded data */
if (crd->crd_len % blks)
return EINVAL;
if (crd->crd_alg == CRYPTO_AES_ICM &&
(crd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
/* Initialize the IV */
if (crd->crd_flags & CRD_F_ENCRYPT) {
/* IV explicitly provided ? */
if (crd->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crd->crd_iv, iv, ivlen);
else
arc4rand(iv, ivlen, 0);
/* Do we need to write the IV */
if (!(crd->crd_flags & CRD_F_IV_PRESENT))
crypto_copyback(flags, buf, crd->crd_inject, ivlen, iv);
} else { /* Decryption */
/* IV explicitly provided ? */
if (crd->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crd->crd_iv, iv, ivlen);
else {
/* Get IV off buf */
crypto_copydata(flags, buf, crd->crd_inject, ivlen, iv);
}
}
if (crd->crd_flags & CRD_F_KEY_EXPLICIT) {
int error;
if (sw->sw_kschedule)
exf->zerokey(&(sw->sw_kschedule));
error = exf->setkey(&sw->sw_kschedule,
crd->crd_key, crd->crd_klen / 8);
if (error)
return (error);
}
iov = iovlcl;
iovcnt = nitems(iovlcl);
iovalloc = 0;
uio = &uiolcl;
if ((flags & CRYPTO_F_IMBUF) != 0) {
error = crypto_mbuftoiov((struct mbuf *)buf, &iov, &iovcnt,
&iovalloc);
if (error)
return (error);
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
} else if ((flags & CRYPTO_F_IOV) != 0)
uio = (struct uio *)buf;
else {
iov[0].iov_base = buf;
iov[0].iov_len = crd->crd_skip + crd->crd_len;
uio->uio_iov = iov;
uio->uio_iovcnt = 1;
}
ivp = iv;
if (exf->reinit) {
/*
* xforms that provide a reinit method perform all IV
* handling themselves.
*/
exf->reinit(sw->sw_kschedule, iv);
}
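/*
 * Walk the request block by block through the uio.  Blocks that
 * straddle an iovec boundary are bounced through the local blk[]
 * buffer; runs of whole blocks within a single iovec are
 * transformed in place.
 */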
count = crd->crd_skip;
ind = cuio_getptr(uio, count, &k);
if (ind == -1) {
error = EINVAL;
goto out;
}
i = crd->crd_len;
while (i > 0) {
/*
* If there's insufficient data at the end of
* an iovec, we have to do some copying.
*/
if (uio->uio_iov[ind].iov_len < k + blks &&
uio->uio_iov[ind].iov_len != k) {
cuio_copydata(uio, count, blks, blk);
/* Actual encryption/decryption */
if (exf->reinit) {
if (crd->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(sw->sw_kschedule,
blk);
} else {
exf->decrypt(sw->sw_kschedule,
blk);
}
} else if (crd->crd_flags & CRD_F_ENCRYPT) {
/* XOR with previous block */
for (j = 0; j < blks; j++)
blk[j] ^= ivp[j];
exf->encrypt(sw->sw_kschedule, blk);
/*
* Keep encrypted block for XOR'ing
* with next block
*/
bcopy(blk, iv, blks);
ivp = iv;
} else { /* decrypt */
/*
* Keep encrypted block for XOR'ing
* with next block
*/
nivp = (ivp == iv) ? iv2 : iv;
bcopy(blk, nivp, blks);
exf->decrypt(sw->sw_kschedule, blk);
/* XOR with previous block */
for (j = 0; j < blks; j++)
blk[j] ^= ivp[j];
ivp = nivp;
}
/* Copy back decrypted block */
cuio_copyback(uio, count, blks, blk);
count += blks;
/* Advance pointer */
ind = cuio_getptr(uio, count, &k);
if (ind == -1) {
error = EINVAL;
goto out;
}
i -= blks;
/* Could be done... */
if (i == 0)
break;
}
/*
* Warning: idat may point to garbage here, but
* we only use it in the while() loop, only if
* there are indeed enough data.
*/
idat = (char *)uio->uio_iov[ind].iov_base + k;
while (uio->uio_iov[ind].iov_len >= k + blks && i > 0) {
if (exf->reinit) {
if (crd->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(sw->sw_kschedule,
idat);
} else {
exf->decrypt(sw->sw_kschedule,
idat);
}
} else if (crd->crd_flags & CRD_F_ENCRYPT) {
/* XOR with previous block/IV */
for (j = 0; j < blks; j++)
idat[j] ^= ivp[j];
exf->encrypt(sw->sw_kschedule, idat);
ivp = idat;
} else { /* decrypt */
/*
* Keep encrypted block to be used
* in next block's processing.
*/
nivp = (ivp == iv) ? iv2 : iv;
bcopy(idat, nivp, blks);
exf->decrypt(sw->sw_kschedule, idat);
/* XOR with previous block/IV */
for (j = 0; j < blks; j++)
idat[j] ^= ivp[j];
ivp = nivp;
}
idat += blks;
count += blks;
k += blks;
i -= blks;
}
/*
* Advance to the next iov if the end of the current iov
* is aligned with the end of a cipher block.
* Note that the code is equivalent to calling:
* ind = cuio_getptr(uio, count, &k);
*/
if (i > 0 && k == uio->uio_iov[ind].iov_len) {
k = 0;
ind++;
if (ind >= uio->uio_iovcnt) {
error = EINVAL;
goto out;
}
}
}
out:
if (iovalloc)
free(iov, M_CRYPTO_DATA);
return (error);
}
static void
swcr_authprepare(struct auth_hash *axf, struct swcr_data *sw, u_char *key,
int klen)
{
int k;
klen /= 8;
switch (axf->type) {
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_NULL_HMAC:
case CRYPTO_RIPEMD160_HMAC:
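/*
 * Precompute the two HMAC contexts: sw_ictx is the hash state
 * after absorbing (key XOR ipad) padded to the block size, and
 * sw_octx the state after (key XOR opad), so per-request work
 * only hashes the data and the inner digest.
 */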
for (k = 0; k < klen; k++)
key[k] ^= HMAC_IPAD_VAL;
axf->Init(sw->sw_ictx);
axf->Update(sw->sw_ictx, key, klen);
axf->Update(sw->sw_ictx, hmac_ipad_buffer, axf->blocksize - klen);
for (k = 0; k < klen; k++)
key[k] ^= (HMAC_IPAD_VAL ^ HMAC_OPAD_VAL);
axf->Init(sw->sw_octx);
axf->Update(sw->sw_octx, key, klen);
axf->Update(sw->sw_octx, hmac_opad_buffer, axf->blocksize - klen);
for (k = 0; k < klen; k++)
key[k] ^= HMAC_OPAD_VAL;
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
{
/*
* We need a buffer that can hold an md5 and a sha1 result
* just to throw it away.
* What we do here is the initial part of:
* ALGO( key, keyfill, .. )
* adding the key to sw_ictx and abusing Final() to get the
* "keyfill" padding.
* In addition we abuse the sw_octx to save the key to have
* it to be able to append it at the end in swcr_authcompute().
*/
u_char buf[SHA1_RESULTLEN];
sw->sw_klen = klen;
bcopy(key, sw->sw_octx, klen);
axf->Init(sw->sw_ictx);
axf->Update(sw->sw_ictx, key, klen);
axf->Final(buf, sw->sw_ictx);
break;
}
default:
printf("%s: CRD_F_KEY_EXPLICIT flag given, but algorithm %d "
"doesn't use keys.\n", __func__, axf->type);
}
}
/*
* Compute keyed-hash authenticator.
*/
static int
swcr_authcompute(struct cryptodesc *crd, struct swcr_data *sw, caddr_t buf,
int flags)
{
unsigned char aalg[HASH_MAX_LEN];
struct auth_hash *axf;
union authctx ctx;
int err;
if (sw->sw_ictx == 0)
return EINVAL;
axf = sw->sw_axf;
if (crd->crd_flags & CRD_F_KEY_EXPLICIT)
swcr_authprepare(axf, sw, crd->crd_key, crd->crd_klen);
bcopy(sw->sw_ictx, &ctx, axf->ctxsize);
err = crypto_apply(flags, buf, crd->crd_skip, crd->crd_len,
(int (*)(void *, void *, unsigned int))axf->Update, (caddr_t)&ctx);
if (err)
return err;
switch (sw->sw_alg) {
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
if (sw->sw_octx == NULL)
return EINVAL;
axf->Final(aalg, &ctx);
bcopy(sw->sw_octx, &ctx, axf->ctxsize);
axf->Update(&ctx, aalg, axf->hashsize);
axf->Final(aalg, &ctx);
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
/* If we have no key saved, return error. */
if (sw->sw_octx == NULL)
return EINVAL;
/*
* Add the trailing copy of the key (see comment in
* swcr_authprepare()) after the data:
* ALGO( .., key, algofill )
* and let Final() do the proper, natural "algofill"
* padding.
*/
axf->Update(&ctx, sw->sw_octx, sw->sw_klen);
axf->Final(aalg, &ctx);
break;
case CRYPTO_NULL_HMAC:
axf->Final(aalg, &ctx);
break;
}
/* Inject the authentication data */
crypto_copyback(flags, buf, crd->crd_inject,
sw->sw_mlen == 0 ? axf->hashsize : sw->sw_mlen, aalg);
return 0;
}
CTASSERT(INT_MAX <= (1ll<<39) - 256); /* GCM: plain text < 2^39-256 */
CTASSERT(INT_MAX <= (uint64_t)-1); /* GCM: associated data <= 2^64-1 */
/*
* Apply a combined encryption-authentication transformation
*/
static int
swcr_authenc(struct cryptop *crp)
{
uint32_t blkbuf[howmany(EALG_MAX_BLOCK_LEN, sizeof(uint32_t))];
u_char *blk = (u_char *)blkbuf;
u_char aalg[AALG_MAX_RESULT_LEN];
u_char uaalg[AALG_MAX_RESULT_LEN];
u_char iv[EALG_MAX_BLOCK_LEN];
union authctx ctx;
struct cryptodesc *crd, *crda = NULL, *crde = NULL;
struct swcr_data *sw, *swa, *swe = NULL;
struct auth_hash *axf = NULL;
struct enc_xform *exf = NULL;
caddr_t buf = (caddr_t)crp->crp_buf;
uint32_t *blkp;
int aadlen, blksz, i, ivlen, len, iskip, oskip, r;
ivlen = blksz = iskip = oskip = 0;
for (crd = crp->crp_desc; crd; crd = crd->crd_next) {
for (sw = swcr_sessions[crp->crp_sid & 0xffffffff];
sw && sw->sw_alg != crd->crd_alg;
sw = sw->sw_next)
;
if (sw == NULL)
return (EINVAL);
switch (sw->sw_alg) {
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
swe = sw;
crde = crd;
exf = swe->sw_exf;
ivlen = 12;
break;
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
swa = sw;
crda = crd;
axf = swa->sw_axf;
if (swa->sw_ictx == 0)
return (EINVAL);
bcopy(swa->sw_ictx, &ctx, axf->ctxsize);
blksz = axf->blocksize;
break;
default:
return (EINVAL);
}
}
if (crde == NULL || crda == NULL)
return (EINVAL);
if (crde->crd_alg == CRYPTO_AES_NIST_GCM_16 &&
(crde->crd_flags & CRD_F_IV_EXPLICIT) == 0)
return (EINVAL);
if (crde->crd_klen != crda->crd_klen)
return (EINVAL);
/* Initialize the IV */
if (crde->crd_flags & CRD_F_ENCRYPT) {
/* IV explicitly provided ? */
if (crde->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crde->crd_iv, iv, ivlen);
else
arc4rand(iv, ivlen, 0);
/* Do we need to write the IV */
if (!(crde->crd_flags & CRD_F_IV_PRESENT))
crypto_copyback(crp->crp_flags, buf, crde->crd_inject,
ivlen, iv);
} else { /* Decryption */
/* IV explicitly provided ? */
if (crde->crd_flags & CRD_F_IV_EXPLICIT)
bcopy(crde->crd_iv, iv, ivlen);
else {
/* Get IV off buf */
crypto_copydata(crp->crp_flags, buf, crde->crd_inject,
ivlen, iv);
}
}
/* Supply MAC with IV */
if (axf->Reinit)
axf->Reinit(&ctx, iv, ivlen);
/* Supply MAC with AAD */
aadlen = crda->crd_len;
for (i = iskip; i < crda->crd_len; i += blksz) {
len = MIN(crda->crd_len - i, blksz - oskip);
crypto_copydata(crp->crp_flags, buf, crda->crd_skip + i, len,
blk + oskip);
bzero(blk + len + oskip, blksz - len - oskip);
axf->Update(&ctx, blk, blksz);
oskip = 0; /* reset initial output offset */
}
if (exf->reinit)
exf->reinit(swe->sw_kschedule, iv);
/* Do encryption/decryption with MAC */
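/*
 * First pass: on encryption each block is encrypted and the
 * resulting ciphertext fed to the MAC; on decryption only the
 * ciphertext is MAC'ed here, and the actual decryption happens
 * in a second pass below once the tag has been verified.
 */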
for (i = 0; i < crde->crd_len; i += blksz) {
len = MIN(crde->crd_len - i, blksz);
if (len < blksz)
bzero(blk, blksz);
crypto_copydata(crp->crp_flags, buf, crde->crd_skip + i, len,
blk);
if (crde->crd_flags & CRD_F_ENCRYPT) {
exf->encrypt(swe->sw_kschedule, blk);
axf->Update(&ctx, blk, len);
crypto_copyback(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
} else {
axf->Update(&ctx, blk, len);
}
}
/* Do any required special finalization */
switch (crda->crd_alg) {
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
/* length block */
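/*
 * GHASH length block: a 64-bit big-endian AAD bit count followed
 * by a 64-bit big-endian ciphertext bit count.  Both lengths fit
 * in 32 bits here, so only the low word of each half is filled.
 */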
bzero(blk, blksz);
blkp = (uint32_t *)blk + 1;
*blkp = htobe32(aadlen * 8);
blkp = (uint32_t *)blk + 3;
*blkp = htobe32(crde->crd_len * 8);
axf->Update(&ctx, blk, blksz);
break;
}
/* Finalize MAC */
axf->Final(aalg, &ctx);
/* Validate tag */
if (!(crde->crd_flags & CRD_F_ENCRYPT)) {
crypto_copydata(crp->crp_flags, buf, crda->crd_inject,
axf->hashsize, uaalg);
r = timingsafe_bcmp(aalg, uaalg, axf->hashsize);
if (r == 0) {
/* tag matches, decrypt data */
for (i = 0; i < crde->crd_len; i += blksz) {
len = MIN(crde->crd_len - i, blksz);
if (len < blksz)
bzero(blk, blksz);
crypto_copydata(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
if (!(crde->crd_flags & CRD_F_ENCRYPT)) {
exf->decrypt(swe->sw_kschedule, blk);
}
crypto_copyback(crp->crp_flags, buf,
crde->crd_skip + i, len, blk);
}
} else
return (EBADMSG);
} else {
/* Inject the authentication data */
crypto_copyback(crp->crp_flags, buf, crda->crd_inject,
axf->hashsize, aalg);
}
return (0);
}
/*
* Apply a compression/decompression algorithm
*/
static int
swcr_compdec(struct cryptodesc *crd, struct swcr_data *sw,
caddr_t buf, int flags)
{
u_int8_t *data, *out;
struct comp_algo *cxf;
int adj;
u_int32_t result;
cxf = sw->sw_cxf;
/* The (de)compression routines need the whole buffer at once,
* so copy the data out of the request (which may be an mbuf
* chain or uio) into a contiguous temporary buffer.
*/
data = malloc(crd->crd_len, M_CRYPTO_DATA, M_NOWAIT);
if (data == NULL)
return (EINVAL);
crypto_copydata(flags, buf, crd->crd_skip, crd->crd_len, data);
if (crd->crd_flags & CRD_F_COMP)
result = cxf->compress(data, crd->crd_len, &out);
else
result = cxf->decompress(data, crd->crd_len, &out);
free(data, M_CRYPTO_DATA);
if (result == 0)
return EINVAL;
/* Copy back the (de)compressed data; m_copyback extends
* the mbuf as necessary.
*/
sw->sw_size = result;
/* Check the compressed size when doing compression */
if (crd->crd_flags & CRD_F_COMP) {
if (result >= crd->crd_len) {
/* Compression was useless, we lost time */
free(out, M_CRYPTO_DATA);
return 0;
}
}
crypto_copyback(flags, buf, crd->crd_skip, result, out);
if (result < crd->crd_len) {
adj = result - crd->crd_len;
if (flags & CRYPTO_F_IMBUF) {
adj = result - crd->crd_len;
m_adj((struct mbuf *)buf, adj);
} else if (flags & CRYPTO_F_IOV) {
struct uio *uio = (struct uio *)buf;
int ind;
adj = crd->crd_len - result;
ind = uio->uio_iovcnt - 1;
while (adj > 0 && ind >= 0) {
if (adj < uio->uio_iov[ind].iov_len) {
uio->uio_iov[ind].iov_len -= adj;
break;
}
adj -= uio->uio_iov[ind].iov_len;
uio->uio_iov[ind].iov_len = 0;
ind--;
uio->uio_iovcnt--;
}
}
}
free(out, M_CRYPTO_DATA);
return 0;
}
/*
* Generate a new software session.
*/
static int
swcr_newsession(device_t dev, u_int32_t *sid, struct cryptoini *cri)
{
struct swcr_data **swd;
struct auth_hash *axf;
struct enc_xform *txf;
struct comp_algo *cxf;
u_int32_t i;
int len;
int error;
if (sid == NULL || cri == NULL)
return EINVAL;
rw_wlock(&swcr_sessions_lock);
if (swcr_sessions) {
for (i = 1; i < swcr_sesnum; i++)
if (swcr_sessions[i] == NULL)
break;
} else
i = 1; /* NB: to silence compiler warning */
if (swcr_sessions == NULL || i == swcr_sesnum) {
if (swcr_sessions == NULL) {
i = 1; /* We leave swcr_sessions[0] empty */
swcr_sesnum = CRYPTO_SW_SESSIONS;
} else
swcr_sesnum *= 2;
swd = malloc(swcr_sesnum * sizeof(struct swcr_data *),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (swd == NULL) {
/* Reset session number */
if (swcr_sesnum == CRYPTO_SW_SESSIONS)
swcr_sesnum = 0;
else
swcr_sesnum /= 2;
rw_wunlock(&swcr_sessions_lock);
return ENOBUFS;
}
/* Copy existing sessions */
if (swcr_sessions != NULL) {
bcopy(swcr_sessions, swd,
(swcr_sesnum / 2) * sizeof(struct swcr_data *));
free(swcr_sessions, M_CRYPTO_DATA);
}
swcr_sessions = swd;
}
rw_downgrade(&swcr_sessions_lock);
swd = &swcr_sessions[i];
*sid = i;
while (cri) {
*swd = malloc(sizeof(struct swcr_data),
M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
if (*swd == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
switch (cri->cri_alg) {
case CRYPTO_DES_CBC:
txf = &enc_xform_des;
goto enccommon;
case CRYPTO_3DES_CBC:
txf = &enc_xform_3des;
goto enccommon;
case CRYPTO_BLF_CBC:
txf = &enc_xform_blf;
goto enccommon;
case CRYPTO_CAST_CBC:
txf = &enc_xform_cast5;
goto enccommon;
case CRYPTO_SKIPJACK_CBC:
txf = &enc_xform_skipjack;
goto enccommon;
case CRYPTO_RIJNDAEL128_CBC:
txf = &enc_xform_rijndael128;
goto enccommon;
case CRYPTO_AES_XTS:
txf = &enc_xform_aes_xts;
goto enccommon;
case CRYPTO_AES_ICM:
txf = &enc_xform_aes_icm;
goto enccommon;
case CRYPTO_AES_NIST_GCM_16:
txf = &enc_xform_aes_nist_gcm;
goto enccommon;
case CRYPTO_AES_NIST_GMAC:
txf = &enc_xform_aes_nist_gmac;
(*swd)->sw_exf = txf;
break;
case CRYPTO_CAMELLIA_CBC:
txf = &enc_xform_camellia;
goto enccommon;
case CRYPTO_NULL_CBC:
txf = &enc_xform_null;
goto enccommon;
enccommon:
if (cri->cri_key != NULL) {
error = txf->setkey(&((*swd)->sw_kschedule),
cri->cri_key, cri->cri_klen / 8);
if (error) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return error;
}
}
(*swd)->sw_exf = txf;
break;
case CRYPTO_MD5_HMAC:
axf = &auth_hash_hmac_md5;
goto authcommon;
case CRYPTO_SHA1_HMAC:
axf = &auth_hash_hmac_sha1;
goto authcommon;
case CRYPTO_SHA2_256_HMAC:
axf = &auth_hash_hmac_sha2_256;
goto authcommon;
case CRYPTO_SHA2_384_HMAC:
axf = &auth_hash_hmac_sha2_384;
goto authcommon;
case CRYPTO_SHA2_512_HMAC:
axf = &auth_hash_hmac_sha2_512;
goto authcommon;
case CRYPTO_NULL_HMAC:
axf = &auth_hash_null;
goto authcommon;
case CRYPTO_RIPEMD160_HMAC:
axf = &auth_hash_hmac_ripemd_160;
authcommon:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
(*swd)->sw_octx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_octx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
if (cri->cri_key != NULL) {
swcr_authprepare(axf, *swd, cri->cri_key,
cri->cri_klen);
}
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
case CRYPTO_MD5_KPDK:
axf = &auth_hash_key_md5;
goto auth2common;
case CRYPTO_SHA1_KPDK:
axf = &auth_hash_key_sha1;
auth2common:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
(*swd)->sw_octx = malloc(cri->cri_klen / 8,
M_CRYPTO_DATA, M_NOWAIT);
if ((*swd)->sw_octx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
/* Store the key so we can "append" it to the payload */
if (cri->cri_key != NULL) {
swcr_authprepare(axf, *swd, cri->cri_key,
cri->cri_klen);
}
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
#ifdef notdef
case CRYPTO_MD5:
axf = &auth_hash_md5;
goto auth3common;
case CRYPTO_SHA1:
axf = &auth_hash_sha1;
auth3common:
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
axf->Init((*swd)->sw_ictx);
(*swd)->sw_mlen = cri->cri_mlen;
(*swd)->sw_axf = axf;
break;
#endif
case CRYPTO_AES_128_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_128;
goto auth4common;
case CRYPTO_AES_192_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_192;
goto auth4common;
case CRYPTO_AES_256_NIST_GMAC:
axf = &auth_hash_nist_gmac_aes_256;
auth4common:
len = cri->cri_klen / 8;
if (len != 16 && len != 24 && len != 32) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return EINVAL;
}
(*swd)->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
M_NOWAIT);
if ((*swd)->sw_ictx == NULL) {
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return ENOBUFS;
}
axf->Init((*swd)->sw_ictx);
axf->Setkey((*swd)->sw_ictx, cri->cri_key, len);
(*swd)->sw_axf = axf;
break;
case CRYPTO_DEFLATE_COMP:
cxf = &comp_algo_deflate;
(*swd)->sw_cxf = cxf;
break;
default:
swcr_freesession_locked(dev, i);
rw_runlock(&swcr_sessions_lock);
return EINVAL;
}
(*swd)->sw_alg = cri->cri_alg;
cri = cri->cri_next;
swd = &((*swd)->sw_next);
}
rw_runlock(&swcr_sessions_lock);
return 0;
}
static int
swcr_freesession(device_t dev, u_int64_t tid)
{
int error;
rw_rlock(&swcr_sessions_lock);
error = swcr_freesession_locked(dev, tid);
rw_runlock(&swcr_sessions_lock);
return error;
}
/*
* Free a session.
*/
static int
swcr_freesession_locked(device_t dev, u_int64_t tid)
{
struct swcr_data *swd;
struct enc_xform *txf;
struct auth_hash *axf;
- struct comp_algo *cxf;
u_int32_t sid = CRYPTO_SESID2LID(tid);
if (sid > swcr_sesnum || swcr_sessions == NULL ||
swcr_sessions[sid] == NULL)
return EINVAL;
/* Silently accept and return */
if (sid == 0)
return 0;
while ((swd = swcr_sessions[sid]) != NULL) {
swcr_sessions[sid] = swd->sw_next;
switch (swd->sw_alg) {
case CRYPTO_DES_CBC:
case CRYPTO_3DES_CBC:
case CRYPTO_BLF_CBC:
case CRYPTO_CAST_CBC:
case CRYPTO_SKIPJACK_CBC:
case CRYPTO_RIJNDAEL128_CBC:
case CRYPTO_AES_XTS:
case CRYPTO_AES_ICM:
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
case CRYPTO_CAMELLIA_CBC:
case CRYPTO_NULL_CBC:
txf = swd->sw_exf;
if (swd->sw_kschedule)
txf->zerokey(&(swd->sw_kschedule));
break;
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
case CRYPTO_NULL_HMAC:
axf = swd->sw_axf;
if (swd->sw_ictx) {
bzero(swd->sw_ictx, axf->ctxsize);
free(swd->sw_ictx, M_CRYPTO_DATA);
}
if (swd->sw_octx) {
bzero(swd->sw_octx, axf->ctxsize);
free(swd->sw_octx, M_CRYPTO_DATA);
}
break;
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
axf = swd->sw_axf;
if (swd->sw_ictx) {
bzero(swd->sw_ictx, axf->ctxsize);
free(swd->sw_ictx, M_CRYPTO_DATA);
}
if (swd->sw_octx) {
bzero(swd->sw_octx, swd->sw_klen);
free(swd->sw_octx, M_CRYPTO_DATA);
}
break;
case CRYPTO_MD5:
case CRYPTO_SHA1:
axf = swd->sw_axf;
if (swd->sw_ictx)
free(swd->sw_ictx, M_CRYPTO_DATA);
break;
case CRYPTO_DEFLATE_COMP:
- cxf = swd->sw_cxf;
+ /* Nothing to do */
break;
}
free(swd, M_CRYPTO_DATA);
}
return 0;
}
/*
* Process a software request.
*/
static int
swcr_process(device_t dev, struct cryptop *crp, int hint)
{
struct cryptodesc *crd;
struct swcr_data *sw;
u_int32_t lid;
/* Sanity check */
if (crp == NULL)
return EINVAL;
if (crp->crp_desc == NULL || crp->crp_buf == NULL) {
crp->crp_etype = EINVAL;
goto done;
}
lid = CRYPTO_SESID2LID(crp->crp_sid);
rw_rlock(&swcr_sessions_lock);
if (swcr_sessions == NULL || lid >= swcr_sesnum || lid == 0 ||
swcr_sessions[lid] == NULL) {
rw_runlock(&swcr_sessions_lock);
crp->crp_etype = ENOENT;
goto done;
}
rw_runlock(&swcr_sessions_lock);
/* Go through crypto descriptors, processing as we go */
for (crd = crp->crp_desc; crd; crd = crd->crd_next) {
/*
* Find the crypto context.
*
* XXX Note that the logic here prevents us from having
* XXX the same algorithm multiple times in a session
* XXX (or rather, we can but it won't give us the right
* XXX results). To do that, we'd need some way of differentiating
* XXX between the various instances of an algorithm (so we can
* XXX locate the correct crypto context).
*/
rw_rlock(&swcr_sessions_lock);
if (swcr_sessions == NULL) {
rw_runlock(&swcr_sessions_lock);
crp->crp_etype = ENOENT;
goto done;
}
for (sw = swcr_sessions[lid];
sw && sw->sw_alg != crd->crd_alg;
sw = sw->sw_next)
;
rw_runlock(&swcr_sessions_lock);
/* No such context ? */
if (sw == NULL) {
crp->crp_etype = EINVAL;
goto done;
}
switch (sw->sw_alg) {
case CRYPTO_DES_CBC:
case CRYPTO_3DES_CBC:
case CRYPTO_BLF_CBC:
case CRYPTO_CAST_CBC:
case CRYPTO_SKIPJACK_CBC:
case CRYPTO_RIJNDAEL128_CBC:
case CRYPTO_AES_XTS:
case CRYPTO_AES_ICM:
case CRYPTO_CAMELLIA_CBC:
if ((crp->crp_etype = swcr_encdec(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
break;
case CRYPTO_NULL_CBC:
crp->crp_etype = 0;
break;
case CRYPTO_MD5_HMAC:
case CRYPTO_SHA1_HMAC:
case CRYPTO_SHA2_256_HMAC:
case CRYPTO_SHA2_384_HMAC:
case CRYPTO_SHA2_512_HMAC:
case CRYPTO_RIPEMD160_HMAC:
case CRYPTO_NULL_HMAC:
case CRYPTO_MD5_KPDK:
case CRYPTO_SHA1_KPDK:
case CRYPTO_MD5:
case CRYPTO_SHA1:
if ((crp->crp_etype = swcr_authcompute(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
break;
case CRYPTO_AES_NIST_GCM_16:
case CRYPTO_AES_NIST_GMAC:
case CRYPTO_AES_128_NIST_GMAC:
case CRYPTO_AES_192_NIST_GMAC:
case CRYPTO_AES_256_NIST_GMAC:
crp->crp_etype = swcr_authenc(crp);
goto done;
case CRYPTO_DEFLATE_COMP:
if ((crp->crp_etype = swcr_compdec(crd, sw,
crp->crp_buf, crp->crp_flags)) != 0)
goto done;
else
crp->crp_olen = (int)sw->sw_size;
break;
default:
/* Unknown/unsupported algorithm */
crp->crp_etype = EINVAL;
goto done;
}
}
done:
crypto_done(crp);
return 0;
}
static void
swcr_identify(driver_t *drv, device_t parent)
{
/* NB: order 10 is so we get attached after h/w devices */
if (device_find_child(parent, "cryptosoft", -1) == NULL &&
BUS_ADD_CHILD(parent, 10, "cryptosoft", 0) == 0)
panic("cryptosoft: could not attach");
}
static int
swcr_probe(device_t dev)
{
device_set_desc(dev, "software crypto");
return (BUS_PROBE_NOWILDCARD);
}
static int
swcr_attach(device_t dev)
{
rw_init(&swcr_sessions_lock, "swcr_sessions_lock");
memset(hmac_ipad_buffer, HMAC_IPAD_VAL, HMAC_MAX_BLOCK_LEN);
memset(hmac_opad_buffer, HMAC_OPAD_VAL, HMAC_MAX_BLOCK_LEN);
swcr_id = crypto_get_driverid(dev,
CRYPTOCAP_F_SOFTWARE | CRYPTOCAP_F_SYNC);
if (swcr_id < 0) {
device_printf(dev, "cannot initialize!\n");
return ENOMEM;
}
#define REGISTER(alg) \
crypto_register(swcr_id, alg, 0,0)
REGISTER(CRYPTO_DES_CBC);
REGISTER(CRYPTO_3DES_CBC);
REGISTER(CRYPTO_BLF_CBC);
REGISTER(CRYPTO_CAST_CBC);
REGISTER(CRYPTO_SKIPJACK_CBC);
REGISTER(CRYPTO_NULL_CBC);
REGISTER(CRYPTO_MD5_HMAC);
REGISTER(CRYPTO_SHA1_HMAC);
REGISTER(CRYPTO_SHA2_256_HMAC);
REGISTER(CRYPTO_SHA2_384_HMAC);
REGISTER(CRYPTO_SHA2_512_HMAC);
REGISTER(CRYPTO_RIPEMD160_HMAC);
REGISTER(CRYPTO_NULL_HMAC);
REGISTER(CRYPTO_MD5_KPDK);
REGISTER(CRYPTO_SHA1_KPDK);
REGISTER(CRYPTO_MD5);
REGISTER(CRYPTO_SHA1);
REGISTER(CRYPTO_RIJNDAEL128_CBC);
REGISTER(CRYPTO_AES_XTS);
REGISTER(CRYPTO_AES_ICM);
REGISTER(CRYPTO_AES_NIST_GCM_16);
REGISTER(CRYPTO_AES_NIST_GMAC);
REGISTER(CRYPTO_AES_128_NIST_GMAC);
REGISTER(CRYPTO_AES_192_NIST_GMAC);
REGISTER(CRYPTO_AES_256_NIST_GMAC);
REGISTER(CRYPTO_CAMELLIA_CBC);
REGISTER(CRYPTO_DEFLATE_COMP);
#undef REGISTER
return 0;
}
static int
swcr_detach(device_t dev)
{
crypto_unregister_all(swcr_id);
rw_wlock(&swcr_sessions_lock);
free(swcr_sessions, M_CRYPTO_DATA);
swcr_sessions = NULL;
rw_wunlock(&swcr_sessions_lock);
rw_destroy(&swcr_sessions_lock);
return 0;
}
static device_method_t swcr_methods[] = {
DEVMETHOD(device_identify, swcr_identify),
DEVMETHOD(device_probe, swcr_probe),
DEVMETHOD(device_attach, swcr_attach),
DEVMETHOD(device_detach, swcr_detach),
DEVMETHOD(cryptodev_newsession, swcr_newsession),
DEVMETHOD(cryptodev_freesession,swcr_freesession),
DEVMETHOD(cryptodev_process, swcr_process),
{0, 0},
};
static driver_t swcr_driver = {
"cryptosoft",
swcr_methods,
0, /* NB: no softc */
};
static devclass_t swcr_devclass;
/*
* NB: We explicitly reference the crypto module so we
* get the necessary ordering when built as a loadable
* module. This is required because we bundle the crypto
* module code together with the cryptosoft driver (otherwise
* normal module dependencies would handle things).
*/
extern int crypto_modevent(struct module *, int, void *);
/* XXX where to attach */
DRIVER_MODULE(cryptosoft, nexus, swcr_driver, swcr_devclass, crypto_modevent,0);
MODULE_VERSION(cryptosoft, 1);
MODULE_DEPEND(cryptosoft, crypto, 1, 1, 1);
Index: head/sys/rpc/clnt_dg.c
===================================================================
--- head/sys/rpc/clnt_dg.c (revision 327172)
+++ head/sys/rpc/clnt_dg.c (revision 327173)
@@ -1,1155 +1,1151 @@
/* $NetBSD: clnt_dg.c,v 1.4 2000/07/14 08:40:41 fvdl Exp $ */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2009, Sun Microsystems, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of Sun Microsystems, Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1986-1991 by Sun Microsystems Inc.
*/
#if defined(LIBC_SCCS) && !defined(lint)
#ident "@(#)clnt_dg.c 1.23 94/04/22 SMI"
static char sccsid[] = "@(#)clnt_dg.c 1.19 89/03/16 Copyr 1988 Sun Micro";
#endif
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Implements a connectionless client side RPC.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <net/vnet.h>
#include <rpc/rpc.h>
#include <rpc/rpc_com.h>
#ifdef _FREEFALL_CONFIG
/*
* Disable RPC exponential back-off for FreeBSD.org systems.
*/
#define RPC_MAX_BACKOFF 1 /* second */
#else
#define RPC_MAX_BACKOFF 30 /* seconds */
#endif
static bool_t time_not_ok(struct timeval *);
static enum clnt_stat clnt_dg_call(CLIENT *, struct rpc_callextra *,
rpcproc_t, struct mbuf *, struct mbuf **, struct timeval);
static void clnt_dg_geterr(CLIENT *, struct rpc_err *);
static bool_t clnt_dg_freeres(CLIENT *, xdrproc_t, void *);
static void clnt_dg_abort(CLIENT *);
static bool_t clnt_dg_control(CLIENT *, u_int, void *);
static void clnt_dg_close(CLIENT *);
static void clnt_dg_destroy(CLIENT *);
static int clnt_dg_soupcall(struct socket *so, void *arg, int waitflag);
static struct clnt_ops clnt_dg_ops = {
.cl_call = clnt_dg_call,
.cl_abort = clnt_dg_abort,
.cl_geterr = clnt_dg_geterr,
.cl_freeres = clnt_dg_freeres,
.cl_close = clnt_dg_close,
.cl_destroy = clnt_dg_destroy,
.cl_control = clnt_dg_control
};
/*
* A pending RPC request which awaits a reply. Requests which have
* received their reply will have cr_xid set to zero and cr_mrep to
* the mbuf chain of the reply.
*/
struct cu_request {
TAILQ_ENTRY(cu_request) cr_link;
CLIENT *cr_client; /* owner */
uint32_t cr_xid; /* XID of request */
struct mbuf *cr_mrep; /* reply received by upcall */
int cr_error; /* any error from upcall */
char cr_verf[MAX_AUTH_BYTES]; /* reply verf */
};
TAILQ_HEAD(cu_request_list, cu_request);
#define MCALL_MSG_SIZE 24
/*
* This structure is pointed to by the socket buffer's sb_upcallarg
* member. It is separate from the client private data to facilitate
* multiple clients sharing the same socket. The cs_lock mutex is used
* to protect all fields of this structure; the socket's receive
* buffer SOCKBUF_LOCK is used to ensure that exactly one of these
* structures is installed on the socket.
*/
struct cu_socket {
struct mtx cs_lock;
int cs_refs; /* Count of clients */
struct cu_request_list cs_pending; /* Requests awaiting replies */
int cs_upcallrefs; /* Refcnt of upcalls in progress. */
};
static void clnt_dg_upcallsdone(struct socket *, struct cu_socket *);
/*
* Private data kept per client handle
*/
struct cu_data {
int cu_threads; /* # threads in clnt_vc_call */
bool_t cu_closing; /* TRUE if we are closing */
bool_t cu_closed; /* TRUE if we are closed */
struct socket *cu_socket; /* connection socket */
bool_t cu_closeit; /* opened by library */
struct sockaddr_storage cu_raddr; /* remote address */
int cu_rlen;
struct timeval cu_wait; /* retransmit interval */
struct timeval cu_total; /* total time for the call */
struct rpc_err cu_error;
uint32_t cu_xid;
char cu_mcallc[MCALL_MSG_SIZE]; /* marshalled callmsg */
size_t cu_mcalllen;
size_t cu_sendsz; /* send size */
size_t cu_recvsz; /* recv size */
int cu_async;
int cu_connect; /* Use connect(). */
int cu_connected; /* Have done connect(). */
const char *cu_waitchan;
int cu_waitflag;
int cu_cwnd; /* congestion window */
int cu_sent; /* number of in-flight RPCs */
bool_t cu_cwnd_wait;
};
#define CWNDSCALE 256
#define MAXCWND (32 * CWNDSCALE)
/*
* Connectionless client creation returns a client handle with the given parameters.
* Default options are set, which the user can change using clnt_control().
* fd should be open and bound.
* NB: The rpch->cl_auth is initialized to null authentication.
* Caller may wish to set this to something more useful.
*
* sendsz and recvsz are the maximum allowable packet sizes that can be
* sent and received. Normally they are the same, but they can be
* changed to improve the program efficiency and buffer allocation.
* If they are 0, use the transport default.
*
* If svcaddr is NULL, returns NULL.
*/
CLIENT *
clnt_dg_create(
struct socket *so,
struct sockaddr *svcaddr, /* servers address */
rpcprog_t program, /* program number */
rpcvers_t version, /* version number */
size_t sendsz, /* buffer send size */
size_t recvsz) /* buffer recv size */
{
CLIENT *cl = NULL; /* client handle */
struct cu_data *cu = NULL; /* private data */
struct cu_socket *cs = NULL;
struct sockbuf *sb;
struct timeval now;
struct rpc_msg call_msg;
struct __rpc_sockinfo si;
XDR xdrs;
int error;
if (svcaddr == NULL) {
rpc_createerr.cf_stat = RPC_UNKNOWNADDR;
return (NULL);
}
if (!__rpc_socket2sockinfo(so, &si)) {
rpc_createerr.cf_stat = RPC_TLIERROR;
rpc_createerr.cf_error.re_errno = 0;
return (NULL);
}
/*
* Find the receive and the send size
*/
sendsz = __rpc_get_t_size(si.si_af, si.si_proto, (int)sendsz);
recvsz = __rpc_get_t_size(si.si_af, si.si_proto, (int)recvsz);
if ((sendsz == 0) || (recvsz == 0)) {
rpc_createerr.cf_stat = RPC_TLIERROR; /* XXX */
rpc_createerr.cf_error.re_errno = 0;
return (NULL);
}
cl = mem_alloc(sizeof (CLIENT));
/*
* Round up to a multiple of 4 for XDR; rounddown(x + 3, 4)
* rounds x up to the next multiple of 4.
*/
sendsz = rounddown(sendsz + 3, 4);
recvsz = rounddown(recvsz + 3, 4);
cu = mem_alloc(sizeof (*cu));
cu->cu_threads = 0;
cu->cu_closing = FALSE;
cu->cu_closed = FALSE;
(void) memcpy(&cu->cu_raddr, svcaddr, (size_t)svcaddr->sa_len);
cu->cu_rlen = svcaddr->sa_len;
/* Other values can also be set through clnt_control() */
cu->cu_wait.tv_sec = 3; /* heuristically chosen */
cu->cu_wait.tv_usec = 0;
cu->cu_total.tv_sec = -1;
cu->cu_total.tv_usec = -1;
cu->cu_sendsz = sendsz;
cu->cu_recvsz = recvsz;
cu->cu_async = FALSE;
cu->cu_connect = FALSE;
cu->cu_connected = FALSE;
cu->cu_waitchan = "rpcrecv";
cu->cu_waitflag = 0;
cu->cu_cwnd = MAXCWND / 2;
cu->cu_sent = 0;
cu->cu_cwnd_wait = FALSE;
(void) getmicrotime(&now);
cu->cu_xid = __RPC_GETXID(&now);
call_msg.rm_xid = cu->cu_xid;
call_msg.rm_call.cb_prog = program;
call_msg.rm_call.cb_vers = version;
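/*
 * Pre-marshal the static part of the call header into cu_mcallc
 * once; clnt_dg_call() copies it into each request, overwrites the
 * XID in place and then appends the procedure number and credentials.
 */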
xdrmem_create(&xdrs, cu->cu_mcallc, MCALL_MSG_SIZE, XDR_ENCODE);
if (! xdr_callhdr(&xdrs, &call_msg)) {
rpc_createerr.cf_stat = RPC_CANTENCODEARGS; /* XXX */
rpc_createerr.cf_error.re_errno = 0;
goto err2;
}
cu->cu_mcalllen = XDR_GETPOS(&xdrs);
/*
* By default, closeit is always FALSE. It is the user's responsibility
* to close the socket, or the user may use clnt_control to let
* clnt_destroy do it for them.
*/
cu->cu_closeit = FALSE;
cu->cu_socket = so;
error = soreserve(so, (u_long)sendsz, (u_long)recvsz);
if (error != 0) {
rpc_createerr.cf_stat = RPC_FAILED;
rpc_createerr.cf_error.re_errno = error;
goto err2;
}
sb = &so->so_rcv;
SOCKBUF_LOCK(&so->so_rcv);
recheck_socket:
if (sb->sb_upcall) {
if (sb->sb_upcall != clnt_dg_soupcall) {
SOCKBUF_UNLOCK(&so->so_rcv);
printf("clnt_dg_create(): socket already has an incompatible upcall\n");
goto err2;
}
cs = (struct cu_socket *) sb->sb_upcallarg;
mtx_lock(&cs->cs_lock);
cs->cs_refs++;
mtx_unlock(&cs->cs_lock);
} else {
/*
* We are the first on this socket - allocate the
* structure and install it in the socket.
*/
SOCKBUF_UNLOCK(&so->so_rcv);
cs = mem_alloc(sizeof(*cs));
SOCKBUF_LOCK(&so->so_rcv);
if (sb->sb_upcall) {
/*
* We have lost a race with some other client.
*/
mem_free(cs, sizeof(*cs));
goto recheck_socket;
}
mtx_init(&cs->cs_lock, "cs->cs_lock", NULL, MTX_DEF);
cs->cs_refs = 1;
cs->cs_upcallrefs = 0;
TAILQ_INIT(&cs->cs_pending);
soupcall_set(so, SO_RCV, clnt_dg_soupcall, cs);
}
SOCKBUF_UNLOCK(&so->so_rcv);
cl->cl_refs = 1;
cl->cl_ops = &clnt_dg_ops;
cl->cl_private = (caddr_t)(void *)cu;
cl->cl_auth = authnone_create();
cl->cl_tp = NULL;
cl->cl_netid = NULL;
return (cl);
err2:
mem_free(cl, sizeof (CLIENT));
mem_free(cu, sizeof (*cu));
return (NULL);
}
static enum clnt_stat
clnt_dg_call(
CLIENT *cl, /* client handle */
struct rpc_callextra *ext, /* call metadata */
rpcproc_t proc, /* procedure number */
struct mbuf *args, /* pointer to args */
struct mbuf **resultsp, /* pointer to results */
struct timeval utimeout) /* seconds to wait before giving up */
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct rpc_timers *rt;
AUTH *auth;
struct rpc_err *errp;
enum clnt_stat stat;
XDR xdrs;
struct rpc_msg reply_msg;
bool_t ok;
int retrans; /* number of re-transmits so far */
int nrefreshes = 2; /* number of times to refresh cred */
struct timeval *tvp;
int timeout;
int retransmit_time;
int next_sendtime, starttime, rtt, time_waited, tv = 0;
struct sockaddr *sa;
- socklen_t salen;
uint32_t xid = 0;
struct mbuf *mreq = NULL, *results;
struct cu_request *cr;
int error;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
cr = malloc(sizeof(struct cu_request), M_RPC, M_WAITOK);
mtx_lock(&cs->cs_lock);
if (cu->cu_closing || cu->cu_closed) {
mtx_unlock(&cs->cs_lock);
free(cr, M_RPC);
return (RPC_CANTSEND);
}
cu->cu_threads++;
if (ext) {
auth = ext->rc_auth;
errp = &ext->rc_err;
} else {
auth = cl->cl_auth;
errp = &cu->cu_error;
}
cr->cr_client = cl;
cr->cr_mrep = NULL;
cr->cr_error = 0;
if (cu->cu_total.tv_usec == -1) {
tvp = &utimeout; /* use supplied timeout */
} else {
tvp = &cu->cu_total; /* use default timeout */
}
if (tvp->tv_sec || tvp->tv_usec)
timeout = tvtohz(tvp);
else
timeout = 0;
if (cu->cu_connect && !cu->cu_connected) {
mtx_unlock(&cs->cs_lock);
error = soconnect(cu->cu_socket,
(struct sockaddr *)&cu->cu_raddr, curthread);
mtx_lock(&cs->cs_lock);
if (error) {
errp->re_errno = error;
errp->re_status = stat = RPC_CANTSEND;
goto out;
}
cu->cu_connected = 1;
}
- if (cu->cu_connected) {
+ if (cu->cu_connected)
sa = NULL;
- salen = 0;
- } else {
+ else
sa = (struct sockaddr *)&cu->cu_raddr;
- salen = cu->cu_rlen;
- }
time_waited = 0;
retrans = 0;
if (ext && ext->rc_timers) {
rt = ext->rc_timers;
if (!rt->rt_rtxcur)
rt->rt_rtxcur = tvtohz(&cu->cu_wait);
retransmit_time = next_sendtime = rt->rt_rtxcur;
} else {
rt = NULL;
retransmit_time = next_sendtime = tvtohz(&cu->cu_wait);
}
starttime = ticks;
call_again:
mtx_assert(&cs->cs_lock, MA_OWNED);
cu->cu_xid++;
xid = cu->cu_xid;
send_again:
mtx_unlock(&cs->cs_lock);
mreq = m_gethdr(M_WAITOK, MT_DATA);
KASSERT(cu->cu_mcalllen <= MHLEN, ("RPC header too big"));
bcopy(cu->cu_mcallc, mreq->m_data, cu->cu_mcalllen);
mreq->m_len = cu->cu_mcalllen;
/*
* The XID is the first thing in the request.
*/
*mtod(mreq, uint32_t *) = htonl(xid);
xdrmbuf_create(&xdrs, mreq, XDR_ENCODE);
if (cu->cu_async == TRUE && args == NULL)
goto get_reply;
if ((! XDR_PUTINT32(&xdrs, &proc)) ||
(! AUTH_MARSHALL(auth, xid, &xdrs,
m_copym(args, 0, M_COPYALL, M_WAITOK)))) {
errp->re_status = stat = RPC_CANTENCODEARGS;
mtx_lock(&cs->cs_lock);
goto out;
}
mreq->m_pkthdr.len = m_length(mreq, NULL);
cr->cr_xid = xid;
mtx_lock(&cs->cs_lock);
/*
* Try to get a place in the congestion window.
*/
while (cu->cu_sent >= cu->cu_cwnd) {
cu->cu_cwnd_wait = TRUE;
error = msleep(&cu->cu_cwnd_wait, &cs->cs_lock,
cu->cu_waitflag, "rpccwnd", 0);
if (error) {
errp->re_errno = error;
if (error == EINTR || error == ERESTART)
errp->re_status = stat = RPC_INTR;
else
errp->re_status = stat = RPC_CANTSEND;
goto out;
}
}
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending, cr, cr_link);
mtx_unlock(&cs->cs_lock);
/*
* sosend consumes mreq.
*/
error = sosend(cu->cu_socket, sa, NULL, mreq, NULL, 0, curthread);
mreq = NULL;
/*
* sub-optimal code appears here because we have
* some clock time to spare while the packets are in flight.
* (We assume that this is actually only executed once.)
*/
reply_msg.acpted_rply.ar_verf.oa_flavor = AUTH_NULL;
reply_msg.acpted_rply.ar_verf.oa_base = cr->cr_verf;
reply_msg.acpted_rply.ar_verf.oa_length = 0;
reply_msg.acpted_rply.ar_results.where = NULL;
reply_msg.acpted_rply.ar_results.proc = (xdrproc_t)xdr_void;
mtx_lock(&cs->cs_lock);
if (error) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_errno = error;
errp->re_status = stat = RPC_CANTSEND;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
/*
* Check to see if we got an upcall while waiting for the
* lock.
*/
if (cr->cr_error) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_errno = cr->cr_error;
errp->re_status = stat = RPC_CANTRECV;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
if (cr->cr_mrep) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto got_reply;
}
/*
* Hack to provide rpc-based message passing
*/
if (timeout == 0) {
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
errp->re_status = stat = RPC_TIMEDOUT;
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
goto out;
}
get_reply:
for (;;) {
/* Decide how long to wait. */
if (next_sendtime < timeout)
tv = next_sendtime;
else
tv = timeout;
tv -= time_waited;
if (tv > 0) {
if (cu->cu_closing || cu->cu_closed) {
error = 0;
cr->cr_error = ESHUTDOWN;
} else {
error = msleep(cr, &cs->cs_lock,
cu->cu_waitflag, cu->cu_waitchan, tv);
}
} else {
error = EWOULDBLOCK;
}
TAILQ_REMOVE(&cs->cs_pending, cr, cr_link);
cu->cu_sent -= CWNDSCALE;
if (cu->cu_cwnd_wait) {
cu->cu_cwnd_wait = FALSE;
wakeup(&cu->cu_cwnd_wait);
}
if (!error) {
/*
* We were woken up by the upcall. If the
* upcall had a receive error, report that,
* otherwise we have a reply.
*/
if (cr->cr_error) {
errp->re_errno = cr->cr_error;
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
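/*
 * A reply arrived: open the congestion window additively.
 * cu_cwnd is kept in CWNDSCALE fixed-point units, so the update
 * below adds roughly one CWNDSCALE per full window of replies,
 * much like TCP congestion avoidance.
 */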
cu->cu_cwnd += (CWNDSCALE * CWNDSCALE
+ cu->cu_cwnd / 2) / cu->cu_cwnd;
if (cu->cu_cwnd > MAXCWND)
cu->cu_cwnd = MAXCWND;
if (rt) {
/*
* Add one to the time since a tick
* count of N means that the actual
* time taken was somewhere between N
* and N+1.
*/
rtt = ticks - starttime + 1;
/*
* Update our estimate of the round
* trip time using roughly the
* algorithm described in RFC
* 2988. Given an RTT sample R:
*
* RTTVAR = (1-beta) * RTTVAR + beta * |SRTT-R|
* SRTT = (1-alpha) * SRTT + alpha * R
*
* where alpha = 0.125 and beta = 0.25.
*
* The initial retransmit timeout is
* SRTT + 4*RTTVAR and doubles on each
* retransmission.
*/
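/*
 * In the code below, error / 8 applies the gain alpha = 1/8 to
 * SRTT (rt_srtt) and error / 4 applies beta = 1/4 to RTTVAR
 * (rt_deviate), with all quantities kept in units of ticks.
 */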
if (rt->rt_srtt == 0) {
rt->rt_srtt = rtt;
rt->rt_deviate = rtt / 2;
} else {
int32_t error = rtt - rt->rt_srtt;
rt->rt_srtt += error / 8;
error = abs(error) - rt->rt_deviate;
rt->rt_deviate += error / 4;
}
rt->rt_rtxcur = rt->rt_srtt + 4*rt->rt_deviate;
}
break;
}
/*
* The sleep returned an error so our request is still
* on the list. If we got EWOULDBLOCK, we may want to
* re-send the request.
*/
if (error != EWOULDBLOCK) {
errp->re_errno = error;
if (error == EINTR || error == ERESTART)
errp->re_status = stat = RPC_INTR;
else
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
time_waited = ticks - starttime;
/* Check for timeout. */
if (time_waited > timeout) {
errp->re_errno = EWOULDBLOCK;
errp->re_status = stat = RPC_TIMEDOUT;
goto out;
}
/* Retransmit if necessary. */
if (time_waited >= next_sendtime) {
cu->cu_cwnd /= 2;
if (cu->cu_cwnd < CWNDSCALE)
cu->cu_cwnd = CWNDSCALE;
if (ext && ext->rc_feedback) {
mtx_unlock(&cs->cs_lock);
if (retrans == 0)
ext->rc_feedback(FEEDBACK_REXMIT1,
proc, ext->rc_feedback_arg);
else
ext->rc_feedback(FEEDBACK_REXMIT2,
proc, ext->rc_feedback_arg);
mtx_lock(&cs->cs_lock);
}
if (cu->cu_closing || cu->cu_closed) {
errp->re_errno = ESHUTDOWN;
errp->re_status = stat = RPC_CANTRECV;
goto out;
}
retrans++;
/* Double the retransmit interval (exponential backoff), capped at RPC_MAX_BACKOFF seconds. */
if (retransmit_time < RPC_MAX_BACKOFF * hz)
retransmit_time = 2 * retransmit_time;
next_sendtime += retransmit_time;
goto send_again;
}
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending, cr, cr_link);
}
got_reply:
/*
* Now decode and validate the response. We need to drop the
* lock since xdr_replymsg may end up sleeping in malloc.
*/
mtx_unlock(&cs->cs_lock);
if (ext && ext->rc_feedback)
ext->rc_feedback(FEEDBACK_OK, proc, ext->rc_feedback_arg);
xdrmbuf_create(&xdrs, cr->cr_mrep, XDR_DECODE);
ok = xdr_replymsg(&xdrs, &reply_msg);
cr->cr_mrep = NULL;
if (ok) {
if ((reply_msg.rm_reply.rp_stat == MSG_ACCEPTED) &&
(reply_msg.acpted_rply.ar_stat == SUCCESS))
errp->re_status = stat = RPC_SUCCESS;
else
stat = _seterr_reply(&reply_msg, &(cu->cu_error));
if (errp->re_status == RPC_SUCCESS) {
results = xdrmbuf_getall(&xdrs);
if (! AUTH_VALIDATE(auth, xid,
&reply_msg.acpted_rply.ar_verf,
&results)) {
errp->re_status = stat = RPC_AUTHERROR;
errp->re_why = AUTH_INVALIDRESP;
if (retrans &&
auth->ah_cred.oa_flavor == RPCSEC_GSS) {
/*
* If we retransmitted, it's
* possible that we will
* receive a reply for one of
* the earlier transmissions
* (which will use an older
* RPCSEC_GSS sequence
* number). In this case, just
* go back and listen for a
* new reply. We could keep a
* record of all the seq
* numbers we have transmitted
* so far so that we could
* accept a reply for any of
* them here.
*/
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
cu->cu_sent += CWNDSCALE;
TAILQ_INSERT_TAIL(&cs->cs_pending,
cr, cr_link);
cr->cr_mrep = NULL;
goto get_reply;
}
} else {
*resultsp = results;
}
} /* end successful completion */
/*
* If unsuccessful AND error is an authentication error
* then refresh credentials and try again, else break
*/
else if (stat == RPC_AUTHERROR)
/* maybe our credentials need to be refreshed ... */
if (nrefreshes > 0 &&
AUTH_REFRESH(auth, &reply_msg)) {
nrefreshes--;
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
goto call_again;
}
/* end of unsuccessful completion */
} /* end of valid reply message */
else {
errp->re_status = stat = RPC_CANTDECODERES;
}
XDR_DESTROY(&xdrs);
mtx_lock(&cs->cs_lock);
out:
mtx_assert(&cs->cs_lock, MA_OWNED);
if (mreq)
m_freem(mreq);
if (cr->cr_mrep)
m_freem(cr->cr_mrep);
cu->cu_threads--;
if (cu->cu_closing)
wakeup(cu);
mtx_unlock(&cs->cs_lock);
if (auth && stat != RPC_SUCCESS)
AUTH_VALIDATE(auth, xid, NULL, NULL);
free(cr, M_RPC);
return (stat);
}
static void
clnt_dg_geterr(CLIENT *cl, struct rpc_err *errp)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
*errp = cu->cu_error;
}
static bool_t
clnt_dg_freeres(CLIENT *cl, xdrproc_t xdr_res, void *res_ptr)
{
XDR xdrs;
bool_t dummy;
xdrs.x_op = XDR_FREE;
dummy = (*xdr_res)(&xdrs, res_ptr);
return (dummy);
}
/*ARGSUSED*/
static void
clnt_dg_abort(CLIENT *h)
{
}
static bool_t
clnt_dg_control(CLIENT *cl, u_int request, void *info)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct sockaddr *addr;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
mtx_lock(&cs->cs_lock);
switch (request) {
case CLSET_FD_CLOSE:
cu->cu_closeit = TRUE;
mtx_unlock(&cs->cs_lock);
return (TRUE);
case CLSET_FD_NCLOSE:
cu->cu_closeit = FALSE;
mtx_unlock(&cs->cs_lock);
return (TRUE);
}
/* for other requests which use info */
if (info == NULL) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
switch (request) {
case CLSET_TIMEOUT:
if (time_not_ok((struct timeval *)info)) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
cu->cu_total = *(struct timeval *)info;
break;
case CLGET_TIMEOUT:
*(struct timeval *)info = cu->cu_total;
break;
case CLSET_RETRY_TIMEOUT:
if (time_not_ok((struct timeval *)info)) {
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
cu->cu_wait = *(struct timeval *)info;
break;
case CLGET_RETRY_TIMEOUT:
*(struct timeval *)info = cu->cu_wait;
break;
case CLGET_SVC_ADDR:
/*
* Slightly different semantics to userland - we use
* sockaddr instead of netbuf.
*/
memcpy(info, &cu->cu_raddr, cu->cu_raddr.ss_len);
break;
case CLSET_SVC_ADDR: /* set to new address */
addr = (struct sockaddr *)info;
(void) memcpy(&cu->cu_raddr, addr, addr->sa_len);
break;
case CLGET_XID:
*(uint32_t *)info = cu->cu_xid;
break;
case CLSET_XID:
/* This will set the xid of the NEXT call */
/* decrement by 1 as clnt_dg_call() increments once */
cu->cu_xid = *(uint32_t *)info - 1;
break;
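/*
 * The pre-marshalled call header layout, in XDR units, is:
 * 0 xid, 1 direction, 2 rpcvers, 3 prog, 4 vers; the procedure
 * number is appended separately for each call. CLGET/CLSET_VERS
 * and CLGET/CLSET_PROG below patch the header in place at those
 * offsets.
 */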
case CLGET_VERS:
/*
* This RELIES on the information that, in the call body,
* the version number field is the fifth field from the
* beginning of the RPC header. MUST be changed if the
* call_struct is changed
*/
*(uint32_t *)info =
ntohl(*(uint32_t *)(void *)(cu->cu_mcallc +
4 * BYTES_PER_XDR_UNIT));
break;
case CLSET_VERS:
*(uint32_t *)(void *)(cu->cu_mcallc + 4 * BYTES_PER_XDR_UNIT)
= htonl(*(uint32_t *)info);
break;
case CLGET_PROG:
/*
* This RELIES on the information that, in the call body,
* the program number field is the fourth field from the
* beginning of the RPC header. MUST be changed if the
* call_struct is changed
*/
*(uint32_t *)info =
ntohl(*(uint32_t *)(void *)(cu->cu_mcallc +
3 * BYTES_PER_XDR_UNIT));
break;
case CLSET_PROG:
*(uint32_t *)(void *)(cu->cu_mcallc + 3 * BYTES_PER_XDR_UNIT)
= htonl(*(uint32_t *)info);
break;
case CLSET_ASYNC:
cu->cu_async = *(int *)info;
break;
case CLSET_CONNECT:
cu->cu_connect = *(int *)info;
break;
case CLSET_WAITCHAN:
cu->cu_waitchan = (const char *)info;
break;
case CLGET_WAITCHAN:
*(const char **) info = cu->cu_waitchan;
break;
case CLSET_INTERRUPTIBLE:
if (*(int *) info)
cu->cu_waitflag = PCATCH;
else
cu->cu_waitflag = 0;
break;
case CLGET_INTERRUPTIBLE:
if (cu->cu_waitflag)
*(int *) info = TRUE;
else
*(int *) info = FALSE;
break;
default:
mtx_unlock(&cs->cs_lock);
return (FALSE);
}
mtx_unlock(&cs->cs_lock);
return (TRUE);
}
static void
clnt_dg_close(CLIENT *cl)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct cu_request *cr;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
mtx_lock(&cs->cs_lock);
if (cu->cu_closed) {
mtx_unlock(&cs->cs_lock);
return;
}
if (cu->cu_closing) {
while (cu->cu_closing)
msleep(cu, &cs->cs_lock, 0, "rpcclose", 0);
KASSERT(cu->cu_closed, ("client should be closed"));
mtx_unlock(&cs->cs_lock);
return;
}
/*
* Abort any pending requests and wait until everyone
* has finished with clnt_vc_call.
*/
cu->cu_closing = TRUE;
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
if (cr->cr_client == cl) {
cr->cr_xid = 0;
cr->cr_error = ESHUTDOWN;
wakeup(cr);
}
}
while (cu->cu_threads)
msleep(cu, &cs->cs_lock, 0, "rpcclose", 0);
cu->cu_closing = FALSE;
cu->cu_closed = TRUE;
mtx_unlock(&cs->cs_lock);
wakeup(cu);
}
static void
clnt_dg_destroy(CLIENT *cl)
{
struct cu_data *cu = (struct cu_data *)cl->cl_private;
struct cu_socket *cs;
struct socket *so = NULL;
bool_t lastsocketref;
cs = cu->cu_socket->so_rcv.sb_upcallarg;
clnt_dg_close(cl);
SOCKBUF_LOCK(&cu->cu_socket->so_rcv);
mtx_lock(&cs->cs_lock);
cs->cs_refs--;
if (cs->cs_refs == 0) {
mtx_unlock(&cs->cs_lock);
soupcall_clear(cu->cu_socket, SO_RCV);
clnt_dg_upcallsdone(cu->cu_socket, cs);
SOCKBUF_UNLOCK(&cu->cu_socket->so_rcv);
mtx_destroy(&cs->cs_lock);
mem_free(cs, sizeof(*cs));
lastsocketref = TRUE;
} else {
mtx_unlock(&cs->cs_lock);
SOCKBUF_UNLOCK(&cu->cu_socket->so_rcv);
lastsocketref = FALSE;
}
if (cu->cu_closeit && lastsocketref) {
so = cu->cu_socket;
cu->cu_socket = NULL;
}
if (so)
soclose(so);
if (cl->cl_netid && cl->cl_netid[0])
mem_free(cl->cl_netid, strlen(cl->cl_netid) +1);
if (cl->cl_tp && cl->cl_tp[0])
mem_free(cl->cl_tp, strlen(cl->cl_tp) +1);
mem_free(cu, sizeof (*cu));
mem_free(cl, sizeof (CLIENT));
}
/*
* Make sure that the time is not garbage; a value of -1 is allowed.
*/
static bool_t
time_not_ok(struct timeval *t)
{
return (t->tv_sec < -1 || t->tv_sec > 100000000 ||
t->tv_usec < -1 || t->tv_usec > 1000000);
}
int
clnt_dg_soupcall(struct socket *so, void *arg, int waitflag)
{
struct cu_socket *cs = (struct cu_socket *) arg;
struct uio uio;
struct mbuf *m;
struct mbuf *control;
struct cu_request *cr;
int error, rcvflag, foundreq;
uint32_t xid;
cs->cs_upcallrefs++;
uio.uio_resid = 1000000000;
uio.uio_td = curthread;
do {
SOCKBUF_UNLOCK(&so->so_rcv);
m = NULL;
control = NULL;
rcvflag = MSG_DONTWAIT;
error = soreceive(so, NULL, &uio, &m, &control, &rcvflag);
if (control)
m_freem(control);
SOCKBUF_LOCK(&so->so_rcv);
if (error == EWOULDBLOCK)
break;
/*
* If there was an error, wake up all pending
* requests.
*/
if (error) {
mtx_lock(&cs->cs_lock);
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
cr->cr_xid = 0;
cr->cr_error = error;
wakeup(cr);
}
mtx_unlock(&cs->cs_lock);
break;
}
/*
* The XID is in the first uint32_t of the reply.
*/
if (m->m_len < sizeof(xid) && m_length(m, NULL) < sizeof(xid)) {
/*
* Should never happen.
*/
m_freem(m);
continue;
}
m_copydata(m, 0, sizeof(xid), (char *)&xid);
xid = ntohl(xid);
/*
* Attempt to match this reply with a pending request.
*/
mtx_lock(&cs->cs_lock);
foundreq = 0;
TAILQ_FOREACH(cr, &cs->cs_pending, cr_link) {
if (cr->cr_xid == xid) {
/*
* This one matches. We leave the
* reply mbuf in cr->cr_mrep. Set the
* XID to zero so that we will ignore
* any duplicated replies that arrive
* before clnt_dg_call removes it from
* the queue.
*/
cr->cr_xid = 0;
cr->cr_mrep = m;
cr->cr_error = 0;
foundreq = 1;
wakeup(cr);
break;
}
}
mtx_unlock(&cs->cs_lock);
/*
* If we didn't find the matching request, just drop
it - it's probably a repeated reply.
*/
if (!foundreq)
m_freem(m);
} while (m);
cs->cs_upcallrefs--;
if (cs->cs_upcallrefs < 0)
panic("rpcdg upcall refcnt");
if (cs->cs_upcallrefs == 0)
wakeup(&cs->cs_upcallrefs);
return (SU_OK);
}
/*
* Wait for all upcalls in progress to complete.
*/
static void
clnt_dg_upcallsdone(struct socket *so, struct cu_socket *cs)
{
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
while (cs->cs_upcallrefs > 0)
(void) msleep(&cs->cs_upcallrefs, SOCKBUF_MTX(&so->so_rcv), 0,
"rpcdgup", 0);
}
Index: head/sys/security/mac/mac_syscalls.c
===================================================================
--- head/sys/security/mac/mac_syscalls.c (revision 327172)
+++ head/sys/security/mac/mac_syscalls.c (revision 327173)
@@ -1,733 +1,731 @@
/*-
* Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
* Copyright (c) 2001 Ilmar S. Habibulin
* Copyright (c) 2001-2005 Networks Associates Technology, Inc.
* Copyright (c) 2005-2006 SPARTA, Inc.
* Copyright (c) 2008 Apple Inc.
* All rights reserved.
*
* This software was developed by Robert Watson and Ilmar Habibulin for the
* TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* This software was enhanced by SPARTA ISSO under SPAWAR contract
* N66001-04-C-6019 ("SEFOS").
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mac.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/pipe.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
#include <security/mac/mac_internal.h>
#include <security/mac/mac_policy.h>
#ifdef MAC
FEATURE(security_mac, "Mandatory Access Control Framework support");
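/*
 * The __mac_get_*() system calls below share a common shape: copy in
 * the caller's struct mac, validate it, copy in the requested element
 * list, externalize the object's label into a kernel buffer and copy
 * the resulting string back out to userspace.
 */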
int
sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
char *elements, *buffer;
struct mac mac;
struct proc *tproc;
struct ucred *tcred;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
tproc = pfind(uap->pid);
if (tproc == NULL)
return (ESRCH);
tcred = NULL; /* Satisfy gcc. */
error = p_cansee(td, tproc);
if (error == 0)
tcred = crhold(tproc->p_ucred);
PROC_UNLOCK(tproc);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(tcred->cr_label, elements,
buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
int
sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
char *elements, *buffer;
struct mac mac;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(td->td_ucred->cr_label,
elements, buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
struct ucred *newcred, *oldcred;
struct label *intlabel;
struct proc *p;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_CRED))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_cred_label_alloc();
error = mac_cred_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
error = mac_cred_check_relabel(oldcred, intlabel);
if (error) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
setsugid(p);
crcopy(newcred, oldcred);
mac_cred_relabel(newcred, intlabel);
proc_set_cred(p, newcred);
PROC_UNLOCK(p);
crfree(oldcred);
mac_proc_vm_revoke(td);
out:
mac_cred_label_free(intlabel);
return (error);
}
int
sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
char *elements, *buffer;
struct label *intlabel;
struct file *fp;
struct mac mac;
struct vnode *vp;
struct pipe *pipe;
struct socket *so;
cap_rights_t rights;
- short label_type;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_GET), &fp);
if (error)
goto out;
- label_type = fp->f_type;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE)) {
error = EINVAL;
goto out_fdrop;
}
vp = fp->f_vnode;
intlabel = mac_vnode_label_alloc();
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mac_vnode_copy_label(vp->v_label, intlabel);
VOP_UNLOCK(vp, 0);
error = mac_vnode_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE)) {
error = EINVAL;
goto out_fdrop;
}
pipe = fp->f_data;
intlabel = mac_pipe_label_alloc();
PIPE_LOCK(pipe);
mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
PIPE_UNLOCK(pipe);
error = mac_pipe_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
error = EINVAL;
goto out_fdrop;
}
so = fp->f_data;
intlabel = mac_socket_label_alloc(M_WAITOK);
SOCK_LOCK(so);
mac_socket_copy_label(so->so_label, intlabel);
SOCK_UNLOCK(so);
error = mac_socket_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out_fdrop:
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
struct label *intlabel;
struct pipe *pipe;
struct socket *so;
struct file *fp;
struct mount *mp;
struct vnode *vp;
struct mac mac;
cap_rights_t rights;
char *buffer;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_SET), &fp);
if (error)
goto out;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
if (error) {
mac_vnode_label_free(intlabel);
break;
}
vp = fp->f_vnode;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
mac_vnode_label_free(intlabel);
break;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_setlabel(vp, intlabel, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_pipe_label_alloc();
error = mac_pipe_internalize_label(intlabel, buffer);
if (error == 0) {
pipe = fp->f_data;
PIPE_LOCK(pipe);
error = mac_pipe_label_set(td->td_ucred,
pipe->pipe_pair, intlabel);
PIPE_UNLOCK(pipe);
}
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
error = EINVAL;
goto out_fdrop;
}
intlabel = mac_socket_label_alloc(M_WAITOK);
error = mac_socket_internalize_label(intlabel, buffer);
if (error == 0) {
so = fp->f_data;
error = mac_socket_label_set(td->td_ucred, so,
intlabel);
}
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
out_fdrop:
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
return (error);
}
int
sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
struct mac_policy_conf *mpc;
char target[MAC_MAX_POLICY_NAME];
int error;
error = copyinstr(uap->policy, target, sizeof(target), NULL);
if (error)
return (error);
error = ENOSYS;
LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
goto out;
}
}
if (!LIST_EMPTY(&mac_policy_list)) {
mac_policy_slock_sleep();
LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
break;
}
}
mac_policy_sunlock_sleep();
}
out:
return (error);
}
#else /* !MAC */
int
sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
return (ENOSYS);
}
int
sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
return (ENOSYS);
}
int
sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
return (ENOSYS);
}
int
sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
return (ENOSYS);
}
#endif /* !MAC */
Index: head/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- head/sys/ufs/ffs/ffs_alloc.c (revision 327172)
+++ head/sys/ufs/ffs/ffs_alloc.c (revision 327173)
@@ -1,3255 +1,3253 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_quota.h"
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <security/audit/audit.h>
#include <geom/geom.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ffs/softdep.h>
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
int size, int rsize);
static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
static ufs2_daddr_t
ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
struct vnode *, ufs2_daddr_t, long, ino_t,
struct workhead *);
static void ffs_blkfree_trim_completed(struct bio *);
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
int, int);
static ufs2_daddr_t ffs_hashalloc
(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
static void ffs_ckhash_cg(struct buf *);
/*
* Allocate a block in the filesystem.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*/
int
ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
struct inode *ip;
ufs2_daddr_t lbn, bpref;
int size, flags;
struct ucred *cred;
ufs2_daddr_t *bnp;
{
struct fs *fs;
struct ufsmount *ump;
ufs2_daddr_t bno;
u_int cg, reclaimed;
static struct timeval lastfail;
static int curfail;
int64_t delta;
#ifdef QUOTA
int error;
#endif
*bnp = 0;
ump = ITOUMP(ip);
fs = ump->um_fs;
mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, size,
fs->fs_fsmnt);
panic("ffs_alloc: bad size");
}
if (cred == NOCRED)
panic("ffs_alloc: missing credential");
#endif /* INVARIANTS */
reclaimed = 0;
retry:
#ifdef QUOTA
UFS_UNLOCK(ump);
error = chkdq(ip, btodb(size), cred, 0);
if (error)
return (error);
UFS_LOCK(ump);
#endif
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
goto nospace;
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
if (bno > 0) {
delta = btodb(size);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
nospace:
#ifdef QUOTA
UFS_UNLOCK(ump);
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(size), cred, FORCE);
UFS_LOCK(ump);
#endif
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, ip->i_number, "filesystem full");
uprintf("\n%s: write failed, filesystem is full\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*/
int
ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
struct inode *ip;
ufs2_daddr_t lbprev;
ufs2_daddr_t bprev;
ufs2_daddr_t bpref;
int osize, nsize, flags;
struct ucred *cred;
struct buf **bpp;
{
struct vnode *vp;
struct fs *fs;
struct buf *bp;
struct ufsmount *ump;
u_int cg, request, reclaimed;
int error, gbflags;
ufs2_daddr_t bno;
static struct timeval lastfail;
static int curfail;
int64_t delta;
vp = ITOV(ip);
ump = ITOUMP(ip);
fs = ump->um_fs;
bp = NULL;
gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
panic("ffs_realloccg: allocation on suspended filesystem");
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
printf(
"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
nsize, fs->fs_fsmnt);
panic("ffs_realloccg: bad size");
}
if (cred == NOCRED)
panic("ffs_realloccg: missing credential");
#endif /* INVARIANTS */
reclaimed = 0;
retry:
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
goto nospace;
}
if (bprev == 0) {
printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
fs->fs_fsmnt);
panic("ffs_realloccg: bad bprev");
}
UFS_UNLOCK(ump);
/*
* Allocate the extra space in the buffer.
*/
error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
if (error) {
brelse(bp);
return (error);
}
if (bp->b_blkno == bp->b_lblkno) {
if (lbprev >= UFS_NDADDR)
panic("ffs_realloccg: lbprev out of range");
bp->b_blkno = fsbtodb(fs, bprev);
}
#ifdef QUOTA
error = chkdq(ip, btodb(nsize - osize), cred, 0);
if (error) {
brelse(bp);
return (error);
}
#endif
/*
* Check for extension in the existing location.
*/
*bpp = NULL;
cg = dtog(fs, bprev);
UFS_LOCK(ump);
bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
if (bno) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("ffs_realloccg: bad blockno");
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
vfs_bio_bzero_buf(bp, osize, nsize - osize);
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
vfs_bio_set_valid(bp, osize, nsize - osize);
*bpp = bp;
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
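/*
* Illustrative numbers for the optimization switch below (hypothetical,
* not from this code): with fs_dsize = 1,000,000 fragments and
* fs_minfree = 8%, SPACE flips to TIME once cs_nffree falls to
* 1,000,000 * 8 / 200 = 40,000 free fragments or fewer, and TIME flips
* back to SPACE once cs_nffree reaches
* 1,000,000 * (8 - 2) / 100 = 60,000.
*/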
switch ((int)fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree <= 5 ||
fs->fs_cstotal.cs_nffree >
(off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
fs->fs_fsmnt);
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
(off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
fs->fs_fsmnt);
fs->fs_optim = FS_OPTSPACE;
break;
default:
printf("dev = %s, optim = %ld, fs = %s\n",
devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
ip->i_number, vp->v_type, NULL);
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
vfs_bio_bzero_buf(bp, osize, nsize - osize);
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
vfs_bio_set_valid(bp, osize, nsize - osize);
*bpp = bp;
return (0);
}
#ifdef QUOTA
UFS_UNLOCK(ump);
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
UFS_LOCK(ump);
#endif
nospace:
/*
* no space available
*/
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
UFS_UNLOCK(ump);
if (bp) {
brelse(bp);
bp = NULL;
}
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (bp)
brelse(bp);
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, ip->i_number, "filesystem full");
uprintf("\n%s: write failed, filesystem is full\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Reallocate a sequence of blocks into a contiguous sequence of blocks.
*
* The vnode and an array of buffer pointers for a range of sequential
* logical blocks to be made contiguous is given. The allocator attempts
* to find a range of sequential blocks starting as close as possible
* from the end of the allocation for the logical block immediately
* preceding the current range. If successful, the physical block numbers
* in the buffer pointers and in the inode are changed to reflect the new
* allocation. If unsuccessful, the allocation is left unchanged. The
* success in doing the reallocation is returned. Note that the error
* return is not reflected back to the user. Rather the previous block
* allocation will be used.
*/
SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem");
static int doasyncfree = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
"do not force synchronous writes when blocks are reallocated");
static int doreallocblks = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
"enable block reallocation");
static int maxclustersearch = 10;
SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
0, "max number of cylinder group to search for contigous blocks");
#ifdef DEBUG
static volatile int prtrealloc = 0;
#endif
int
ffs_reallocblks(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct ufsmount *ump;
/*
* If the underlying device can do deletes, then skip reallocating
* the blocks of this file into contiguous sequences. Devices that
* benefit from BIO_DELETE also benefit from not moving the data.
* These devices are flash and therefore work less well with this
* optimization. Also skip if reallocblks has been disabled globally.
*/
ump = ap->a_vp->v_mount->mnt_data;
if (ump->um_candelete || doreallocblks == 0)
return (ENOSPC);
/*
* We can't wait in softdep prealloc as it may fsync and recurse
* here. Instead we simply fail to reallocate blocks if this
* rare condition arises.
*/
if (DOINGSOFTDEP(ap->a_vp))
if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
return (ENOSPC);
if (ump->um_fstype == UFS1)
return (ffs_reallocblks_ufs1(ap));
return (ffs_reallocblks_ufs2(ap));
}
static int
ffs_reallocblks_ufs1(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
ufs1_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
ufs_lbn_t start_lbn, end_lbn;
ufs1_daddr_t soff, newblk, blkno;
ufs2_daddr_t pref;
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
int i, cg, len, start_lvl, end_lvl, ssize;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ITOUMP(ip);
fs = ump->um_fs;
/*
* If we are not tracking block clusters or if we have less than 4%
* free blocks left, then do not attempt to cluster. Running with
* less than 5% free block reserve is not recommended and those that
* choose to do so do not expect to have good file layout.
*/
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
return (ENOSPC);
buflist = ap->a_buflist;
len = buflist->bs_nchildren;
start_lbn = buflist->bs_children[0]->b_lblkno;
end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
for (i = 0; i < len; i++)
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 1");
for (i = 1; i < len; i++)
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
panic("ffs_reallocblks: non-logical cluster");
blkno = buflist->bs_children[0]->b_blkno;
ssize = fsbtodb(fs, fs->fs_frag);
for (i = 1; i < len - 1; i++)
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
/*
* If the cluster crosses the boundary for the first indirect
* block, leave space for the indirect block. Indirect blocks
* are initially laid out in a position after the last direct
* block. Block reallocation would usually destroy locality by
* moving the indirect block out of the way to make room for
* data blocks if we didn't compensate here. We should also do
* this for other indirect block boundaries, but it is only
* important for the first one.
*/
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
return (ENOSPC);
/*
* If the latest allocation is in a new cylinder group, assume that
* the filesystem has decided to move and do not force it back to
* the previous cylinder group.
*/
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
return (ENOSPC);
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
return (ENOSPC);
/*
* Get the starting offset and block map for the first block.
*/
if (start_lvl == 0) {
sbap = &ip->i_din1->di_db[0];
soff = start_lbn;
} else {
idp = &start_ap[start_lvl - 1];
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
brelse(sbp);
return (ENOSPC);
}
sbap = (ufs1_daddr_t *)sbp->b_data;
soff = idp->in_off;
}
/*
* If the block range spans two block maps, get the second map.
*/
ebap = NULL;
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
ssize = len;
} else {
#ifdef INVARIANTS
if (start_lvl > 0 &&
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
panic("ffs_reallocblk: start == end");
#endif
ssize = len - (idp->in_off + 1);
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
goto fail;
ebap = (ufs1_daddr_t *)ebp->b_data;
}
/*
* Find the preferred location for the cluster. If we have not
* previously failed at this endeavor, then follow our standard
* preference calculation. If we have failed at it, then pick up
* where we last ended our search.
*/
UFS_LOCK(ump);
if (ip->i_nextclustercg == -1)
pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
else
pref = cgdata(fs, ip->i_nextclustercg);
/*
* Search the block map looking for an allocation of the desired size.
* To avoid wasting too much time, we limit the number of cylinder
* groups that we will search.
*/
cg = dtog(fs, pref);
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
break;
cg += 1;
if (cg >= fs->fs_ncg)
cg = 0;
}
/*
* If we have failed in our search, record where we gave up for
* next time. Otherwise, fall back to our usual search criterion.
*/
if (newblk == 0) {
ip->i_nextclustercg = cg;
UFS_UNLOCK(ump);
goto fail;
}
ip->i_nextclustercg = -1;
/*
* We have found a new contiguous block.
*
* First we have to replace the old block pointers with the new
* block pointers in the inode and indirect blocks associated
* with the file.
*/
#ifdef DEBUG
if (prtrealloc)
printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
(uintmax_t)ip->i_number,
(intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
blkno = newblk;
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
if (i == ssize) {
bap = ebap;
soff = -i;
}
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 2");
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %d,", *bap);
#endif
if (DOINGSOFTDEP(vp)) {
if (sbap == &ip->i_din1->di_db[0] && i < ssize)
softdep_setup_allocdirect(ip, start_lbn + i,
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
buflist->bs_children[i]);
else
softdep_setup_allocindir_page(ip, start_lbn + i,
i < ssize ? sbp : ebp, soff + i, blkno,
*bap, buflist->bs_children[i]);
}
*bap++ = blkno;
}
/*
* Next we must write out the modified inode and indirect blocks.
* For strict correctness, the writes should be synchronous since
* the old block values may have been written to disk. In practice
* they are almost never written, but if we are concerned about
* strict correctness, the `doasyncfree' flag should be set to zero.
*
* The test on `doasyncfree' should be changed to test a flag
* that shows whether the associated buffers and inodes have
* been written. The flag should be set when the cluster is
* started and cleared whenever the buffer or inode is flushed.
* We can then check below to see if it is set, and do the
* synchronous write only when it has been cleared.
*/
if (sbap != &ip->i_din1->di_db[0]) {
if (doasyncfree)
bdwrite(sbp);
else
bwrite(sbp);
} else {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (!doasyncfree)
ffs_update(vp, 1);
}
if (ssize < len) {
if (doasyncfree)
bdwrite(ebp);
else
bwrite(ebp);
}
/*
* Last, free the old blocks and assign the new blocks to the buffers.
*/
#ifdef DEBUG
if (prtrealloc)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %d,", blkno);
#endif
}
#ifdef DEBUG
if (prtrealloc) {
prtrealloc--;
printf("\n");
}
#endif
return (0);
fail:
if (ssize < len)
brelse(ebp);
if (sbap != &ip->i_din1->di_db[0])
brelse(sbp);
return (ENOSPC);
}
static int
ffs_reallocblks_ufs2(ap)
struct vop_reallocblks_args /* {
struct vnode *a_vp;
struct cluster_save *a_buflist;
} */ *ap;
{
struct fs *fs;
struct inode *ip;
struct vnode *vp;
struct buf *sbp, *ebp;
ufs2_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
ufs_lbn_t start_lbn, end_lbn;
ufs2_daddr_t soff, newblk, blkno, pref;
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
int i, cg, len, start_lvl, end_lvl, ssize;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ITOUMP(ip);
fs = ump->um_fs;
/*
* If we are not tracking block clusters or if we have less than 4%
* free blocks left, then do not attempt to cluster. Running with
* less than 5% free block reserve is not recommended and those that
* choose to do so do not expect to have good file layout.
*/
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
return (ENOSPC);
buflist = ap->a_buflist;
len = buflist->bs_nchildren;
start_lbn = buflist->bs_children[0]->b_lblkno;
end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
for (i = 0; i < len; i++)
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 1");
for (i = 1; i < len; i++)
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
panic("ffs_reallocblks: non-logical cluster");
blkno = buflist->bs_children[0]->b_blkno;
ssize = fsbtodb(fs, fs->fs_frag);
for (i = 1; i < len - 1; i++)
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
/*
* If the cluster crosses the boundary for the first indirect
* block, do not move anything in it. Indirect blocks are
* usually initially laid out in a position between the data
* blocks. Block reallocation would usually destroy locality by
* moving the indirect block out of the way to make room for
* data blocks if we didn't compensate here. We should also do
* this for other indirect block boundaries, but it is only
* important for the first one.
*/
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
return (ENOSPC);
/*
* If the latest allocation is in a new cylinder group, assume that
* the filesystem has decided to move and do not force it back to
* the previous cylinder group.
*/
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
return (ENOSPC);
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
return (ENOSPC);
/*
* Get the starting offset and block map for the first block.
*/
if (start_lvl == 0) {
sbap = &ip->i_din2->di_db[0];
soff = start_lbn;
} else {
idp = &start_ap[start_lvl - 1];
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
brelse(sbp);
return (ENOSPC);
}
sbap = (ufs2_daddr_t *)sbp->b_data;
soff = idp->in_off;
}
/*
* If the block range spans two block maps, get the second map.
*/
ebap = NULL;
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
ssize = len;
} else {
#ifdef INVARIANTS
if (start_lvl > 0 &&
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
panic("ffs_reallocblk: start == end");
#endif
ssize = len - (idp->in_off + 1);
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
goto fail;
ebap = (ufs2_daddr_t *)ebp->b_data;
}
/*
* Find the preferred location for the cluster. If we have not
* previously failed at this endeavor, then follow our standard
* preference calculation. If we have failed at it, then pick up
* where we last ended our search.
*/
UFS_LOCK(ump);
if (ip->i_nextclustercg == -1)
pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
else
pref = cgdata(fs, ip->i_nextclustercg);
/*
* Search the block map looking for an allocation of the desired size.
* To avoid wasting too much time, we limit the number of cylinder
* groups that we will search.
*/
cg = dtog(fs, pref);
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
break;
cg += 1;
if (cg >= fs->fs_ncg)
cg = 0;
}
/*
* If we have failed in our search, record where we gave up for
* next time. Otherwise, fall back to our usual search criterion.
*/
if (newblk == 0) {
ip->i_nextclustercg = cg;
UFS_UNLOCK(ump);
goto fail;
}
ip->i_nextclustercg = -1;
/*
* We have found a new contiguous block.
*
* First we have to replace the old block pointers with the new
* block pointers in the inode and indirect blocks associated
* with the file.
*/
#ifdef DEBUG
if (prtrealloc)
printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
(intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
blkno = newblk;
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
if (i == ssize) {
bap = ebap;
soff = -i;
}
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 2");
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %jd,", (intmax_t)*bap);
#endif
if (DOINGSOFTDEP(vp)) {
if (sbap == &ip->i_din2->di_db[0] && i < ssize)
softdep_setup_allocdirect(ip, start_lbn + i,
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
buflist->bs_children[i]);
else
softdep_setup_allocindir_page(ip, start_lbn + i,
i < ssize ? sbp : ebp, soff + i, blkno,
*bap, buflist->bs_children[i]);
}
*bap++ = blkno;
}
/*
* Next we must write out the modified inode and indirect blocks.
* For strict correctness, the writes should be synchronous since
* the old block values may have been written to disk. In practice
* they are almost never written, but if we are concerned about
* strict correctness, the `doasyncfree' flag should be set to zero.
*
* The test on `doasyncfree' should be changed to test a flag
* that shows whether the associated buffers and inodes have
* been written. The flag should be set when the cluster is
* started and cleared whenever the buffer or inode is flushed.
* We can then check below to see if it is set, and do the
* synchronous write only when it has been cleared.
*/
if (sbap != &ip->i_din2->di_db[0]) {
if (doasyncfree)
bdwrite(sbp);
else
bwrite(sbp);
} else {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (!doasyncfree)
ffs_update(vp, 1);
}
if (ssize < len) {
if (doasyncfree)
bdwrite(ebp);
else
bwrite(ebp);
}
/*
* Last, free the old blocks and assign the new blocks to the buffers.
*/
#ifdef DEBUG
if (prtrealloc)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
if (!DOINGSOFTDEP(vp))
ffs_blkfree(ump, fs, ump->um_devvp,
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
if (!ffs_checkblk(ip,
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
if (prtrealloc)
printf(" %jd,", (intmax_t)blkno);
#endif
}
#ifdef DEBUG
if (prtrealloc) {
prtrealloc--;
printf("\n");
}
#endif
return (0);
fail:
if (ssize < len)
brelse(ebp);
if (sbap != &ip->i_din2->di_db[0])
brelse(sbp);
return (ENOSPC);
}
/*
* Allocate an inode in the filesystem.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*/
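/*
* Note: inode allocation reuses ffs_hashalloc() with ffs_nodealloccg()
* as the per-cg allocator; in that case the "size" argument carries the
* file mode rather than a byte count (see the parameter comments on
* ffs_hashalloc() below).
*/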
int
ffs_valloc(pvp, mode, cred, vpp)
struct vnode *pvp;
int mode;
struct ucred *cred;
struct vnode **vpp;
{
struct inode *pip;
struct fs *fs;
struct inode *ip;
struct timespec ts;
struct ufsmount *ump;
ino_t ino, ipref;
u_int cg;
int error, error1, reclaimed;
static struct timeval lastfail;
static int curfail;
*vpp = NULL;
pip = VTOI(pvp);
ump = ITOUMP(pip);
fs = ump->um_fs;
UFS_LOCK(ump);
reclaimed = 0;
retry:
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening files.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
(allocfcn_t *)ffs_nodealloccg);
if (ino == 0)
goto noinodes;
error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
if (error) {
error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
FFSV_FORCEINSMQ);
ffs_vfree(pvp, ino, mode);
if (error1 == 0) {
ip = VTOI(*vpp);
if (ip->i_mode)
goto dup_alloc;
ip->i_flag |= IN_MODIFIED;
vput(*vpp);
}
return (error);
}
ip = VTOI(*vpp);
if (ip->i_mode) {
dup_alloc:
printf("mode = 0%o, inum = %ju, fs = %s\n",
ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
panic("ffs_valloc: dup alloc");
}
if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */
printf("free inode %s/%lu had %ld blocks\n",
fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
DIP_SET(ip, i_blocks, 0);
}
ip->i_flags = 0;
DIP_SET(ip, i_flags, 0);
/*
* Set up a new generation number for this inode.
*/
while (ip->i_gen == 0 || ++ip->i_gen == 0)
ip->i_gen = arc4random();
DIP_SET(ip, i_gen, ip->i_gen);
if (fs->fs_magic == FS_UFS2_MAGIC) {
vfs_timestamp(&ts);
ip->i_din2->di_birthtime = ts.tv_sec;
ip->i_din2->di_birthnsec = ts.tv_nsec;
}
ufs_prepare_reclaim(*vpp);
ip->i_flag = 0;
(*vpp)->v_vflag = 0;
(*vpp)->v_type = VNON;
if (fs->fs_magic == FS_UFS2_MAGIC) {
(*vpp)->v_op = &ffs_vnodeops2;
ip->i_flag |= IN_UFS2;
} else {
(*vpp)->v_op = &ffs_vnodeops1;
}
return (0);
noinodes:
if (reclaimed == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
goto retry;
}
UFS_UNLOCK(ump);
if (ppsratecheck(&lastfail, &curfail, 1)) {
ffs_fserr(fs, pip->i_number, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
static ino_t
ffs_dirpref(pip)
struct inode *pip;
{
struct fs *fs;
int cg, prefcg, dirsize, cgsize;
u_int avgifree, avgbfree, avgndir, curdirsize;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
fs = ITOFS(pip);
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = arc4random() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
/*
* Count various limits used for
* optimal allocation of a directory inode.
*/
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - avgbfree / 4;
if (minbfree < 1)
minbfree = 1;
cgsize = fs->fs_fsize * fs->fs_fpg;
dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
if (dirsize < curdirsize)
dirsize = curdirsize;
if (dirsize <= 0)
maxcontigdirs = 0; /* dirsize overflowed */
else
maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
if (fs->fs_avgfpdir > 0)
maxcontigdirs = min(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
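/*
* Illustrative (hypothetical) numbers: with avgbfree = 1000 blocks of
* fs_bsize = 32768 bytes and the common newfs defaults
* fs_avgfilesize = 16384 and fs_avgfpdir = 64, dirsize is 1,048,576
* bytes and maxcontigdirs = min(1000 * 32768 / 1048576, 255) = 31, so
* roughly 31 directories may be created back to back in one cylinder
* group before the allocator moves on.
*/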
/*
* Limit the number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*
* We are trying to find a suitable cylinder group nearby
* our preferred cylinder group to place a new directory.
* We scan from our preferred cylinder group forward looking
* for a cylinder group that meets our criterion. If we get
* to the final cylinder group and do not find anything,
* we start scanning forwards from the beginning of the
* filesystem. While it might seem sensible to start scanning
* backwards or even to alternate looking forward and backward,
* this approach fails badly when the filesystem is nearly full.
* Specifically, we first search all the areas that have no space
* and finally try the one preceding that. We repeat this on
* every request and in the case of the final block end up
* searching the entire filesystem. By jumping to the front
* of the filesystem, our future forward searches always look
* in new cylinder groups and so find every possible block after
* one pass over the filesystem.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* This is a backstop when we have a deficit in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
return ((ino_t)(fs->fs_ipg * cg));
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
break;
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks and the next fs_maxbpg blocks. Each additional section
* contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. The first indirect is allocated immediately following the last
* direct block and the data blocks for the first indirect immediately
* follow it.
*
* If no blocks have been allocated in any other section, the indirect
* block(s) are allocated in the same cylinder group as its inode in an
* area reserved immediately following the inode blocks. The policy for
* the data blocks is to place them in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block or the previous block is a hole, then the information on
* the previous allocation is unavailable; here a best guess is made based
* on the logical block number being allocated.
*
* If a section is already partially allocated, the policy is to
* allocate blocks contiguously within the section if possible.
*/
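/*
* Illustrative example (assuming the usual UFS_NDADDR of 12): the first
* data block mapped by the single indirect is lbn 12; the code below
* tries to place it in the fragments immediately following the indirect
* block itself whenever that indirect was laid out in the data area of
* the inode's cylinder group.
*/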
ufs2_daddr_t
ffs_blkpref_ufs1(ip, lbn, indx, bap)
struct inode *ip;
ufs_lbn_t lbn;
int indx;
ufs1_daddr_t *bap;
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
ufs2_daddr_t pref;
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
fs = ITOFS(ip);
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
ip->i_din1->di_db[UFS_NDADDR - 1] != 0)
pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == UFS_NDADDR) {
pref = ip->i_din1->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((ip->i_mode & IFMT) == IFDIR)
return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < UFS_NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1]) + 1;
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
for (cg = 0; cg <= startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
/*
* Same as above, but for UFS2
*/
ufs2_daddr_t
ffs_blkpref_ufs2(ip, lbn, indx, bap)
struct inode *ip;
ufs_lbn_t lbn;
int indx;
ufs2_daddr_t *bap;
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
ufs2_daddr_t pref;
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
fs = ITOFS(ip);
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
ip->i_din2->di_db[UFS_NDADDR - 1] != 0)
pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == UFS_NDADDR) {
pref = ip->i_din2->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((ip->i_mode & IFMT) == IFDIR)
return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < UFS_NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1]) + 1;
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
for (cg = 0; cg <= startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*
* Must be called with the UFS lock held. Will release the lock on success
* and return with it held on failure.
*/
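/*
* Illustrative probe order (hypothetical fs_ncg = 32, preferred cg 5):
* after cg 5 itself is tried, the quadratic rehash visits cgs 6, 8, 12,
* 20 and 4 (incrementing by 1, 2, 4, 8 and 16, wrapping modulo 32), and
* the brute force pass then starts at (5 + 2) % 32 = 7 and walks the
* remaining groups.
*/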
/*VARARGS5*/
static ufs2_daddr_t
ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
struct inode *ip;
u_int cg;
ufs2_daddr_t pref;
int size; /* Search size for data blocks, mode for inodes */
int rsize; /* Real allocated size. */
allocfcn_t *allocator;
{
struct fs *fs;
ufs2_daddr_t result;
u_int i, icg = cg;
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
#ifdef INVARIANTS
if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
panic("ffs_hashalloc: allocation on suspended filesystem");
#endif
fs = ITOFS(ip);
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size, rsize);
if (result)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size, rsize);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*/
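/*
* Illustrative example (hypothetical fs_frag = 8): a fragment run that
* starts at offset 6 within its block (bbase = 6) can grow to at most
* 2 fragments; asking for 3 makes fragnum(fs, bprev + frags - 1) wrap
* to 0, which the block boundary check below rejects.
*/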
static ufs2_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bprev;
int osize, nsize;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
int nffree;
long bno;
int frags, bbase;
int i, error;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
return (0);
frags = numfrags(fs, nsize);
bbase = fragnum(fs, bprev);
if (bbase > fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0)
goto fail;
bno = dtogd(fs, bprev);
blksfree = cg_blksfree(cgp);
for (i = numfrags(fs, osize); i < frags; i++)
if (isclr(blksfree, bno + i))
goto fail;
/*
* The current fragment can be extended:
* deduct the count for the fragment size being extended into,
* increase the count for the remaining fragment size (if any),
* and allocate the extended piece.
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
if (isclr(blksfree, bno + i))
break;
cgp->cg_frsum[i - numfrags(fs, osize)]--;
if (i != frags)
cgp->cg_frsum[i - frags]++;
for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
clrbit(blksfree, bno + i);
cgp->cg_cs.cs_nffree--;
nffree++;
}
UFS_LOCK(ump);
fs->fs_cstotal.cs_nffree -= nffree;
fs->fs_cs(fs, cg).cs_nffree -= nffree;
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
frags, numfrags(fs, osize));
bdwrite(bp);
return (bprev);
fail:
brelse(bp);
UFS_LOCK(ump);
return (0);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
static ufs2_daddr_t
ffs_alloccg(ip, cg, bpref, size, rsize)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int size;
int rsize;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
int i, allocsiz, error, frags;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0 ||
(cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
goto fail;
if (size == fs->fs_bsize) {
UFS_LOCK(ump);
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
blksfree = cg_blksfree(cgp);
frags = numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
UFS_LOCK(ump);
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (blkno);
}
KASSERT(size == rsize,
("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0)
goto fail;
for (i = 0; i < frags; i++)
clrbit(blksfree, bno + i);
cgp->cg_cs.cs_nffree -= frags;
cgp->cg_frsum[allocsiz]--;
if (frags != allocsiz)
cgp->cg_frsum[allocsiz - frags]++;
UFS_LOCK(ump);
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
blkno = cgbase(fs, cg) + bno;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
bdwrite(bp);
return (blkno);
fail:
brelse(bp);
UFS_LOCK(ump);
return (0);
}
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static ufs2_daddr_t
ffs_alloccgblk(ip, bp, bpref, size)
struct inode *ip;
struct buf *bp;
ufs2_daddr_t bpref;
int size;
{
struct fs *fs;
struct cg *cgp;
struct ufsmount *ump;
ufs1_daddr_t bno;
ufs2_daddr_t blkno;
u_int8_t *blksfree;
int i, cgbpref;
ump = ITOUMP(ip);
fs = ump->um_fs;
mtx_assert(UFS_MTX(ump), MA_OWNED);
cgp = (struct cg *)bp->b_data;
blksfree = cg_blksfree(cgp);
if (bpref == 0) {
bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
/* map bpref to correct zone in this cg */
if (bpref < cgdata(fs, cgbpref))
bpref = cgmeta(fs, cgp->cg_cgx);
else
bpref = cgdata(fs, cgp->cg_cgx);
}
/*
* if the requested block is available, use it
*/
bno = dtogd(fs, blknum(fs, bpref));
if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
goto gotit;
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
if (bno < 0)
return (0);
/* Update cg_rotor only if allocated from the data zone */
if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
cgp->cg_rotor = bno;
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, (long)blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
/*
* If the caller didn't want the whole block free the frags here.
*/
size = numfrags(fs, size);
if (size != fs->fs_frag) {
bno = dtogd(fs, blkno);
for (i = size; i < fs->fs_frag; i++)
setbit(blksfree, bno + i);
i = fs->fs_frag - size;
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
fs->fs_fmod = 1;
cgp->cg_frsum[i]++;
}
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
size, 0);
UFS_LOCK(ump);
return (blkno);
}
/*
* Determine whether a cluster can be allocated.
*
* We do not currently check for optimal rotational layout if there
* are multiple choices in the same cylinder group. Instead we just
* take the first one that we find following bpref.
*/
static ufs2_daddr_t
ffs_clusteralloc(ip, cg, bpref, len)
struct inode *ip;
u_int cg;
ufs2_daddr_t bpref;
int len;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
int i, run, bit, map, got, error;
ufs2_daddr_t bno;
u_char *mapp;
int32_t *lp;
u_int8_t *blksfree;
ump = ITOUMP(ip);
fs = ump->um_fs;
if (fs->fs_maxcluster[cg] < len)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
UFS_LOCK(ump);
return (0);
}
/*
* Check to see if a cluster of the needed size (or bigger) is
* available in this cylinder group.
*/
lp = &cg_clustersum(cgp)[len];
for (i = len; i <= fs->fs_contigsumsize; i++)
if (*lp++ > 0)
break;
if (i > fs->fs_contigsumsize) {
/*
* This is the first time looking for a cluster in this
* cylinder group. Update the cluster summary information
* to reflect the true maximum sized cluster so that
* future cluster allocation requests can avoid reading
* the cylinder group map only to find no clusters.
*/
lp = &cg_clustersum(cgp)[len - 1];
for (i = len - 1; i > 0; i--)
if (*lp-- > 0)
break;
UFS_LOCK(ump);
fs->fs_maxcluster[cg] = i;
brelse(bp);
return (0);
}
/*
* Search the cluster map to find a big enough cluster.
* We take the first one that we find, even if it is larger
* than we need as we prefer to get one close to the previous
* block allocation. We do not search before the current
* preference point as we do not want to allocate a block
* that is allocated before the previous one (as we will
* then have to wait for another pass of the elevator
* algorithm before it will be read). We prefer to fail and
* be recalled to try an allocation in the next cylinder group.
*/
if (dtog(fs, bpref) != cg)
bpref = cgdata(fs, cg);
else
bpref = blknum(fs, bpref);
bpref = fragstoblks(fs, dtogd(fs, bpref));
mapp = &cg_clustersfree(cgp)[bpref / NBBY];
map = *mapp++;
bit = 1 << (bpref % NBBY);
for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
if ((map & bit) == 0) {
run = 0;
} else {
run++;
if (run == len)
break;
}
if ((got & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
if (got >= cgp->cg_nclusterblks) {
UFS_LOCK(ump);
brelse(bp);
return (0);
}
/*
* Allocate the cluster that we have found.
*/
blksfree = cg_blksfree(cgp);
for (i = 1; i <= len; i++)
if (!ffs_isblock(fs, blksfree, got - run + i))
panic("ffs_clusteralloc: map mismatch");
bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
if (dtog(fs, bno) != cg)
panic("ffs_clusteralloc: allocated out of group");
len = blkstofrags(fs, len);
UFS_LOCK(ump);
for (i = 0; i < len; i += fs->fs_frag)
if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
panic("ffs_clusteralloc: lost block");
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
bdwrite(bp);
return (bno);
}
static inline struct buf *
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
{
struct fs *fs;
fs = ITOFS(ip);
return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
gbflags));
}
/*
* Synchronous inode initialization is needed only when barrier writes do not
* work as advertised, and will impose a heavy cost on file creation in a newly
* created filesystem.
*/
static int doasyncinodeinit = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
&doasyncinodeinit, 0,
"Perform inode block initialization using asynchronous writes");
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static ufs2_daddr_t
ffs_nodealloccg(ip, cg, ipref, mode, unused)
struct inode *ip;
u_int cg;
ufs2_daddr_t ipref;
int mode;
int unused;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp, *ibp;
struct ufsmount *ump;
u_int8_t *inosused, *loc;
struct ufs2_dinode *dp2;
int error, start, len, i;
u_int32_t old_initediblk;
ump = ITOUMP(ip);
fs = ump->um_fs;
check_nifree:
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
UFS_UNLOCK(ump);
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
UFS_LOCK(ump);
return (0);
}
restart:
if (cgp->cg_cs.cs_nifree == 0) {
brelse(bp);
UFS_LOCK(ump);
return (0);
}
inosused = cg_inosused(cgp);
if (ipref) {
ipref %= fs->fs_ipg;
if (isclr(inosused, ipref))
goto gotit;
}
start = cgp->cg_irotor / NBBY;
len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
loc = memcchr(&inosused[start], 0xff, len);
if (loc == NULL) {
len = start + 1;
start = 0;
loc = memcchr(&inosused[start], 0xff, len);
if (loc == NULL) {
printf("cg = %d, irotor = %ld, fs = %s\n",
cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
panic("ffs_nodealloccg: map corrupted");
/* NOTREACHED */
}
}
ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
gotit:
/*
* Check to see if we need to initialize more inodes.
*/
if (fs->fs_magic == FS_UFS2_MAGIC &&
ipref + INOPB(fs) > cgp->cg_initediblk &&
cgp->cg_initediblk < cgp->cg_niblk) {
old_initediblk = cgp->cg_initediblk;
/*
* Free the cylinder group lock before writing the
* initialized inode block. Entering babarrierwrite()
* while holding the cylinder group lock causes a lock
* order violation between that lock and snaplk.
*
* Another thread can decide to initialize the same
* inode block, but whichever thread first gets the
* cylinder group lock after writing the newly
* allocated inode block will update it and the other
* will realize that it has lost and leave the
* cylinder group unchanged.
*/
ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
brelse(bp);
if (ibp == NULL) {
/*
* The inode block buffer is already owned by
* another thread, which must initialize it.
* Wait on the buffer, with the cg buffer lock
* dropped, to let the other thread finish its
* updates, then retry.
*/
ibp = getinobuf(ip, cg, old_initediblk, 0);
brelse(ibp);
UFS_LOCK(ump);
goto check_nifree;
}
bzero(ibp->b_data, (int)fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < INOPB(fs); i++) {
while (dp2->di_gen == 0)
dp2->di_gen = arc4random();
dp2++;
}
/*
* Rather than adding a soft updates dependency to ensure
* that the new inode block is written before it is claimed
* by the cylinder group map, we just do a barrier write
* here. The barrier write will ensure that the inode block
* gets written before the updated cylinder group map can be
* written. The barrier write should only slow down bulk
* loading of newly created filesystems.
*/
if (doasyncinodeinit)
babarrierwrite(ibp);
else
bwrite(ibp);
/*
* After the inode block is written, try to update the
* cg initediblk pointer. If another thread beat us
* to it, then leave it unchanged as the other thread
* has already set it correctly.
*/
error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp);
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (error != 0)
return (error);
if (cgp->cg_initediblk == old_initediblk)
cgp->cg_initediblk += INOPB(fs);
goto restart;
}
cgp->cg_irotor = ipref;
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
setbit(inosused, ipref);
cgp->cg_cs.cs_nifree--;
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1;
if ((mode & IFMT) == IFDIR) {
cgp->cg_cs.cs_ndir++;
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
bdwrite(bp);
return ((ino_t)(cg * fs->fs_ipg + ipref));
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*/
static void
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
struct workhead *dephd;
{
struct mount *mp;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t fragno, cgbno;
int i, blk, frags, bbase, error;
u_int cg;
u_int8_t *blksfree;
struct cdev *dev;
cg = dtog(fs, bno);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
MPASS(devvp->v_mount->mnt_data == ump);
dev = ump->um_devvp->v_rdev;
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
dev = devvp->v_rdev;
ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
} else
return;
#ifdef INVARIANTS
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
size, fs->fs_fsmnt);
panic("ffs_blkfree_cg: bad size");
}
#endif
if ((u_int)bno >= fs->fs_size) {
printf("bad block %jd, ino %lu\n", (intmax_t)bno,
(u_long)inum);
ffs_fserr(fs, inum, "bad block");
return;
}
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return;
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp);
UFS_LOCK(ump);
if (size == fs->fs_bsize) {
fragno = fragstoblks(fs, cgbno);
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
if (devvp->v_type == VREG) {
UFS_UNLOCK(ump);
/* devvp is a snapshot */
brelse(bp);
return;
}
printf("dev = %s, block = %jd, fs = %s\n",
devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
panic("ffs_blkfree_cg: freeing free block");
}
ffs_setblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
} else {
bbase = cgbno - fragnum(fs, cgbno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
/*
* deallocate the fragment
*/
frags = numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isset(blksfree, cgbno + i)) {
printf("dev = %s, block = %jd, fs = %s\n",
devtoname(dev), (intmax_t)(bno + i),
fs->fs_fsmnt);
panic("ffs_blkfree_cg: freeing free frag");
}
setbit(blksfree, cgbno + i);
}
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
/*
* if a complete block has been reassembled, account for it
*/
fragno = fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
}
}
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
mp = UFSTOVFS(ump);
if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
numfrags(fs, size), dephd);
bdwrite(bp);
}
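/*
* Context handed from ffs_blkfree() to the trim taskqueue: it carries
* everything needed to perform the deferred cylinder group free once
* the BIO_DELETE (TRIM) request for the block range has completed.
*/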
struct ffs_blkfree_trim_params {
struct task task;
struct ufsmount *ump;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
struct workhead *pdephd;
struct workhead dephd;
};
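/*
* Taskqueue handler: perform the cylinder group free that was deferred
* until the TRIM completed, then drop the secondary write reference and
* the in-flight trim count taken in ffs_blkfree() and free the context.
*/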
static void
ffs_blkfree_trim_task(ctx, pending)
void *ctx;
int pending;
{
struct ffs_blkfree_trim_params *tp;
tp = ctx;
ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size,
tp->inum, tp->pdephd);
vn_finished_secondary_write(UFSTOVFS(tp->ump));
atomic_add_int(&tp->ump->um_trim_inflight, -1);
free(tp, M_TEMP);
}
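/*
* BIO_DELETE completion callback: release the bio and hand the deferred
* free off to the per-mount trim taskqueue for processing.
*/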
static void
ffs_blkfree_trim_completed(bip)
struct bio *bip;
{
struct ffs_blkfree_trim_params *tp;
tp = bip->bio_caller2;
g_destroy_bio(bip);
TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
}
void
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ufs2_daddr_t bno;
long size;
ino_t inum;
enum vtype vtype;
struct workhead *dephd;
{
struct mount *mp;
struct bio *bip;
struct ffs_blkfree_trim_params *tp;
/*
* Check to see if a snapshot wants to claim the block.
* The block is passed to a snapshot only when devvp is a normal
* disk device (not itself a snapshot), it has one or more
* snapshots associated with it, and one of those snapshots
* claims the block.
*/
if (devvp->v_type == VCHR &&
(devvp->v_vflag & VV_COPYONWRITE) &&
ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
return;
}
/*
* Nothing to delay if TRIM is disabled, or the operation is
* performed on the snapshot.
*/
if (!ump->um_candelete || devvp->v_type == VREG) {
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
return;
}
/*
* Postpone the set of the free bit in the cg bitmap until the
* BIO_DELETE is completed. Otherwise, due to disk queue
* reordering, TRIM might be issued after we reuse the block
* and write some new data into it.
*/
atomic_add_int(&ump->um_trim_inflight, 1);
tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
tp->ump = ump;
tp->devvp = devvp;
tp->bno = bno;
tp->size = size;
tp->inum = inum;
if (dephd != NULL) {
LIST_INIT(&tp->dephd);
LIST_SWAP(dephd, &tp->dephd, worklist, wk_list);
tp->pdephd = &tp->dephd;
} else
tp->pdephd = NULL;
bip = g_alloc_bio();
bip->bio_cmd = BIO_DELETE;
bip->bio_offset = dbtob(fsbtodb(fs, bno));
bip->bio_done = ffs_blkfree_trim_completed;
bip->bio_length = size;
bip->bio_caller2 = tp;
mp = UFSTOVFS(ump);
vn_start_secondary_write(NULL, &mp, 0);
g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private);
}
#ifdef INVARIANTS
/*
* Verify allocation of a block or fragment. Returns true if block or
* fragment is allocated, false if it is free.
*/
static int
ffs_checkblk(ip, bno, size)
struct inode *ip;
ufs2_daddr_t bno;
long size;
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
ufs1_daddr_t cgbno;
int i, error, frags, free;
u_int8_t *blksfree;
fs = ITOFS(ip);
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("bsize = %ld, size = %ld, fs = %s\n",
(long)fs->fs_bsize, size, fs->fs_fsmnt);
panic("ffs_checkblk: bad size");
}
if ((u_int)bno >= fs->fs_size)
panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), &bp, &cgp);
if (error)
panic("ffs_checkblk: cylinder group read failed");
blksfree = cg_blksfree(cgp);
cgbno = dtogd(fs, bno);
if (size == fs->fs_bsize) {
free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
} else {
frags = numfrags(fs, size);
for (free = 0, i = 0; i < frags; i++)
if (isset(blksfree, cgbno + i))
free++;
if (free != 0 && free != frags)
panic("ffs_checkblk: partially free fragment");
}
brelse(bp);
return (!free);
}
#endif /* INVARIANTS */
/*
* Free an inode.
*/
int
ffs_vfree(pvp, ino, mode)
struct vnode *pvp;
ino_t ino;
int mode;
{
struct ufsmount *ump;
- struct inode *ip;
if (DOINGSOFTDEP(pvp)) {
softdep_freefile(pvp, ino, mode);
return (0);
}
- ip = VTOI(pvp);
ump = VFSTOUFS(pvp->v_mount);
return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL));
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*/
int
ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
ino_t ino;
int mode;
struct workhead *wkhd;
{
struct cg *cgp;
struct buf *bp;
ufs2_daddr_t cgbno;
int error;
u_int cg;
u_int8_t *inosused;
struct cdev *dev;
cg = ino_to_cg(fs, ino);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
MPASS(devvp->v_mount->mnt_data == ump);
dev = ump->um_devvp->v_rdev;
cgbno = fragstoblks(fs, cgtod(fs, cg));
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
dev = devvp->v_rdev;
cgbno = fsbtodb(fs, cgtod(fs, cg));
} else {
bp = NULL;
return (0);
}
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return (error);
inosused = cg_inosused(cgp);
ino %= fs->fs_ipg;
if (isclr(inosused, ino)) {
printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
(uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("ffs_freefile: freeing free inode");
}
clrbit(inosused, ino);
if (ino < cgp->cg_irotor)
cgp->cg_irotor = ino;
cgp->cg_cs.cs_nifree++;
UFS_LOCK(ump);
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
cgp->cg_cs.cs_ndir--;
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
softdep_setup_inofree(UFSTOVFS(ump), bp,
ino + cg * fs->fs_ipg, wkhd);
bdwrite(bp);
return (0);
}
/*
* Check to see if a file is free.
* Used to check for allocated files in snapshots.
*/
int
ffs_checkfreefile(fs, devvp, ino)
struct fs *fs;
struct vnode *devvp;
ino_t ino;
{
struct cg *cgp;
struct buf *bp;
ufs2_daddr_t cgbno;
int ret, error;
u_int cg;
u_int8_t *inosused;
cg = ino_to_cg(fs, ino);
if (devvp->v_type == VREG) {
/* devvp is a snapshot */
cgbno = fragstoblks(fs, cgtod(fs, cg));
} else if (devvp->v_type == VCHR) {
/* devvp is a normal disk device */
cgbno = fsbtodb(fs, cgtod(fs, cg));
} else {
return (1);
}
if (ino >= fs->fs_ipg * fs->fs_ncg)
return (1);
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
return (1);
inosused = cg_inosused(cgp);
ino %= fs->fs_ipg;
ret = isclr(inosused, ino);
brelse(bp);
return (ret);
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block when none
* are available.
*/
static ufs1_daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
struct fs *fs;
struct cg *cgp;
ufs2_daddr_t bpref;
int allocsiz;
{
ufs1_daddr_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
u_int8_t *blksfree;
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = cgp->cg_frotor / NBBY;
blksfree = cg_blksfree(cgp);
len = howmany(fs->fs_fpg, NBBY) - start;
loc = scanc((u_int)len, (u_char *)&blksfree[start],
fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len, (u_char *)&blksfree[0],
fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
printf("start = %d, len = %d, fs = %s\n",
start, len, fs->fs_fsmnt);
panic("ffs_alloccg: map corrupted");
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = bno;
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
blk = blkmap(fs, blksfree, bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
panic("ffs_alloccg: block not in map");
return (-1);
}
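/*
* Return the mount point statistics for the filesystem backed by the
* given device vnode.  For a snapshot (regular file) vnode, recurse on
* the device vnode of the filesystem that contains the snapshot.
*/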
static const struct statfs *
ffs_getmntstat(struct vnode *devvp)
{
if (devvp->v_type == VCHR)
return (&devvp->v_rdev->si_mountpt->mnt_stat);
return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp));
}
/*
* Fetch and verify a cylinder group.
*/
int
ffs_getcg(fs, devvp, cg, bpp, cgpp)
struct fs *fs;
struct vnode *devvp;
u_int cg;
struct buf **bpp;
struct cg **cgpp;
{
struct buf *bp;
struct cg *cgp;
const struct statfs *sfs;
int flags, error;
*bpp = NULL;
*cgpp = NULL;
flags = 0;
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
flags |= GB_CKHASH;
error = breadn_flags(devvp, devvp->v_type == VREG ?
fragstoblks(fs, cgtod(fs, cg)) : fsbtodb(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags,
ffs_ckhash_cg, &bp);
if (error != 0)
return (error);
cgp = (struct cg *)bp->b_data;
if (((fs->fs_metackhash & CK_CYLGRP) != 0 &&
(bp->b_flags & B_CKHASH) != 0 &&
cgp->cg_ckhash != bp->b_ckhash) ||
!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
sfs = ffs_getmntstat(devvp);
printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: "
"0x%x != bp: 0x%jx\n",
devvp->v_type == VCHR ? "" : "snapshot of ",
sfs->f_mntfromname, sfs->f_mntonname,
cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
bp->b_flags &= ~B_CKHASH;
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
return (EIO);
}
bp->b_flags &= ~B_CKHASH;
bp->b_xflags |= BX_BKGRDWRITE;
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
bp->b_xflags |= BX_CYLGRP;
cgp->cg_old_time = cgp->cg_time = time_second;
*bpp = bp;
*cgpp = cgp;
return (0);
}
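/*
* Compute the check hash of a cylinder group buffer.  The stored
* cg_ckhash field is zeroed while the CRC32C is computed, so that the
* result does not depend on the value currently recorded there, and is
* restored afterwards.
*/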
static void
ffs_ckhash_cg(bp)
struct buf *bp;
{
uint32_t ckhash;
struct cg *cgp;
cgp = (struct cg *)bp->b_data;
ckhash = cgp->cg_ckhash;
cgp->cg_ckhash = 0;
bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
cgp->cg_ckhash = ckhash;
}
/*
* Fserr prints the name of a filesystem with an error diagnostic.
*
* The form of the error message is:
* fs: error message
*/
void
ffs_fserr(fs, inum, cp)
struct fs *fs;
ino_t inum;
char *cp;
{
struct thread *td = curthread; /* XXX */
struct proc *p = td->td_proc;
log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
fs->fs_fsmnt, cp);
}
/*
* This function provides the capability for the fsck program to
* update an active filesystem. Fourteen operations are provided:
*
* adjrefcnt(inode, amt) - adjusts the reference count on the
* specified inode by the specified amount. Under normal
* operation the count should always go down. Decrementing
* the count to zero will cause the inode to be freed.
* adjblkcnt(inode, amt) - adjust the number of blocks used by the
* inode by the specified amount.
* adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
* adjust the superblock summary.
* freedirs(inode, count) - directory inodes [inode..inode + count - 1]
* are marked as free. Inodes should never have to be marked
* as in use.
* freefiles(inode, count) - file inodes [inode..inode + count - 1]
* are marked as free. Inodes should never have to be marked
* as in use.
* freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
* are marked as free. Blocks should never have to be marked
* as in use.
* setflags(flags, set/clear) - the fs_flags field has the specified
* flags set (second parameter +1) or cleared (second parameter -1).
* setcwd(dirinode) - set the current directory to dirinode in the
* filesystem associated with the snapshot.
* setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
* in the current directory is oldvalue then change it to newvalue.
* unlink(nameptr, oldvalue) - Verify that the inode number associated
* with nameptr in the current directory is oldvalue then unlink it.
*
* The following functions may only be used on a quiescent filesystem
* by the soft updates journal. They are not safe to be run on an active
* filesystem.
*
* setinode(inode, dip) - the specified disk inode is replaced with the
* contents pointed to by dip.
* setbufoutput(fd, flags) - output associated with the specified file
* descriptor (which must reference the character device supporting
* the filesystem) switches from using physio to running through the
* buffer cache when flags is set to 1. The descriptor reverts to
* physio for output when flags is set to zero.
*/
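/*
* Illustrative userland sketch (not part of this file) of how a tool
* such as fsck_ffs(8) might submit one of the requests above; the
* mount path "/mnt" and the inode number are placeholders, and the
* caller needs <sys/sysctl.h>, <ufs/ffs/fs.h>, <fcntl.h>, and <err.h>:
*
*	struct fsck_cmd cmd = { .version = FFS_CMD_VERSION };
*
*	cmd.handle = open("/mnt", O_RDONLY);	// any fd on the target fs
*	cmd.value = ino;			// inode whose count is adjusted
*	cmd.size = -1;				// decrement link count by one
*	if (sysctlbyname("vfs.ffs.adjrefcnt", NULL, NULL,
*	    &cmd, sizeof(cmd)) == -1)
*		err(1, "adjrefcnt");
*
* If the filesystem is mounted read-only, only the process registered
* as its fsck (um_fsckpid) is allowed to issue these requests; see the
* MNT_RDONLY check in sysctl_ffs_fsck() below.
*/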
static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of directories");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free blocks");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free frags");
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
sysctl_ffs_fsck, "Adjust number of free clusters");
static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of Directory Inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of File Inodes");
static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR,
sysctl_ffs_fsck, "Free Range of Blocks");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR,
sysctl_ffs_fsck, "Change Filesystem Flags");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR,
sysctl_ffs_fsck, "Set Current Working Directory");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR,
sysctl_ffs_fsck, "Change Value of .. Entry");
static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR,
sysctl_ffs_fsck, "Unlink a Duplicate Name");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR,
sysctl_ffs_fsck, "Update an On-Disk Inode");
static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR,
sysctl_ffs_fsck, "Set Buffered Writing for Descriptor");
#define DEBUG 1
#ifdef DEBUG
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
#endif /* DEBUG */
static int buffered_write(struct file *, struct uio *, struct ucred *,
int, struct thread *);
static int
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
{
struct thread *td = curthread;
struct fsck_cmd cmd;
struct ufsmount *ump;
struct vnode *vp, *dvp, *fdvp;
struct inode *ip, *dp;
struct mount *mp;
struct fs *fs;
ufs2_daddr_t blkno;
long blkcnt, blksize;
struct file *fp, *vfp;
cap_rights_t rights;
int filetype, error;
static struct fileops *origops, bufferedops;
if (req->newlen > sizeof cmd)
return (EBADRPC);
if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
return (error);
if (cmd.version != FFS_CMD_VERSION)
return (ERPCMISMATCH);
if ((error = getvnode(td, cmd.handle,
cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
return (error);
vp = fp->f_data;
if (vp->v_type != VREG && vp->v_type != VDIR) {
fdrop(fp, td);
return (EINVAL);
}
vn_start_write(vp, &mp, V_WAIT);
if (mp == NULL ||
strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
vn_finished_write(mp);
fdrop(fp, td);
return (EINVAL);
}
ump = VFSTOUFS(mp);
if ((mp->mnt_flag & MNT_RDONLY) &&
ump->um_fsckpid != td->td_proc->p_pid) {
vn_finished_write(mp);
fdrop(fp, td);
return (EROFS);
}
fs = ump->um_fs;
filetype = IFREG;
switch (oidp->oid_number) {
case FFS_SET_FLAGS:
#ifdef DEBUG
if (fsckcmds)
printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
cmd.size > 0 ? "set" : "clear");
#endif /* DEBUG */
if (cmd.size > 0)
fs->fs_flags |= (long)cmd.value;
else
fs->fs_flags &= ~(long)cmd.value;
break;
case FFS_ADJ_REFCNT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust inode %jd link count by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
ip = VTOI(vp);
ip->i_nlink += cmd.size;
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_effnlink += cmd.size;
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
if (DOINGSOFTDEP(vp))
softdep_change_linkcnt(ip);
vput(vp);
break;
case FFS_ADJ_BLKCNT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust inode %jd block count by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
ip = VTOI(vp);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
vput(vp);
break;
case FFS_DIR_FREE:
filetype = IFDIR;
/* fall through */
case FFS_FILE_FREE:
#ifdef DEBUG
if (fsckcmds) {
if (cmd.size == 1)
printf("%s: free %s inode %ju\n",
mp->mnt_stat.f_mntonname,
filetype == IFDIR ? "directory" : "file",
(uintmax_t)cmd.value);
else
printf("%s: free %s inodes %ju-%ju\n",
mp->mnt_stat.f_mntonname,
filetype == IFDIR ? "directory" : "file",
(uintmax_t)cmd.value,
(uintmax_t)(cmd.value + cmd.size - 1));
}
#endif /* DEBUG */
while (cmd.size > 0) {
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
cmd.value, filetype, NULL)))
break;
cmd.size -= 1;
cmd.value += 1;
}
break;
case FFS_BLK_FREE:
#ifdef DEBUG
if (fsckcmds) {
if (cmd.size == 1)
printf("%s: free block %jd\n",
mp->mnt_stat.f_mntonname,
(intmax_t)cmd.value);
else
printf("%s: free blocks %jd-%jd\n",
mp->mnt_stat.f_mntonname,
(intmax_t)cmd.value,
(intmax_t)cmd.value + cmd.size - 1);
}
#endif /* DEBUG */
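/*
* Free the requested range of fragments in chunks: the first chunk is
* clamped so that it ends on a filesystem block boundary, and later
* chunks cover a full block each (the final one may be shorter).
*/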
blkno = cmd.value;
blkcnt = cmd.size;
blksize = fs->fs_frag - (blkno % fs->fs_frag);
while (blkcnt > 0) {
if (blksize > blkcnt)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
}
break;
/*
* Adjust superblock summaries. fsck(8) is expected to
* submit deltas when necessary.
*/
case FFS_ADJ_NDIR:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of directories by %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_ndir += cmd.value;
break;
case FFS_ADJ_NBFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free blocks by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nbfree += cmd.value;
break;
case FFS_ADJ_NIFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free inodes by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nifree += cmd.value;
break;
case FFS_ADJ_NFFREE:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free frags by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_nffree += cmd.value;
break;
case FFS_ADJ_NUMCLUSTERS:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: adjust number of free clusters by %+jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
fs->fs_cstotal.cs_numclusters += cmd.value;
break;
case FFS_SET_CWD:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: set current directory to inode %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
break;
AUDIT_ARG_VNODE1(vp);
if ((error = change_dir(vp, td)) != 0) {
vput(vp);
break;
}
VOP_UNLOCK(vp, 0);
pwd_chdir(td, vp);
break;
case FFS_SET_DOTDOT:
#ifdef DEBUG
if (fsckcmds) {
printf("%s: change .. in cwd from %jd to %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
(intmax_t)cmd.size);
}
#endif /* DEBUG */
/*
* First we have to get and lock the parent directory
* to which ".." points.
*/
error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
if (error)
break;
/*
* Now we get and lock the child directory containing "..".
*/
FILEDESC_SLOCK(td->td_proc->p_fd);
dvp = td->td_proc->p_fd->fd_cdir;
FILEDESC_SUNLOCK(td->td_proc->p_fd);
if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
vput(fdvp);
break;
}
dp = VTOI(dvp);
dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */
error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
DT_DIR, 0);
cache_purge(fdvp);
cache_purge(dvp);
vput(dvp);
vput(fdvp);
break;
case FFS_UNLINK:
#ifdef DEBUG
if (fsckcmds) {
char buf[32];
if (copyinstr((char *)(intptr_t)cmd.value, buf, 32, NULL))
strncpy(buf, "Name_too_long", 32);
printf("%s: unlink %s (inode %jd)\n",
mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
}
#endif /* DEBUG */
/*
* kern_unlinkat will do its own start/finish writes and
* they do not nest, so drop ours here. Setting mp == NULL
* indicates that vn_finished_write is not needed down below.
*/
vn_finished_write(mp);
mp = NULL;
error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value,
UIO_USERSPACE, (ino_t)cmd.size);
break;
case FFS_SET_INODE:
if (ump->um_fsckpid != td->td_proc->p_pid) {
error = EPERM;
break;
}
#ifdef DEBUG
if (fsckcmds) {
printf("%s: update inode %jd\n",
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
AUDIT_ARG_VNODE1(vp);
ip = VTOI(vp);
if (I_IS_UFS1(ip))
error = copyin((void *)(intptr_t)cmd.size, ip->i_din1,
sizeof(struct ufs1_dinode));
else
error = copyin((void *)(intptr_t)cmd.size, ip->i_din2,
sizeof(struct ufs2_dinode));
if (error) {
vput(vp);
break;
}
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
error = ffs_update(vp, 1);
vput(vp);
break;
case FFS_SET_BUFOUTPUT:
if (ump->um_fsckpid != td->td_proc->p_pid) {
error = EPERM;
break;
}
if (ITOUMP(VTOI(vp)) != ump) {
error = EINVAL;
break;
}
#ifdef DEBUG
if (fsckcmds) {
printf("%s: %s buffered output for descriptor %jd\n",
mp->mnt_stat.f_mntonname,
cmd.size == 1 ? "enable" : "disable",
(intmax_t)cmd.value);
}
#endif /* DEBUG */
if ((error = getvnode(td, cmd.value,
cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0)
break;
if (vfp->f_vnode->v_type != VCHR) {
fdrop(vfp, td);
error = EINVAL;
break;
}
if (origops == NULL) {
origops = vfp->f_ops;
bcopy((void *)origops, (void *)&bufferedops,
sizeof(bufferedops));
bufferedops.fo_write = buffered_write;
}
if (cmd.size == 1)
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
(uintptr_t)&bufferedops);
else
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
(uintptr_t)origops);
fdrop(vfp, td);
break;
default:
#ifdef DEBUG
if (fsckcmds) {
printf("Invalid request %d from fsck\n",
oidp->oid_number);
}
#endif /* DEBUG */
error = EINVAL;
break;
}
fdrop(fp, td);
vn_finished_write(mp);
return (error);
}
/*
* Function to switch a descriptor to use the buffer cache to stage
* its I/O. This is needed so that writes to the filesystem device
* will give snapshots a chance to copy the modified blocks that
* they need to retain.
*/
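/*
* Hypothetical caller-side sketch (not part of this file): the process
* registered as the filesystem's fsck opens the backing character
* device, makes its current directory live on that filesystem (so that
* buffered_write() below can find it), and then enables buffering for
* the device descriptor.  Paths and sizes here are placeholders:
*
*	devfd = open("/dev/da0p2", O_RDWR);	// device backing the fs
*	dirfd = open("/mnt", O_RDONLY);		// any file or dir on the fs
*	chdir("/mnt");
*	cmd.version = FFS_CMD_VERSION;
*	cmd.handle = dirfd;
*	cmd.value = devfd;
*	cmd.size = 1;				// 1 = enable, 0 = disable
*	sysctlbyname("vfs.ffs.setbufoutput", NULL, NULL, &cmd, sizeof(cmd));
*	pwrite(devfd, buf, bsize, offset);	// now staged via buffer cache
*
* Writes must be fragment aligned, a multiple of the fragment size, and
* contained within a single filesystem block, as checked below.
*/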
static int
buffered_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *devvp, *vp;
struct inode *ip;
struct buf *bp;
struct fs *fs;
struct filedesc *fdp;
int error;
daddr_t lbn;
/*
* The devvp is associated with the /dev filesystem. To discover
* the filesystem with which the device is associated, we depend
* on the application setting the current directory to a location
* within the filesystem being written. Yes, this is an ugly hack.
*/
devvp = fp->f_vnode;
if (!vn_isdisk(devvp, NULL))
return (EINVAL);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
vp = fdp->fd_cdir;
vref(vp);
FILEDESC_SUNLOCK(fdp);
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Check that the current directory vnode indeed belongs to
* UFS before trying to dereference UFS-specific v_data fields.
*/
if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) {
vput(vp);
return (EINVAL);
}
ip = VTOI(vp);
if (ITODEVVP(ip) != devvp) {
vput(vp);
return (EINVAL);
}
fs = ITOFS(ip);
vput(vp);
foffset_lock_uio(fp, uio, flags);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
if (fsckcmds) {
printf("%s: buffered write for block %jd\n",
fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
}
#endif /* DEBUG */
/*
* All I/O must be contained within a filesystem block, start on
* a fragment boundary, and be a multiple of fragments in length.
*/
if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
fragoff(fs, uio->uio_offset) != 0 ||
fragoff(fs, uio->uio_resid) != 0) {
error = EINVAL;
goto out;
}
lbn = numfrags(fs, uio->uio_offset);
bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
bp->b_flags |= B_RELBUF;
if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
brelse(bp);
goto out;
}
error = bwrite(bp);
out:
VOP_UNLOCK(devvp, 0);
foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
return (error);
}
Index: head/sys/ufs/ffs/ffs_vnops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vnops.c (revision 327172)
+++ head/sys/ufs/ffs/ffs_vnops.c (revision 327173)
@@ -1,1728 +1,1726 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
* from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
* @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"
#define ALIGNED_TO(ptr, s) \
(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
#ifdef DIRECTIO
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t ffs_fdatasync;
static vop_fsync_t ffs_fsync;
static vop_getpages_t ffs_getpages;
static vop_lock1_t ffs_lock;
static vop_read_t ffs_read;
static vop_write_t ffs_write;
static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred);
static vop_strategy_t ffsext_strategy;
static vop_closeextattr_t ffs_closeextattr;
static vop_deleteextattr_t ffs_deleteextattr;
static vop_getextattr_t ffs_getextattr;
static vop_listextattr_t ffs_listextattr;
static vop_openextattr_t ffs_openextattr;
static vop_setextattr_t ffs_setextattr;
static vop_vptofh_t ffs_vptofh;
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_getpages = ffs_getpages,
.vop_getpages_async = vnode_pager_local_getpages_async,
.vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
.vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops1 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */
.vop_vptofh = ffs_vptofh,
};
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_getpages = ffs_getpages,
.vop_getpages_async = vnode_pager_local_getpages_async,
.vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
.vop_closeextattr = ffs_closeextattr,
.vop_deleteextattr = ffs_deleteextattr,
.vop_getextattr = ffs_getextattr,
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
.vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops2 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
.vop_fdatasync = ffs_fdatasync,
.vop_lock1 = ffs_lock,
.vop_reallocblks = ffs_reallocblks,
.vop_strategy = ffsext_strategy,
.vop_closeextattr = ffs_closeextattr,
.vop_deleteextattr = ffs_deleteextattr,
.vop_getextattr = ffs_getextattr,
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
.vop_vptofh = ffs_vptofh,
};
/*
* Synch an open file.
*/
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
struct vnode *vp;
struct bufobj *bo;
int error;
vp = ap->a_vp;
bo = &vp->v_bufobj;
retry:
error = ffs_syncvnode(vp, ap->a_waitfor, 0);
if (error)
return (error);
if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
error = softdep_fsync(vp);
if (error)
return (error);
/*
* The softdep_fsync() function may drop vp lock,
* allowing for dirty buffers to reappear on the
* bo_dirty list. Recheck and resync as needed.
*/
BO_LOCK(bo);
if ((vp->v_type == VREG || vp->v_type == VDIR) &&
(bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
BO_UNLOCK(bo);
goto retry;
}
BO_UNLOCK(bo);
}
return (0);
}
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
struct inode *ip;
struct bufobj *bo;
struct buf *bp, *nbp;
ufs_lbn_t lbn;
int error, passes;
bool still_dirty, wait;
ip = VTOI(vp);
ip->i_flag &= ~IN_NEEDSYNC;
bo = &vp->v_bufobj;
/*
* When doing MNT_WAIT we must first flush all dependencies
* on the inode.
*/
if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
(error = softdep_sync_metadata(vp)) != 0)
return (error);
/*
* Flush all dirty buffers associated with a vnode.
*/
error = 0;
passes = 0;
wait = false; /* Always do an async pass first. */
lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
BO_LOCK(bo);
loop:
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
bp->b_vflags &= ~BV_SCANNED;
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
/*
* Reasons to skip this buffer: it has already been considered
* on this pass, the buffer has dependencies that will cause
* it to be redirtied and it has not already been deferred,
* or it is already being written.
*/
if ((bp->b_vflags & BV_SCANNED) != 0)
continue;
bp->b_vflags |= BV_SCANNED;
/*
* Flush indirects in order, if requested.
*
* Note that if only datasync is requested, we can
* skip indirect blocks when softupdates are not
* active. Otherwise we must flush them with data,
* since dependencies prevent data block writes.
*/
if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
(lbn_level(bp->b_lblkno) >= passes ||
((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
continue;
if (bp->b_lblkno > lbn)
panic("ffs_syncvnode: syncing truncated data.");
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
BO_UNLOCK(bo);
} else if (wait) {
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) != 0) {
bp->b_vflags &= ~BV_SCANNED;
goto next;
}
} else
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty");
/*
* Check for dependencies and potentially complete them.
*/
if (!LIST_EMPTY(&bp->b_dep) &&
(error = softdep_sync_buf(vp, bp,
wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
/* I/O error. */
if (error != EBUSY) {
BUF_UNLOCK(bp);
return (error);
}
/* If we deferred once, don't defer again. */
if ((bp->b_flags & B_DEFERRED) == 0) {
bp->b_flags |= B_DEFERRED;
BUF_UNLOCK(bp);
goto next;
}
}
if (wait) {
bremfree(bp);
if ((error = bwrite(bp)) != 0)
return (error);
} else if ((bp->b_flags & B_CLUSTEROK)) {
(void) vfs_bio_awrite(bp);
} else {
bremfree(bp);
(void) bawrite(bp);
}
next:
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
*/
BO_LOCK(bo);
nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
}
if (waitfor != MNT_WAIT) {
BO_UNLOCK(bo);
if ((flags & NO_INO_UPDT) != 0)
return (0);
else
return (ffs_update(vp, 0));
}
/* Drain IO to see if we're done. */
bufobj_wwait(bo, 0, 0);
/*
* Block devices associated with filesystems may have new I/O
* requests posted for them even if the vnode is locked, so no
* amount of trying will get them clean. We make several passes
* as a best effort.
*
* Regular files may need multiple passes to flush all dependency
* work as it is possible that we must write once per indirect
* level, once for the leaf, and once for the inode and each of
* these will be done with one sync and one async pass.
*/
if (bo->bo_dirty.bv_cnt > 0) {
if ((flags & DATA_ONLY) == 0) {
still_dirty = true;
} else {
/*
* For data-only sync, dirty indirect buffers
* are ignored.
*/
still_dirty = false;
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
if (bp->b_lblkno > -UFS_NDADDR) {
still_dirty = true;
break;
}
}
}
if (still_dirty) {
/* Write the inode after sync passes to flush deps. */
if (wait && DOINGSOFTDEP(vp) &&
(flags & NO_INO_UPDT) == 0) {
BO_UNLOCK(bo);
ffs_update(vp, 1);
BO_LOCK(bo);
}
/* switch between sync/async. */
wait = !wait;
if (wait || ++passes < UFS_NIADDR + 2)
goto loop;
#ifdef INVARIANTS
if (!vn_isdisk(vp, NULL))
vn_printf(vp, "ffs_fsync: dirty ");
#endif
}
}
BO_UNLOCK(bo);
error = 0;
if ((flags & DATA_ONLY) == 0) {
if ((flags & NO_INO_UPDT) == 0)
error = ffs_update(vp, 1);
if (DOINGSUJ(vp))
softdep_journal_fsync(VTOI(vp));
}
return (error);
}
static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{
return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}
static int
ffs_lock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
struct thread *a_td;
char *file;
int line;
} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
struct vnode *vp;
int flags;
struct lock *lkp;
int result;
switch (ap->a_flags & LK_TYPE_MASK) {
case LK_SHARED:
case LK_UPGRADE:
case LK_EXCLUSIVE:
vp = ap->a_vp;
flags = ap->a_flags;
for (;;) {
#ifdef DEBUG_VFS_LOCKS
KASSERT(vp->v_holdcnt != 0,
("ffs_lock %p: zero hold count", vp));
#endif
lkp = vp->v_vnlock;
result = _lockmgr_args(lkp, flags, VI_MTX(vp),
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
ap->a_file, ap->a_line);
if (lkp == vp->v_vnlock || result != 0)
break;
/*
* Apparent success, except that the vnode
* mutated between snapshot file vnode and
* regular file vnode while this process
* slept. The lock currently held is not the
* right lock. Release it, and try to get the
* new lock.
*/
(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
ap->a_file, ap->a_line);
if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
(LK_INTERLOCK | LK_NOWAIT))
return (EBUSY);
if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
flags &= ~LK_INTERLOCK;
}
break;
default:
result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
}
return (result);
#else
return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}
/*
* Vnode op for reading.
*/
static int
ffs_read(ap)
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
ssize_t orig_resid;
int error;
int seqcount;
int ioflag;
vp = ap->a_vp;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
if (ap->a_ioflag & IO_EXT)
#ifdef notyet
return (ffs_extread(vp, uio, ioflag));
#else
panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
if ((ioflag & IO_DIRECT) != 0) {
int workdone;
error = ffs_rawread(vp, uio, &workdone);
if (error != 0 || workdone != 0)
return error;
}
#endif
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ)
panic("ffs_read: mode");
if (vp->v_type == VLNK) {
if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
panic("ffs_read: short symlink");
} else if (vp->v_type != VREG && vp->v_type != VDIR)
panic("ffs_read: type %d", vp->v_type);
#endif
orig_resid = uio->uio_resid;
KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
if (orig_resid == 0)
return (0);
KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
fs = ITOFS(ip);
if (uio->uio_offset < ip->i_size &&
uio->uio_offset >= fs->fs_maxfilesize)
return (EOVERFLOW);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
/*
* Size of buffer. The buffer representing the
* end of the file is rounded up to the size of
* the block type (fragment or full block, as
* appropriate).
*/
size = blksize(fs, ip, lbn);
blkoffset = blkoff(fs, uio->uio_offset);
/*
* The amount we want to transfer in this iteration is
* one FS block less the amount of the data before
* our startpoint (duh!)
*/
xfersize = fs->fs_bsize - blkoffset;
/*
* But if we actually want less than the block,
* or the file doesn't have a whole block more of data,
* then use the lesser number.
*/
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
if (lblktosize(fs, nextlbn) >= ip->i_size) {
/*
* Don't do readahead if this is the end of the file.
*/
error = bread_gb(vp, lbn, size, NOCRED,
GB_UNMAPPED, &bp);
} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
/*
* Otherwise if we are allowed to cluster,
* grab as much as we can.
*
* XXX This may not be a win if we are not
* doing sequential access.
*/
error = cluster_read(vp, ip->i_size, lbn,
size, NOCRED, blkoffset + uio->uio_resid,
seqcount, GB_UNMAPPED, &bp);
} else if (seqcount > 1) {
/*
* If we are NOT allowed to cluster, then
* if we appear to be acting sequentially,
* fire off a request for a readahead
* as well as a read. Note that the 4th and 5th
* arguments point to arrays of the size specified in
* the 6th argument.
*/
u_int nextsize = blksize(fs, ip, nextlbn);
error = breadn_flags(vp, lbn, size, &nextlbn,
&nextsize, 1, NOCRED, GB_UNMAPPED, NULL, &bp);
} else {
/*
* Failing all of the above, just read what the
* user asked for. Interestingly, the same as
* the first option above.
*/
error = bread_gb(vp, lbn, size, NOCRED,
GB_UNMAPPED, &bp);
}
if (error) {
brelse(bp);
bp = NULL;
break;
}
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
if (buf_mapped(bp)) {
error = vn_io_fault_uiomove((char *)bp->b_data +
blkoffset, (int)xfersize, uio);
} else {
error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
(int)xfersize, uio);
}
if (error)
break;
vfs_bio_brelse(bp, ioflag);
}
/*
* This can only happen in the case of an error,
* because the loop above resets bp to NULL on each iteration
* and never sets a new value into it on normal completion,
* so it must have come from a 'break' statement.
*/
if (bp != NULL)
vfs_bio_brelse(bp, ioflag);
if ((error == 0 || uio->uio_resid != orig_resid) &&
(vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
(ip->i_flag & IN_ACCESS) == 0) {
VI_LOCK(vp);
ip->i_flag |= IN_ACCESS;
VI_UNLOCK(vp);
}
return (error);
}
/*
* Vnode op for writing.
*/
static int
ffs_write(ap)
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
struct vnode *vp;
struct uio *uio;
struct inode *ip;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn;
off_t osize;
ssize_t resid;
int seqcount;
int blkoffset, error, flags, ioflag, size, xfersize;
vp = ap->a_vp;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
if (ap->a_ioflag & IO_EXT)
#ifdef notyet
return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
panic("ffs_write+IO_EXT");
#endif
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE)
panic("ffs_write: mode");
#endif
switch (vp->v_type) {
case VREG:
if (ioflag & IO_APPEND)
uio->uio_offset = ip->i_size;
if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
return (EPERM);
/* FALLTHROUGH */
case VLNK:
break;
case VDIR:
panic("ffs_write: dir write");
break;
default:
panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
(int)uio->uio_offset,
(int)uio->uio_resid
);
}
KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
fs = ITOFS(ip);
if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
return (EFBIG);
/*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, I don't think it matters.
*/
if (vn_rlimit_fsize(vp, uio, uio->uio_td))
return (EFBIG);
resid = uio->uio_resid;
osize = ip->i_size;
if (seqcount > BA_SEQMAX)
flags = BA_SEQMAX << BA_SEQSHIFT;
else
flags = seqcount << BA_SEQSHIFT;
if (ioflag & IO_SYNC)
flags |= IO_SYNC;
flags |= BA_UNMAPPED;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (uio->uio_offset + xfersize > ip->i_size)
vnode_pager_setsize(vp, uio->uio_offset + xfersize);
/*
* We must perform a read-before-write if the transfer size
* does not cover the entire buffer.
*/
if (fs->fs_bsize > xfersize)
flags |= BA_CLRBUF;
else
flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
ap->a_cred, flags, &bp);
if (error != 0) {
vnode_pager_setsize(vp, ip->i_size);
break;
}
if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
bp->b_flags |= B_NOCACHE;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
DIP_SET(ip, i_size, ip->i_size);
}
size = blksize(fs, ip, lbn) - bp->b_resid;
if (size < xfersize)
xfersize = size;
if (buf_mapped(bp)) {
error = vn_io_fault_uiomove((char *)bp->b_data +
blkoffset, (int)xfersize, uio);
} else {
error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
(int)xfersize, uio);
}
/*
* If the buffer is not already filled and we encounter an
* error while trying to fill it, we have to clear out any
* garbage data from the pages instantiated for the buffer.
* If we do not, a failed uiomove() during a write can leave
* the prior contents of the pages exposed to a userland mmap.
*
* Note that we need only clear buffers with a transfer size
* equal to the block size because buffers with a shorter
* transfer size were cleared above by the call to UFS_BALLOC()
* with the BA_CLRBUF flag set.
*
* If the source region for uiomove identically mmaps the
* buffer, uiomove() performed the NOP copy, and the buffer
* content remains valid because the page fault handler
* validated the pages.
*/
if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
fs->fs_bsize == xfersize)
vfs_bio_clrbuf(bp);
vfs_bio_set_flags(bp, ioflag);
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
(ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->fs_bsize) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
cluster_write(vp, bp, ip->i_size, seqcount,
GB_UNMAPPED);
} else {
bawrite(bp);
}
} else if (ioflag & IO_DIRECT) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
}
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
ap->a_cred) {
if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
ip->i_mode &= ~(ISUID | ISGID);
DIP_SET(ip, i_mode, ip->i_mode);
}
}
if (error) {
if (ioflag & IO_UNIT) {
(void)ffs_truncate(vp, osize,
IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
}
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
error = ffs_update(vp, 1);
return (error);
}
/*
* Extended attribute area reading.
*/
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
ssize_t orig_resid;
int error;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_extread: mode");
#endif
orig_resid = uio->uio_resid;
KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
if (orig_resid == 0)
return (0);
KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
/*
* Size of buffer. The buffer representing the
* end of the file is rounded up to the size of
* the block type (fragment or full block, as
* appropriate).
*/
size = sblksize(fs, dp->di_extsize, lbn);
blkoffset = blkoff(fs, uio->uio_offset);
/*
* The amount we want to transfer in this iteration is
* one FS block less the amount of the data before
* our startpoint (duh!)
*/
xfersize = fs->fs_bsize - blkoffset;
/*
* But if we actually want less than the block,
* or the file doesn't have a whole block more of data,
* then use the lesser number.
*/
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
/*
* Don't do readahead if this is the end of the extended attribute data.
*/
error = bread(vp, -1 - lbn, size, NOCRED, &bp);
} else {
/*
* If we have a second block, then
* fire off a request for a readahead
* as well as a read. Note that the 4th and 5th
* arguments point to arrays of the size specified in
* the 6th argument.
*/
u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
nextlbn = -1 - nextlbn;
error = breadn(vp, -1 - lbn,
size, &nextlbn, &nextsize, 1, NOCRED, &bp);
}
if (error) {
brelse(bp);
bp = NULL;
break;
}
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset,
(int)xfersize, uio);
if (error)
break;
vfs_bio_brelse(bp, ioflag);
}
/*
* This can only happen in the case of an error,
* because the loop above resets bp to NULL on each iteration
* and never sets a new value into it on normal completion,
* so it must have come from a 'break' statement.
*/
if (bp != NULL)
vfs_bio_brelse(bp, ioflag);
return (error);
}
/*
* Extended attribute area writing.
*/
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct buf *bp;
ufs_lbn_t lbn;
off_t osize;
ssize_t resid;
int blkoffset, error, flags, size, xfersize;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_extwrite: mode");
#endif
if (ioflag & IO_APPEND)
uio->uio_offset = dp->di_extsize;
KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
if ((uoff_t)uio->uio_offset + uio->uio_resid >
UFS_NXADDR * fs->fs_bsize)
return (EFBIG);
resid = uio->uio_resid;
osize = dp->di_extsize;
flags = IO_EXT;
if (ioflag & IO_SYNC)
flags |= IO_SYNC;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
/*
* We must perform a read-before-write if the transfer size
* does not cover the entire buffer.
*/
if (fs->fs_bsize > xfersize)
flags |= BA_CLRBUF;
else
flags &= ~BA_CLRBUF;
error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
ucred, flags, &bp);
if (error != 0)
break;
/*
* If the buffer is not valid we have to clear out any
* garbage data from the pages instantiated for the buffer.
* If we do not, a failed uiomove() during a write can leave
* the prior contents of the pages exposed to a userland
* mmap(). XXX deal with uiomove() errors a better way.
*/
if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
vfs_bio_clrbuf(bp);
if (uio->uio_offset + xfersize > dp->di_extsize)
dp->di_extsize = uio->uio_offset + xfersize;
size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
if (size < xfersize)
xfersize = size;
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
vfs_bio_set_flags(bp, ioflag);
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
xfersize + blkoffset == fs->fs_bsize ||
(ioflag & (IO_ASYNC | IO_DIRECT)))
bawrite(bp);
else
bdwrite(bp);
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
ip->i_mode &= ~(ISUID | ISGID);
dp->di_mode = ip->i_mode;
}
}
if (error) {
if (ioflag & IO_UNIT) {
(void)ffs_truncate(vp, osize,
IO_EXT | (ioflag&IO_SYNC), ucred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
}
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
error = ffs_update(vp, 1);
return (error);
}
/*
* Vnode operation to retrieve a named extended attribute.
*
* Locate a particular EA (nspace:name) in the area (ptr:length), and return
* the length of the EA, and possibly the pointer to the entry and to the data.
*/
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
struct extattr **eapp, u_char **eac)
{
struct extattr *eap, *eaend;
size_t nlen;
nlen = strlen(name);
KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
eap = (struct extattr *)ptr;
eaend = (struct extattr *)(ptr + length);
for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
/* make sure this entry is complete */
if (EXTATTR_NEXT(eap) > eaend)
break;
if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
|| memcmp(eap->ea_name, name, nlen) != 0)
continue;
if (eapp != NULL)
*eapp = eap;
if (eac != NULL)
*eac = EXTATTR_CONTENT(eap);
return (EXTATTR_CONTENT_SIZE(eap));
}
return (-1);
}
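/*
* Read the entire extended attribute area of the vnode into a newly
* allocated buffer, leaving "extra" bytes of slack at the end for
* callers that intend to grow it.  On success the buffer is returned
* through *p and must be freed by the caller.
*/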
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
struct inode *ip;
struct ufs2_dinode *dp;
struct fs *fs;
struct uio luio;
struct iovec liovec;
u_int easize;
int error;
u_char *eae;
ip = VTOI(vp);
fs = ITOFS(ip);
dp = ip->i_din2;
easize = dp->di_extsize;
if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
return (EFBIG);
eae = malloc(easize + extra, M_TEMP, M_WAITOK);
liovec.iov_base = eae;
liovec.iov_len = easize;
luio.uio_iov = &liovec;
luio.uio_iovcnt = 1;
luio.uio_offset = 0;
luio.uio_resid = easize;
luio.uio_segflg = UIO_SYSSPACE;
luio.uio_rw = UIO_READ;
luio.uio_td = td;
error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
if (error) {
free(eae, M_TEMP);
return(error);
}
*p = eae;
return (0);
}
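/*
* Serialize access to the in-memory extended attribute area.  The
* IN_EA_LOCKED flag acts as the lock; waiters set IN_EA_LOCKWAIT and
* sleep on i_ea_refs under the vnode interlock until it is released.
*/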
static void
ffs_lock_ea(struct vnode *vp)
{
struct inode *ip;
ip = VTOI(vp);
VI_LOCK(vp);
while (ip->i_flag & IN_EA_LOCKED) {
ip->i_flag |= IN_EA_LOCKWAIT;
msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
0);
}
ip->i_flag |= IN_EA_LOCKED;
VI_UNLOCK(vp);
}
static void
ffs_unlock_ea(struct vnode *vp)
{
struct inode *ip;
ip = VTOI(vp);
VI_LOCK(vp);
if (ip->i_flag & IN_EA_LOCKWAIT)
wakeup(&ip->i_ea_refs);
ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
VI_UNLOCK(vp);
}
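/*
* Open a reference to the extended attribute area: load it into memory
* on first use, or just bump i_ea_refs if another consumer already has
* it loaded.
*/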
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
struct inode *ip;
struct ufs2_dinode *dp;
int error;
ip = VTOI(vp);
ffs_lock_ea(vp);
if (ip->i_ea_area != NULL) {
ip->i_ea_refs++;
ffs_unlock_ea(vp);
return (0);
}
dp = ip->i_din2;
error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
if (error) {
ffs_unlock_ea(vp);
return (error);
}
ip->i_ea_len = dp->di_extsize;
ip->i_ea_error = 0;
ip->i_ea_refs++;
ffs_unlock_ea(vp);
return (0);
}
/*
* Vnode extattr transaction commit/abort
*/
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
struct inode *ip;
struct uio luio;
struct iovec liovec;
int error;
struct ufs2_dinode *dp;
ip = VTOI(vp);
ffs_lock_ea(vp);
if (ip->i_ea_area == NULL) {
ffs_unlock_ea(vp);
return (EINVAL);
}
dp = ip->i_din2;
error = ip->i_ea_error;
if (commit && error == 0) {
ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
if (cred == NOCRED)
cred = vp->v_mount->mnt_cred;
liovec.iov_base = ip->i_ea_area;
liovec.iov_len = ip->i_ea_len;
luio.uio_iov = &liovec;
luio.uio_iovcnt = 1;
luio.uio_offset = 0;
luio.uio_resid = ip->i_ea_len;
luio.uio_segflg = UIO_SYSSPACE;
luio.uio_rw = UIO_WRITE;
luio.uio_td = td;
/* XXX: I'm not happy about truncating to zero size */
if (ip->i_ea_len < dp->di_extsize)
error = ffs_truncate(vp, 0, IO_EXT, cred);
error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
}
if (--ip->i_ea_refs == 0) {
free(ip->i_ea_area, M_TEMP);
ip->i_ea_area = NULL;
ip->i_ea_len = 0;
ip->i_ea_error = 0;
}
ffs_unlock_ea(vp);
return (error);
}
/*
* Vnode extattr strategy routine for fifos.
*
* Buffers with negative logical block numbers address the UFS2 external
* attribute area and must be passed to the regular UFS strategy routine;
* ordinary fifo I/O falls through to the fifo vnode operations.
*/
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct buf *a_bp;
};
*/
{
struct vnode *vp;
daddr_t lbn;
vp = ap->a_vp;
lbn = ap->a_bp->b_lblkno;
if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
if (vp->v_type == VFIFO)
return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
panic("spec nodes went here");
}
/*
* Vnode extattr transaction start.
*/
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}
/*
* Vnode extattr transaction commit/abort
*/
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_commit;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
return (EROFS);
return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
/*
* Vnode operation to remove a named attribute.
*/
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
- struct fs *fs;
struct extattr *eap;
uint32_t ul;
int olen, error, i, easize;
u_char *eae;
void *tmp;
ip = VTOI(ap->a_vp);
- fs = ITOFS(ip);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (strlen(ap->a_name) == 0)
return (EINVAL);
if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error) {
/*
* ffs_lock_ea is not needed here, because the vnode
* must be exclusively locked.
*/
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
/* CEM: delete could be done in-place instead */
eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
bcopy(ip->i_ea_area, eae, ip->i_ea_len);
easize = ip->i_ea_len;
olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
&eap, NULL);
if (olen == -1) {
/* requested attribute does not exist */
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (ENOATTR);
}
ul = eap->ea_length;
i = (u_char *)EXTATTR_NEXT(eap) - eae;
bcopy(EXTATTR_NEXT(eap), eap, easize - i);
easize -= ul;
tmp = ip->i_ea_area;
ip->i_ea_area = eae;
ip->i_ea_len = easize;
free(tmp, M_TEMP);
error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
return (error);
}
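/*
* The deletion above compacts the scratch copy in place: with the doomed
* record occupying ul bytes and ending at byte offset i, the remaining
* easize - i bytes are slid down over it and the area shrinks by ul, e.g.
* removing a 32-byte record simply moves the tail down 32 bytes.
*/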
/*
* Vnode operation to retrieve a named extended attribute.
*/
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
u_char *eae, *p;
unsigned easize;
int error, ealen;
ip = VTOI(ap->a_vp);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error)
return (error);
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
eae = ip->i_ea_area;
easize = ip->i_ea_len;
ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
NULL, &p);
if (ealen >= 0) {
error = 0;
if (ap->a_size != NULL)
*ap->a_size = ealen;
else if (ap->a_uio != NULL)
error = uiomove(p, ealen, ap->a_uio);
} else
error = ENOATTR;
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (error);
}
/*
* Vnode operation to list extended attribute names on a vnode.
*/
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
struct extattr *eap, *eaend;
int error, ealen;
ip = VTOI(ap->a_vp);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error)
return (error);
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
error = 0;
if (ap->a_size != NULL)
*ap->a_size = 0;
KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
eap = (struct extattr *)ip->i_ea_area;
eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
/* make sure this entry is complete */
if (EXTATTR_NEXT(eap) > eaend)
break;
if (eap->ea_namespace != ap->a_attrnamespace)
continue;
ealen = eap->ea_namelength;
if (ap->a_size != NULL)
*ap->a_size += ealen + 1;
else if (ap->a_uio != NULL)
error = uiomove(&eap->ea_namelength, ealen + 1,
ap->a_uio);
}
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
return (error);
}
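#if 0	/* Illustrative userspace sketch; not part of the kernel sources. */
/*
* Consuming the list format produced by ffs_listextattr(): each entry is a
* single length byte followed by that many name bytes, with no NUL
* terminator.  extattr_list_file(2) returns such a buffer; "/tmp/file" and
* the buffer size are hypothetical.
*/
#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	char buf[1024];
	ssize_t len, pos;

	len = extattr_list_file("/tmp/file", EXTATTR_NAMESPACE_USER,
	    buf, sizeof(buf));
	if (len == -1)
		err(1, "extattr_list_file");
	for (pos = 0; pos < len; pos += 1 + (unsigned char)buf[pos])
		printf("%.*s\n", (int)(unsigned char)buf[pos], buf + pos + 1);
	return (0);
}
#endif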
/*
* Vnode operation to set a named attribute.
*/
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
struct inode *ip;
struct fs *fs;
struct extattr *eap;
uint32_t ealength, ul;
ssize_t ealen;
int olen, eapad1, eapad2, error, i, easize;
u_char *eae;
void *tmp;
ip = VTOI(ap->a_vp);
fs = ITOFS(ip);
if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
return (EOPNOTSUPP);
if (strlen(ap->a_name) == 0)
return (EINVAL);
/* XXX: deleting EAs by passing a NULL uio is no longer supported. */
if (ap->a_uio == NULL)
return (EOPNOTSUPP);
if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
ealen = ap->a_uio->uio_resid;
if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
return (EINVAL);
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error) {
/*
* ffs_lock_ea is not needed here, because the vnode
* must be exclusively locked.
*/
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
if (error)
return (error);
ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
eapad1 = roundup2(ealength, 8) - ealength;
eapad2 = roundup2(ealen, 8) - ealen;
ealength += eapad1 + ealen + eapad2;
/*
* CEM: rewrites of the same size or smaller could be done in-place
* instead. (We don't acquire any fine-grained locks in here either,
* so we could also do bigger writes in-place.)
*/
eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
bcopy(ip->i_ea_area, eae, ip->i_ea_len);
easize = ip->i_ea_len;
olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
&eap, NULL);
if (olen == -1) {
/* new, append at end */
KASSERT(ALIGNED_TO(eae + easize, struct extattr),
("unaligned"));
eap = (struct extattr *)(eae + easize);
easize += ealength;
} else {
ul = eap->ea_length;
i = (u_char *)EXTATTR_NEXT(eap) - eae;
if (ul != ealength) {
bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
easize - i);
easize += (ealength - ul);
}
}
if (easize > lblktosize(fs, UFS_NXADDR)) {
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = ENOSPC;
return (ENOSPC);
}
eap->ea_length = ealength;
eap->ea_namespace = ap->a_attrnamespace;
eap->ea_contentpadlen = eapad2;
eap->ea_namelength = strlen(ap->a_name);
memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
if (error) {
free(eae, M_TEMP);
ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
ip->i_ea_error = error;
return (error);
}
bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
tmp = ip->i_ea_area;
ip->i_ea_area = eae;
ip->i_ea_len = easize;
free(tmp, M_TEMP);
error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
return (error);
}
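#if 0	/* Illustrative sketch; not part of the kernel sources. */
/*
* The space consumed by one attribute, as computed at the top of
* ffs_setextattr(), is roundup8(7 + namelen) + roundup8(contentlen): e.g. a
* 4-byte name with 10 bytes of content occupies 16 + 16 = 32 bytes of the
* EA area.
*/
#include <stddef.h>

static size_t
ea_record_size(size_t namelen, size_t contentlen)
{
	size_t hdr = 4 + 3 + namelen;	/* ea_length word, 3 header bytes, name */

	return (((hdr + 7) & ~(size_t)7) + ((contentlen + 7) & ~(size_t)7));
}
#endif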
/*
* Vnode pointer to File handle
*/
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
IN struct vnode *a_vp;
IN struct fid *a_fhp;
};
*/
{
struct inode *ip;
struct ufid *ufhp;
ip = VTOI(ap->a_vp);
ufhp = (struct ufid *)ap->a_fhp;
ufhp->ufid_len = sizeof(struct ufid);
ufhp->ufid_ino = ip->i_number;
ufhp->ufid_gen = ip->i_gen;
return (0);
}
SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
"Always use buffer pager instead of bmap");
static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{
return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}
static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{
return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}
static int
ffs_getpages(struct vop_getpages_args *ap)
{
struct vnode *vp;
struct ufsmount *um;
vp = ap->a_vp;
um = VFSTOUFS(vp->v_mount);
if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
ap->a_rbehind, ap->a_rahead, NULL, NULL));
return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}
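/*
* The knob above is exposed as vfs.ffs.use_buf_pager; CTLFLAG_RWTUN makes it
* settable both at run time and from the loader, e.g.:
*
*	sysctl vfs.ffs.use_buf_pager=0
*	echo 'vfs.ffs.use_buf_pager="0"' >> /boot/loader.conf
*/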
